lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +9 -4
- lfx/base/agents/altk_base_agent.py +16 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/base_file.py +14 -4
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +191 -65
- lfx/base/data/storage_utils.py +109 -0
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +36 -0
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/commands.py +6 -1
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +13 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/elastic/opensearch_multimodal.py +215 -57
- lfx/components/files_and_knowledge/file.py +439 -39
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +123 -53
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/chat_output.py +7 -1
- lfx/components/langchain_utilities/tool_calling.py +14 -6
- lfx/components/llm_operations/batch_run.py +80 -25
- lfx/components/llm_operations/lambda_filter.py +33 -6
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -201
- lfx/components/models_and_agents/embedding_model.py +185 -339
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +58 -9
- lfx/components/ollama/ollama.py +9 -4
- lfx/components/ollama/ollama_embeddings.py +2 -1
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/__init__.py +0 -3
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/utils.py +64 -18
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +54 -0
- lfx/inputs/inputs.py +115 -0
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +494 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +28 -19
- lfx/schema/message.py +19 -3
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +5 -4
- lfx/services/mcp_composer/service.py +45 -13
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +12 -24
- lfx/services/settings/constants.py +2 -0
- lfx/services/storage/local.py +37 -0
- lfx/services/storage/service.py +19 -0
- lfx/utils/constants.py +1 -0
- lfx/utils/image.py +29 -11
- lfx/utils/validate_cloud.py +14 -3
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
|
@@ -3,7 +3,7 @@ from lfx.base.composio.composio_base import ComposioBaseComponent
|
|
|
3
3
|
|
|
4
4
|
class ComposioGitHubAPIComponent(ComposioBaseComponent):
|
|
5
5
|
display_name: str = "GitHub"
|
|
6
|
-
icon = "
|
|
6
|
+
icon = "GithubComposio"
|
|
7
7
|
documentation: str = "https://docs.composio.dev"
|
|
8
8
|
app_name = "github"
|
|
9
9
|
|
|
@@ -53,7 +53,7 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
53
53
|
"""Cuga Agent Component for advanced AI task execution.
|
|
54
54
|
|
|
55
55
|
The Cuga component is an advanced AI agent that can execute complex tasks using
|
|
56
|
-
various tools and browser automation. It supports custom
|
|
56
|
+
various tools and browser automation. It supports custom instructions, web applications,
|
|
57
57
|
and API interactions.
|
|
58
58
|
|
|
59
59
|
Attributes:
|
|
@@ -65,7 +65,7 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
65
65
|
"""
|
|
66
66
|
|
|
67
67
|
display_name: str = "Cuga"
|
|
68
|
-
description: str = "Define the Cuga agent's
|
|
68
|
+
description: str = "Define the Cuga agent's instructions, then assign it a task."
|
|
69
69
|
documentation: str = "https://docs.langflow.org/bundles-cuga"
|
|
70
70
|
icon = "bot"
|
|
71
71
|
name = "Cuga"
|
|
@@ -85,10 +85,10 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
85
85
|
),
|
|
86
86
|
*MODEL_PROVIDERS_DICT["OpenAI"]["inputs"],
|
|
87
87
|
MultilineInput(
|
|
88
|
-
name="
|
|
89
|
-
display_name="
|
|
88
|
+
name="instructions",
|
|
89
|
+
display_name="Instructions",
|
|
90
90
|
info=(
|
|
91
|
-
"Custom instructions
|
|
91
|
+
"Custom instructions for the agent to adhere to during its operation.\n"
|
|
92
92
|
"Example:\n"
|
|
93
93
|
"## Plan\n"
|
|
94
94
|
"< planning instructions e.g. which tools and when to use>\n"
|
|
@@ -117,16 +117,16 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
117
117
|
BoolInput(
|
|
118
118
|
name="lite_mode",
|
|
119
119
|
display_name="Enable CugaLite",
|
|
120
|
-
info="Enable CugaLite for simple API tasks
|
|
120
|
+
info="Faster reasoning for simple tasks. Enable CugaLite for simple API tasks.",
|
|
121
121
|
value=True,
|
|
122
|
-
advanced=
|
|
122
|
+
advanced=True,
|
|
123
123
|
),
|
|
124
124
|
IntInput(
|
|
125
125
|
name="lite_mode_tool_threshold",
|
|
126
126
|
display_name="CugaLite Tool Threshold",
|
|
127
127
|
info="Route to CugaLite if app has fewer than this many tools.",
|
|
128
128
|
value=25,
|
|
129
|
-
advanced=
|
|
129
|
+
advanced=True,
|
|
130
130
|
),
|
|
131
131
|
DropdownInput(
|
|
132
132
|
name="decomposition_strategy",
|
|
@@ -142,17 +142,17 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
142
142
|
display_name="Enable Browser",
|
|
143
143
|
info="Toggle to enable a built-in browser tool for web scraping and searching.",
|
|
144
144
|
value=False,
|
|
145
|
-
advanced=
|
|
145
|
+
advanced=True,
|
|
146
146
|
),
|
|
147
147
|
MultilineInput(
|
|
148
148
|
name="web_apps",
|
|
149
149
|
display_name="Web applications",
|
|
150
150
|
info=(
|
|
151
|
-
"
|
|
151
|
+
"Cuga will automatically start this web application when Enable Browser is true. "
|
|
152
152
|
"Currently only supports one web application. Example: https://example.com"
|
|
153
153
|
),
|
|
154
154
|
value="",
|
|
155
|
-
advanced=
|
|
155
|
+
advanced=True,
|
|
156
156
|
),
|
|
157
157
|
]
|
|
158
158
|
outputs = [
|
|
@@ -211,7 +211,6 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
211
211
|
settings.advanced_features.mode = "api"
|
|
212
212
|
|
|
213
213
|
from cuga.backend.activity_tracker.tracker import ActivityTracker
|
|
214
|
-
from cuga.backend.cuga_graph.nodes.api.variables_manager.manager import VariablesManager
|
|
215
214
|
from cuga.backend.cuga_graph.utils.agent_loop import StreamEvent
|
|
216
215
|
from cuga.backend.cuga_graph.utils.controller import (
|
|
217
216
|
AgentRunner as CugaAgent,
|
|
@@ -222,13 +221,10 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
222
221
|
from cuga.backend.llm.models import LLMManager
|
|
223
222
|
from cuga.configurations.instructions_manager import InstructionsManager
|
|
224
223
|
|
|
225
|
-
var_manager = VariablesManager()
|
|
226
|
-
|
|
227
224
|
# Reset var_manager if this is the first message in history
|
|
228
225
|
logger.debug(f"[CUGA] Checking history_messages: count={len(history_messages) if history_messages else 0}")
|
|
229
226
|
if not history_messages or len(history_messages) == 0:
|
|
230
227
|
logger.debug("[CUGA] First message in history detected, resetting var_manager")
|
|
231
|
-
var_manager.reset()
|
|
232
228
|
else:
|
|
233
229
|
logger.debug(f"[CUGA] Continuing conversation with {len(history_messages)} previous messages")
|
|
234
230
|
|
|
@@ -236,12 +232,14 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
236
232
|
llm_manager.set_llm(llm)
|
|
237
233
|
instructions_manager = InstructionsManager()
|
|
238
234
|
|
|
239
|
-
|
|
240
|
-
logger.debug(f"[CUGA]
|
|
241
|
-
instructions_manager.set_instructions_from_one_file(
|
|
235
|
+
instructions_to_use = self.instructions or ""
|
|
236
|
+
logger.debug(f"[CUGA] instructions are: {instructions_to_use}")
|
|
237
|
+
instructions_manager.set_instructions_from_one_file(instructions_to_use)
|
|
242
238
|
tracker = ActivityTracker()
|
|
243
239
|
tracker.set_tools(tools)
|
|
244
|
-
|
|
240
|
+
thread_id = self.graph.session_id
|
|
241
|
+
logger.debug(f"[CUGA] Using thread_id (session_id): {thread_id}")
|
|
242
|
+
cuga_agent = CugaAgent(browser_enabled=self.browser_enabled, thread_id=thread_id)
|
|
245
243
|
if self.browser_enabled:
|
|
246
244
|
await cuga_agent.initialize_freemode_env(start_url=self.web_apps.strip(), interface_mode="browser_only")
|
|
247
245
|
else:
|
|
@@ -257,13 +255,20 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
257
255
|
logger.debug(f"[CUGA] Processing input: {current_input}")
|
|
258
256
|
try:
|
|
259
257
|
# Convert history to LangChain format for the event
|
|
258
|
+
logger.debug(f"[CUGA] Converting {len(history_messages)} history messages to LangChain format")
|
|
260
259
|
lc_messages = []
|
|
261
|
-
for msg in history_messages:
|
|
260
|
+
for i, msg in enumerate(history_messages):
|
|
261
|
+
msg_text = getattr(msg, "text", "N/A")[:50] if hasattr(msg, "text") else "N/A"
|
|
262
|
+
logger.debug(
|
|
263
|
+
f"[CUGA] Message {i}: type={type(msg)}, sender={getattr(msg, 'sender', 'N/A')}, "
|
|
264
|
+
f"text={msg_text}..."
|
|
265
|
+
)
|
|
262
266
|
if hasattr(msg, "sender") and msg.sender == "Human":
|
|
263
267
|
lc_messages.append(HumanMessage(content=msg.text))
|
|
264
268
|
else:
|
|
265
269
|
lc_messages.append(AIMessage(content=msg.text))
|
|
266
270
|
|
|
271
|
+
logger.debug(f"[CUGA] Converted to {len(lc_messages)} LangChain messages")
|
|
267
272
|
await asyncio.sleep(0.5)
|
|
268
273
|
|
|
269
274
|
# 2. Build final response
|
|
@@ -274,7 +279,9 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
274
279
|
last_event: StreamEvent | None = None
|
|
275
280
|
tool_run_id: str | None = None
|
|
276
281
|
# 3. Chain end event with AgentFinish
|
|
277
|
-
async for event in cuga_agent.run_task_generic_yield(
|
|
282
|
+
async for event in cuga_agent.run_task_generic_yield(
|
|
283
|
+
eval_mode=False, goal=current_input, chat_messages=lc_messages
|
|
284
|
+
):
|
|
278
285
|
logger.debug(f"[CUGA] recieved event {event}")
|
|
279
286
|
if last_event is not None and tool_run_id is not None:
|
|
280
287
|
logger.debug(f"[CUGA] last event {last_event}")
|
|
@@ -350,12 +357,12 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
350
357
|
raise ValueError(msg)
|
|
351
358
|
|
|
352
359
|
try:
|
|
353
|
-
llm_model, self.chat_history, self.tools = await self.get_agent_requirements()
|
|
354
|
-
|
|
355
|
-
# Create agent message for event processing
|
|
356
360
|
from lfx.schema.content_block import ContentBlock
|
|
357
361
|
from lfx.schema.message import MESSAGE_SENDER_AI
|
|
358
362
|
|
|
363
|
+
llm_model, self.chat_history, self.tools = await self.get_agent_requirements()
|
|
364
|
+
|
|
365
|
+
# Create agent message for event processing
|
|
359
366
|
agent_message = Message(
|
|
360
367
|
sender=MESSAGE_SENDER_AI,
|
|
361
368
|
sender_name="Cuga",
|
|
@@ -368,7 +375,7 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
368
375
|
# This ensures streaming works even when not connected to ChatOutput
|
|
369
376
|
if not self.is_connected_to_chat_output():
|
|
370
377
|
# When not connected to ChatOutput, assign ID upfront for streaming support
|
|
371
|
-
agent_message.data["id"] =
|
|
378
|
+
agent_message.data["id"] = uuid.uuid4()
|
|
372
379
|
|
|
373
380
|
# Get input text
|
|
374
381
|
input_text = self.input_value.text if hasattr(self.input_value, "text") else str(self.input_value)
|
|
@@ -476,9 +483,14 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
476
483
|
"""
|
|
477
484
|
logger.debug("[CUGA] Retrieving chat history messages.")
|
|
478
485
|
logger.debug(f"[CUGA] Session ID: {self.graph.session_id}")
|
|
486
|
+
logger.debug(f"[CUGA] n_messages: {self.n_messages}")
|
|
487
|
+
logger.debug(f"[CUGA] input_value: {self.input_value}")
|
|
488
|
+
logger.debug(f"[CUGA] input_value type: {type(self.input_value)}")
|
|
489
|
+
logger.debug(f"[CUGA] input_value id: {getattr(self.input_value, 'id', None)}")
|
|
490
|
+
|
|
479
491
|
messages = (
|
|
480
492
|
await MemoryComponent(**self.get_base_args())
|
|
481
|
-
.set(session_id=self.graph.session_id, order="Ascending", n_messages=self.n_messages)
|
|
493
|
+
.set(session_id=str(self.graph.session_id), order="Ascending", n_messages=self.n_messages)
|
|
482
494
|
.retrieve_messages()
|
|
483
495
|
)
|
|
484
496
|
logger.debug(f"[CUGA] Retrieved {len(messages)} messages from memory")
|
|
@@ -678,7 +690,7 @@ class CugaComponent(ToolCallingAgentComponent):
|
|
|
678
690
|
"tools",
|
|
679
691
|
"input_value",
|
|
680
692
|
"add_current_date_tool",
|
|
681
|
-
"
|
|
693
|
+
"instructions",
|
|
682
694
|
"agent_description",
|
|
683
695
|
"max_iterations",
|
|
684
696
|
"handle_parsing_errors",
|
|
@@ -493,11 +493,13 @@ class APIRequestComponent(Component):
|
|
|
493
493
|
return self.parse_curl(self.curl_input, build_config)
|
|
494
494
|
return build_config
|
|
495
495
|
|
|
496
|
-
# print(f"Current mode: {field_value}")
|
|
497
496
|
if field_value == "cURL":
|
|
498
497
|
set_field_display(build_config, "curl_input", value=True)
|
|
499
498
|
if build_config["curl_input"]["value"]:
|
|
500
|
-
|
|
499
|
+
try:
|
|
500
|
+
build_config = self.parse_curl(build_config["curl_input"]["value"], build_config)
|
|
501
|
+
except ValueError as e:
|
|
502
|
+
self.log(f"Failed to parse cURL input: {e}")
|
|
501
503
|
else:
|
|
502
504
|
set_field_display(build_config, "curl_input", value=False)
|
|
503
505
|
|
|
@@ -3,35 +3,69 @@ from __future__ import annotations
|
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
|
4
4
|
|
|
5
5
|
from lfx.components._importing import import_mod
|
|
6
|
+
from lfx.utils.validate_cloud import is_astra_cloud_environment
|
|
6
7
|
|
|
7
8
|
if TYPE_CHECKING:
|
|
8
|
-
from .chunk_docling_document import ChunkDoclingDocumentComponent
|
|
9
|
-
from .docling_inline import DoclingInlineComponent
|
|
10
|
-
from .docling_remote import DoclingRemoteComponent
|
|
11
|
-
from .export_docling_document import ExportDoclingDocumentComponent
|
|
9
|
+
from .chunk_docling_document import ChunkDoclingDocumentComponent # noqa: F401
|
|
10
|
+
from .docling_inline import DoclingInlineComponent # noqa: F401
|
|
11
|
+
from .docling_remote import DoclingRemoteComponent # noqa: F401
|
|
12
|
+
from .export_docling_document import ExportDoclingDocumentComponent # noqa: F401
|
|
12
13
|
|
|
13
|
-
|
|
14
|
+
_all_components = [
|
|
15
|
+
"ChunkDoclingDocumentComponent",
|
|
16
|
+
"DoclingInlineComponent",
|
|
17
|
+
"DoclingRemoteComponent",
|
|
18
|
+
"ExportDoclingDocumentComponent",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
_all_dynamic_imports = {
|
|
14
22
|
"ChunkDoclingDocumentComponent": "chunk_docling_document",
|
|
15
23
|
"DoclingInlineComponent": "docling_inline",
|
|
16
24
|
"DoclingRemoteComponent": "docling_remote",
|
|
17
25
|
"ExportDoclingDocumentComponent": "export_docling_document",
|
|
18
26
|
}
|
|
19
27
|
|
|
20
|
-
|
|
28
|
+
# Components that require local Docling/EasyOCR dependencies (disabled in cloud)
|
|
29
|
+
_cloud_disabled_components = {
|
|
21
30
|
"ChunkDoclingDocumentComponent",
|
|
22
31
|
"DoclingInlineComponent",
|
|
23
|
-
"DoclingRemoteComponent",
|
|
24
32
|
"ExportDoclingDocumentComponent",
|
|
25
|
-
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _get_available_components() -> list[str]:
|
|
37
|
+
"""Get list of available components, filtering out cloud-disabled ones."""
|
|
38
|
+
if is_astra_cloud_environment():
|
|
39
|
+
# Only show DoclingRemoteComponent (Docling Serve) in cloud
|
|
40
|
+
return [comp for comp in _all_components if comp not in _cloud_disabled_components]
|
|
41
|
+
return _all_components
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_dynamic_imports() -> dict[str, str]:
|
|
45
|
+
"""Get dynamic imports dict, filtering out cloud-disabled ones."""
|
|
46
|
+
if is_astra_cloud_environment():
|
|
47
|
+
# Only allow DoclingRemoteComponent (Docling Serve) in cloud
|
|
48
|
+
return {k: v for k, v in _all_dynamic_imports.items() if k not in _cloud_disabled_components}
|
|
49
|
+
return _all_dynamic_imports
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Dynamically set __all__ and _dynamic_imports based on cloud environment
|
|
53
|
+
__all__: list[str] = _get_available_components() # noqa: PLE0605
|
|
54
|
+
_dynamic_imports: dict[str, str] = _get_dynamic_imports()
|
|
26
55
|
|
|
27
56
|
|
|
28
57
|
def __getattr__(attr_name: str) -> Any:
|
|
29
58
|
"""Lazily import docling components on attribute access."""
|
|
30
|
-
if
|
|
59
|
+
# Check if component is available (not disabled in cloud)
|
|
60
|
+
if is_astra_cloud_environment() and attr_name in _cloud_disabled_components:
|
|
61
|
+
msg = f"module '{__name__}' has no attribute '{attr_name}'"
|
|
62
|
+
raise AttributeError(msg)
|
|
63
|
+
|
|
64
|
+
if attr_name not in _all_dynamic_imports:
|
|
31
65
|
msg = f"module '{__name__}' has no attribute '{attr_name}'"
|
|
32
66
|
raise AttributeError(msg)
|
|
33
67
|
try:
|
|
34
|
-
result = import_mod(attr_name,
|
|
68
|
+
result = import_mod(attr_name, _all_dynamic_imports[attr_name], __spec__.parent)
|
|
35
69
|
except (ModuleNotFoundError, ImportError, AttributeError) as e:
|
|
36
70
|
msg = f"Could not import '{attr_name}' from '{__name__}': {e}"
|
|
37
71
|
raise AttributeError(msg) from e
|
|
@@ -40,4 +74,4 @@ def __getattr__(attr_name: str) -> Any:
|
|
|
40
74
|
|
|
41
75
|
|
|
42
76
|
def __dir__() -> list[str]:
|
|
43
|
-
return
|
|
77
|
+
return _get_available_components()
|
|
@@ -115,7 +115,9 @@ class ChunkDoclingDocumentComponent(Component):
|
|
|
115
115
|
return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]
|
|
116
116
|
|
|
117
117
|
def chunk_documents(self) -> DataFrame:
|
|
118
|
-
documents = extract_docling_documents(self.data_inputs, self.doc_key)
|
|
118
|
+
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
|
|
119
|
+
if warning:
|
|
120
|
+
self.status = warning
|
|
119
121
|
|
|
120
122
|
chunker: BaseChunker
|
|
121
123
|
if self.chunker == "HybridChunker":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
import queue
|
|
2
|
+
import threading
|
|
1
3
|
import time
|
|
2
|
-
from multiprocessing import Queue, get_context
|
|
3
|
-
from queue import Empty
|
|
4
4
|
|
|
5
5
|
from lfx.base.data import BaseFileComponent
|
|
6
6
|
from lfx.base.data.docling_utils import _serialize_pydantic_model, docling_worker
|
|
@@ -92,60 +92,57 @@ class DoclingInlineComponent(BaseFileComponent):
|
|
|
92
92
|
*BaseFileComponent.get_base_outputs(),
|
|
93
93
|
]
|
|
94
94
|
|
|
95
|
-
def
|
|
96
|
-
|
|
95
|
+
def _wait_for_result_with_thread_monitoring(
|
|
96
|
+
self, result_queue: queue.Queue, thread: threading.Thread, timeout: int = 300
|
|
97
|
+
):
|
|
98
|
+
"""Wait for result from queue while monitoring thread health.
|
|
97
99
|
|
|
98
|
-
Handles cases where
|
|
100
|
+
Handles cases where thread crashes without sending result.
|
|
99
101
|
"""
|
|
100
102
|
start_time = time.time()
|
|
101
103
|
|
|
102
104
|
while time.time() - start_time < timeout:
|
|
103
|
-
# Check if
|
|
104
|
-
if not
|
|
105
|
-
#
|
|
105
|
+
# Check if thread is still alive
|
|
106
|
+
if not thread.is_alive():
|
|
107
|
+
# Thread finished, try to get any result it might have sent
|
|
106
108
|
try:
|
|
107
|
-
result =
|
|
108
|
-
except Empty:
|
|
109
|
-
#
|
|
110
|
-
msg =
|
|
109
|
+
result = result_queue.get_nowait()
|
|
110
|
+
except queue.Empty:
|
|
111
|
+
# Thread finished without sending result
|
|
112
|
+
msg = "Worker thread crashed unexpectedly without producing result."
|
|
111
113
|
raise RuntimeError(msg) from None
|
|
112
114
|
else:
|
|
113
|
-
self.log("
|
|
115
|
+
self.log("Thread completed and result retrieved")
|
|
114
116
|
return result
|
|
115
117
|
|
|
116
118
|
# Poll the queue instead of blocking
|
|
117
119
|
try:
|
|
118
|
-
result =
|
|
119
|
-
except Empty:
|
|
120
|
+
result = result_queue.get(timeout=1)
|
|
121
|
+
except queue.Empty:
|
|
120
122
|
# No result yet, continue monitoring
|
|
121
123
|
continue
|
|
122
124
|
else:
|
|
123
|
-
self.log("Result received from worker
|
|
125
|
+
self.log("Result received from worker thread")
|
|
124
126
|
return result
|
|
125
127
|
|
|
126
128
|
# Overall timeout reached
|
|
127
|
-
msg = f"
|
|
129
|
+
msg = f"Thread timed out after {timeout} seconds"
|
|
128
130
|
raise TimeoutError(msg)
|
|
129
131
|
|
|
130
|
-
def
|
|
131
|
-
"""
|
|
132
|
+
def _stop_thread_gracefully(self, thread: threading.Thread, timeout: int = 10):
|
|
133
|
+
"""Wait for thread to complete gracefully.
|
|
132
134
|
|
|
133
|
-
|
|
135
|
+
Note: Python threads cannot be forcefully killed, so we just wait.
|
|
136
|
+
The thread should respond to shutdown signals via the queue.
|
|
134
137
|
"""
|
|
135
|
-
if not
|
|
138
|
+
if not thread.is_alive():
|
|
136
139
|
return
|
|
137
140
|
|
|
138
|
-
self.log("
|
|
139
|
-
|
|
140
|
-
proc.join(timeout=timeout_terminate)
|
|
141
|
+
self.log("Waiting for thread to complete gracefully")
|
|
142
|
+
thread.join(timeout=timeout)
|
|
141
143
|
|
|
142
|
-
if
|
|
143
|
-
self.log("
|
|
144
|
-
proc.kill() # Send SIGKILL
|
|
145
|
-
proc.join(timeout=timeout_kill)
|
|
146
|
-
|
|
147
|
-
if proc.is_alive():
|
|
148
|
-
self.log("Warning: Process still alive after SIGKILL")
|
|
144
|
+
if thread.is_alive():
|
|
145
|
+
self.log("Warning: Thread still alive after timeout")
|
|
149
146
|
|
|
150
147
|
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
|
|
151
148
|
try:
|
|
@@ -167,44 +164,37 @@ class DoclingInlineComponent(BaseFileComponent):
|
|
|
167
164
|
if self.pic_desc_llm is not None:
|
|
168
165
|
pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm)
|
|
169
166
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
167
|
+
# Use threading instead of multiprocessing for memory sharing
|
|
168
|
+
# This enables the global DocumentConverter cache to work across runs
|
|
169
|
+
result_queue: queue.Queue = queue.Queue()
|
|
170
|
+
thread = threading.Thread(
|
|
173
171
|
target=docling_worker,
|
|
174
172
|
kwargs={
|
|
175
173
|
"file_paths": file_paths,
|
|
176
|
-
"queue":
|
|
174
|
+
"queue": result_queue,
|
|
177
175
|
"pipeline": self.pipeline,
|
|
178
176
|
"ocr_engine": self.ocr_engine,
|
|
179
177
|
"do_picture_classification": self.do_picture_classification,
|
|
180
178
|
"pic_desc_config": pic_desc_config,
|
|
181
179
|
"pic_desc_prompt": self.pic_desc_prompt,
|
|
182
180
|
},
|
|
181
|
+
daemon=False, # Allow thread to complete even if main thread exits
|
|
183
182
|
)
|
|
184
183
|
|
|
185
184
|
result = None
|
|
186
|
-
|
|
185
|
+
thread.start()
|
|
187
186
|
|
|
188
187
|
try:
|
|
189
|
-
result = self.
|
|
188
|
+
result = self._wait_for_result_with_thread_monitoring(result_queue, thread, timeout=300)
|
|
190
189
|
except KeyboardInterrupt:
|
|
191
|
-
self.log("Docling
|
|
190
|
+
self.log("Docling thread cancelled by user")
|
|
192
191
|
result = []
|
|
193
192
|
except Exception as e:
|
|
194
193
|
self.log(f"Error during processing: {e}")
|
|
195
194
|
raise
|
|
196
195
|
finally:
|
|
197
|
-
#
|
|
198
|
-
|
|
199
|
-
self._terminate_process_gracefully(proc)
|
|
200
|
-
finally:
|
|
201
|
-
# Always close and cleanup queue resources
|
|
202
|
-
try:
|
|
203
|
-
queue.close()
|
|
204
|
-
queue.join_thread()
|
|
205
|
-
except Exception as e: # noqa: BLE001
|
|
206
|
-
# Ignore cleanup errors, but log them
|
|
207
|
-
self.log(f"Warning: Error during queue cleanup - {e}")
|
|
196
|
+
# Wait for thread to complete gracefully
|
|
197
|
+
self._stop_thread_gracefully(thread)
|
|
208
198
|
|
|
209
199
|
# Enhanced error checking with dependency-specific handling
|
|
210
200
|
if isinstance(result, dict) and "error" in result:
|
|
@@ -86,7 +86,9 @@ class ExportDoclingDocumentComponent(Component):
|
|
|
86
86
|
return build_config
|
|
87
87
|
|
|
88
88
|
def export_document(self) -> list[Data]:
|
|
89
|
-
documents = extract_docling_documents(self.data_inputs, self.doc_key)
|
|
89
|
+
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
|
|
90
|
+
if warning:
|
|
91
|
+
self.status = warning
|
|
90
92
|
|
|
91
93
|
results: list[Data] = []
|
|
92
94
|
try:
|