lfx-nightly 0.2.0.dev41__py3-none-any.whl → 0.3.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/__main__.py +137 -6
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +10 -6
- lfx/base/agents/altk_base_agent.py +5 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/events.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +130 -55
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/embeddings/embeddings_class.py +113 -0
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +43 -4
- lfx/base/prompts/api_utils.py +40 -5
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/__init__.py +10 -2
- lfx/cli/commands.py +3 -0
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +18 -7
- lfx/cli/validation.py +6 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/datastax/astradb_assistant_manager.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/docling_remote.py +1 -0
- lfx/components/elastic/opensearch_multimodal.py +1733 -0
- lfx/components/files_and_knowledge/file.py +384 -36
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +91 -88
- lfx/components/langchain_utilities/ibm_granite_handler.py +211 -0
- lfx/components/langchain_utilities/tool_calling.py +37 -6
- lfx/components/llm_operations/batch_run.py +64 -18
- lfx/components/llm_operations/lambda_filter.py +213 -101
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -203
- lfx/components/models_and_agents/embedding_model.py +171 -255
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +96 -10
- lfx/components/models_and_agents/prompt.py +105 -18
- lfx/components/ollama/ollama_embeddings.py +111 -29
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/text_operations.py +580 -0
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/component.py +65 -10
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/events/observability/__init__.py +0 -0
- lfx/events/observability/lifecycle_events.py +111 -0
- lfx/field_typing/__init__.py +57 -58
- lfx/graph/graph/base.py +40 -1
- lfx/graph/utils.py +109 -30
- lfx/graph/vertex/base.py +75 -23
- lfx/graph/vertex/vertex_types.py +0 -5
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +55 -0
- lfx/inputs/inputs.py +120 -0
- lfx/interface/components.py +24 -7
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +464 -0
- lfx/schema/__init__.py +50 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +26 -7
- lfx/schema/message.py +104 -11
- lfx/schema/workflow.py +171 -0
- lfx/services/deps.py +12 -0
- lfx/services/interfaces.py +43 -1
- lfx/services/mcp_composer/service.py +7 -1
- lfx/services/schema.py +1 -0
- lfx/services/settings/auth.py +95 -4
- lfx/services/settings/base.py +11 -1
- lfx/services/settings/constants.py +2 -0
- lfx/services/settings/utils.py +82 -0
- lfx/services/storage/local.py +13 -8
- lfx/services/transaction/__init__.py +5 -0
- lfx/services/transaction/service.py +35 -0
- lfx/tests/unit/components/__init__.py +0 -0
- lfx/utils/constants.py +2 -0
- lfx/utils/mustache_security.py +79 -0
- lfx/utils/validate_cloud.py +81 -3
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/METADATA +7 -2
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/RECORD +98 -80
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/entry_points.txt +0 -0
lfx/base/agents/agent.py
CHANGED
@@ -71,8 +71,8 @@ class LCAgentComponent(Component):
     ]

     outputs = [
-        Output(display_name="Agent", name="agent", method="build_agent", hidden=True, tool_mode=False),
         Output(display_name="Response", name="response", method="message_response"),
+        Output(display_name="Agent", name="agent", method="build_agent", tool_mode=False),
     ]

     # Get shared callbacks for tracing and save them to self.shared_callbacks
@@ -185,8 +185,10 @@ class LCAgentComponent(Component):
        if "input" not in input_dict:
            input_dict = {"input": self.input_value}

-        if
-
+        # Use enhanced prompt if available (set by IBM Granite handler), otherwise use original
+        system_prompt_to_use = getattr(self, "_effective_system_prompt", None) or self.system_prompt
+        if system_prompt_to_use and system_prompt_to_use.strip():
+            input_dict["system_prompt"] = system_prompt_to_use

        if hasattr(self, "chat_history") and self.chat_history:
            if isinstance(self.chat_history, Data):
@@ -272,9 +274,11 @@ class LCAgentComponent(Component):
                on_token_callback,
            )
        except ExceptionWithMessageError as e:
-            if
-
-
+            # Only delete message from database if it has an ID (was stored)
+            if hasattr(e, "agent_message"):
+                msg_id = e.agent_message.get_id()
+                if msg_id:
+                    await delete_message(id_=msg_id)
            await self._send_message_event(e.agent_message, category="remove_message")
            logger.error(f"ExceptionWithMessageError: {e}")
            raise
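Both agent hunks in this release apply the same guard: `delete_message` is only called when the failed message was actually persisted, i.e. when `get_id()` returns a non-empty ID. A minimal runnable sketch of the pattern; `Message` and `delete_message` here are illustrative stand-ins, not the real lfx types:

```python
# Sketch of the guarded-delete pattern from the hunks above; Message and
# delete_message are hypothetical stand-ins, not the lfx APIs.
import asyncio


class Message:
    def __init__(self, message_id: str | None = None):
        self._id = message_id

    def get_id(self) -> str | None:
        # Returns None when the message was never stored.
        return self._id


async def delete_message(id_: str) -> None:
    print(f"deleted {id_}")


async def cleanup(msg: Message) -> None:
    # Only delete from the database if the message has an ID (was stored).
    msg_id = msg.get_id()
    if msg_id:
        await delete_message(id_=msg_id)


asyncio.run(cleanup(Message()))       # no-op: message was never persisted
asyncio.run(cleanup(Message("abc")))  # prints "deleted abc"
```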
lfx/base/agents/altk_base_agent.py
CHANGED

@@ -378,9 +378,11 @@ class ALTKBaseAgentComponent(AgentComponent):
                cast("SendMessageFunctionType", self.send_message),
            )
        except ExceptionWithMessageError as e:
-            if
-
-
+            # Only delete message from database if it has an ID (was stored)
+            if hasattr(e, "agent_message"):
+                msg_id = e.agent_message.get_id()
+                if msg_id:
+                    await delete_message(id_=msg_id)
            await self._send_message_event(e.agent_message, category="remove_message")
            logger.error(f"ExceptionWithMessageError: {e}")
            raise
lfx/base/agents/altk_tool_wrappers.py
CHANGED

@@ -513,7 +513,7 @@ class PostToolProcessor(ALTKBaseTool):
        output = None
        try:
            output = middleware.process(input_data, AgentPhase.RUNTIME)
-        except
+        except Exception as e:  # noqa: BLE001
            logger.error(f"Exception in executing CodeGenerationComponent: {e}")
        if output is not None and hasattr(output, "result"):
            logger.info(f"Output of CodeGenerationComponent: {output.result}")
lfx/base/agents/events.py
CHANGED
@@ -388,7 +388,7 @@ async def process_agent_events(
    agent_message = await send_message_callback(message=agent_message)
    # Capture the original message id - this must stay consistent throughout if streaming
    # Message may not contain id if the Agent is not connected to a Chat Output (_should_skip_message is True)
-    initial_message_id = agent_message.
+    initial_message_id = agent_message.get_id()
    try:
        # Create a mapping of run_ids to tool contents
        tool_blocks_map: dict[str, ToolContent] = {}
lfx/base/agents/utils.py
CHANGED
@@ -224,6 +224,10 @@ def get_chat_output_sender_name(self) -> str | None:
    if not hasattr(self, "graph") or not self.graph:
        return None

+    # Check if graph has vertices attribute (PlaceholderGraph doesn't)
+    if not hasattr(self.graph, "vertices"):
+        return None
+
    for vertex in self.graph.vertices:
        # Safely check if vertex has data attribute, correct type, and raw_params
        if (
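The `utils.py` change guards against graph stand-ins (such as `PlaceholderGraph`) that expose no `vertices` attribute. A small sketch of the same defensive check, with illustrative classes in place of the real lfx graph types:

```python
# Sketch of the defensive attribute check above; these classes are
# illustrative stand-ins, not the lfx graph types.
class PlaceholderGraph:
    """Lightweight graph stand-in with no vertices attribute."""


class RealGraph:
    def __init__(self):
        self.vertices = ["chat_output_vertex"]


def first_vertex(graph) -> str | None:
    if not graph:
        return None
    # PlaceholderGraph has no `vertices`; bail out instead of raising.
    if not hasattr(graph, "vertices"):
        return None
    return graph.vertices[0] if graph.vertices else None


print(first_vertex(PlaceholderGraph()))  # None
print(first_vertex(RealGraph()))         # chat_output_vertex
```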
lfx/base/composio/composio_base.py
CHANGED

@@ -41,6 +41,58 @@ class ComposioBaseComponent(Component):

    default_tools_limit: int = 5

+    # Reserved attribute names that conflict with Component base class
+    RESERVED_ATTRIBUTES: set[str] = {
+        # Core component attributes
+        "name",
+        "description",
+        "status",
+        "display_name",
+        "icon",
+        "priority",
+        "code",
+        "inputs",
+        "outputs",
+        "selected_output",
+        # Properties and methods
+        "trace_type",
+        "trace_name",
+        "function",
+        "repr_value",
+        "field_config",
+        "field_order",
+        "frozen",
+        "build_parameters",
+        "cache",
+        "tools_metadata",
+        "vertex",
+        # User and session attributes
+        "user_id",  # Already handled separately but included for completeness
+        "session_id",
+        "flow_id",
+        "flow_name",
+        "context",
+        # Common method names
+        "build",
+        "run",
+        "stop",
+        "start",
+        "validate",
+        "get_function",
+        "set_attributes",
+        # Additional common conflicts
+        "id",
+        "type",
+        "value",
+        "metadata",
+        "logs",
+        "results",
+        "artifacts",
+        "parameters",
+        "template",
+        "config",
+    }
+
    _base_inputs = [
        MessageTextInput(
            name="entity_id",
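Consolidating every conflicting name into `RESERVED_ATTRIBUTES` lets all call sites share one rename rule instead of special-casing `user_id`, `status`, and `name` individually, as the removed code in the next hunks did. A compact sketch of the forward rename, using a trimmed-down reserved set for illustration:

```python
# Illustrative subset of the reserved-name prefixing; the real component
# uses the full RESERVED_ATTRIBUTES set shown in the diff above.
RESERVED_ATTRIBUTES: set[str] = {"name", "status", "user_id"}


def rename_if_reserved(clean_field: str, app_name: str) -> str:
    # Prefix with the app name to avoid clashing with Component attributes.
    if clean_field in RESERVED_ATTRIBUTES:
        return f"{app_name}_{clean_field}"
    return clean_field


assert rename_if_reserved("status", "github") == "github_status"
assert rename_if_reserved("title", "github") == "title"
```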
@@ -623,13 +675,9 @@ class ComposioBaseComponent(Component):
                attachment_related_found = True
                continue  # Skip individual attachment fields

-            # Handle
-            if clean_field == "user_id":
-                clean_field = f"{self.app_name}_user_id"
-
-            # Handle reserved attribute name conflicts (e.g., 'status', 'name')
+            # Handle reserved attribute name conflicts
            # Prefix with app name to prevent clashes with component attributes
-            if clean_field in
+            if clean_field in self.RESERVED_ATTRIBUTES:
                clean_field = f"{self.app_name}_{clean_field}"

            action_fields.append(clean_field)
@@ -795,28 +843,16 @@ class ComposioBaseComponent(Component):
                # Don't add individual attachment sub-fields to the schema
                continue

-            # Handle
-            if clean_field_name
-
+            # Handle reserved attribute name conflicts
+            if clean_field_name in self.RESERVED_ATTRIBUTES:
+                original_name = clean_field_name
+                clean_field_name = f"{self.app_name}_{clean_field_name}"
                # Update the field schema description to reflect the name change
                field_schema_copy = field_schema.copy()
+                original_description = field_schema.get("description", "")
                field_schema_copy["description"] = (
-                    f"
-                )
-            elif clean_field_name == "status":
-                clean_field_name = f"{self.app_name}_status"
-                # Update the field schema description to reflect the name change
-                field_schema_copy = field_schema.copy()
-                field_schema_copy["description"] = f"Status for {self.app_name.title()}: " + field_schema.get(
-                    "description", ""
-                )
-            elif clean_field_name == "name":
-                clean_field_name = f"{self.app_name}_name"
-                # Update the field schema description to reflect the name change
-                field_schema_copy = field_schema.copy()
-                field_schema_copy["description"] = f"Name for {self.app_name.title()}: " + field_schema.get(
-                    "description", ""
-                )
+                    f"{original_name.replace('_', ' ').title()} for {self.app_name.title()}: {original_description}"
+                ).strip()
            else:
                # Use the original field schema for all other fields
                field_schema_copy = field_schema
@@ -842,12 +878,8 @@ class ComposioBaseComponent(Component):
            cleaned_required = []
            for field in flat_schema["required"]:
                base = field.replace("[0]", "")
-                if base
-                    cleaned_required.append(f"{self.app_name}
-                elif base == "status":
-                    cleaned_required.append(f"{self.app_name}_status")
-                elif base == "name":
-                    cleaned_required.append(f"{self.app_name}_name")
+                if base in self.RESERVED_ATTRIBUTES:
+                    cleaned_required.append(f"{self.app_name}_{base}")
                else:
                    cleaned_required.append(base)
            flat_schema["required"] = cleaned_required
@@ -943,9 +975,10 @@ class ComposioBaseComponent(Component):
                inp.advanced = True

            # Skip entity_id being mapped to user_id parameter
-
-
-
+            # Check both original name and renamed version
+            if inp.name in {"user_id", f"{self.app_name}_user_id"} and getattr(
+                self, "entity_id", None
+            ) == getattr(inp, "value", None):
                continue

            processed_inputs.append(inp)
@@ -2422,12 +2455,11 @@ class ComposioBaseComponent(Component):

            # Handle renamed fields - map back to original names for API execution
            final_field_name = field
-            if
-
-
-
-
-                final_field_name = "name"
+            # Check if this is a renamed reserved attribute
+            if field.startswith(f"{self.app_name}_"):
+                potential_original = field[len(self.app_name) + 1 :]  # Remove app_name prefix
+                if potential_original in self.RESERVED_ATTRIBUTES:
+                    final_field_name = potential_original

            arguments[final_field_name] = value

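At execution time the rename is reversed, and only when the stripped suffix really is a reserved attribute, so a legitimate field that merely starts with the app name is left untouched. A sketch with the same trimmed-down set as before:

```python
# Sketch of the reverse mapping applied before API execution; the reserved
# set here is a trimmed illustration of the component's full set.
RESERVED_ATTRIBUTES: set[str] = {"name", "status", "user_id"}


def original_field_name(field: str, app_name: str) -> str:
    if field.startswith(f"{app_name}_"):
        potential_original = field[len(app_name) + 1 :]  # strip "<app>_"
        if potential_original in RESERVED_ATTRIBUTES:
            return potential_original
    return field


assert original_field_name("github_status", "github") == "status"
# Not a reserved attribute, so the prefixed name is kept as-is:
assert original_field_name("github_repo", "github") == "github_repo"
```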
@@ -2538,7 +2570,7 @@ class ComposioBaseComponent(Component):
        build_config[fname]["value"] = "" if fname not in self._bool_variables else False
        # Hide any other visible, non-protected fields that look like parameters
        protected = {
-
+            # Component control fields
            "entity_id",
            "api_key",
            "auth_link",
@@ -2570,6 +2602,11 @@ class ComposioBaseComponent(Component):
            "instance_url",
            "tenant_id",
        }
+        # Add all reserved Component attributes to protected set
+        protected.update(self.RESERVED_ATTRIBUTES)
+        # Also add the renamed versions (with app_name prefix) to protected set
+        for attr in self.RESERVED_ATTRIBUTES:
+            protected.add(f"{self.app_name}_{attr}")
        # Add all dynamic auth fields to protected set
        protected.update(self._auth_dynamic_fields)
        # Also protect any auth fields discovered across all instances
lfx/base/data/cloud_storage_utils.py
ADDED

@@ -0,0 +1,156 @@
+"""Shared utilities for cloud storage operations (AWS S3 and Google Drive).
+
+This module provides common functionality used by both read and write file components
+to avoid code duplication.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def validate_aws_credentials(component: Any) -> None:
+    """Validate that required AWS S3 credentials are present.
+
+    Args:
+        component: Component instance with AWS credential attributes
+
+    Raises:
+        ValueError: If any required credential is missing
+    """
+    if not getattr(component, "aws_access_key_id", None):
+        msg = "AWS Access Key ID is required for S3 storage"
+        raise ValueError(msg)
+    if not getattr(component, "aws_secret_access_key", None):
+        msg = "AWS Secret Key is required for S3 storage"
+        raise ValueError(msg)
+    if not getattr(component, "bucket_name", None):
+        msg = "S3 Bucket Name is required for S3 storage"
+        raise ValueError(msg)
+
+
+def create_s3_client(component: Any):
+    """Create and return a configured boto3 S3 client.
+
+    Args:
+        component: Component instance with AWS credential attributes
+
+    Returns:
+        boto3 S3 client instance
+
+    Raises:
+        ImportError: If boto3 is not installed
+    """
+    try:
+        import boto3
+    except ImportError as e:
+        msg = "boto3 is not installed. Please install it using `uv pip install boto3`."
+        raise ImportError(msg) from e
+
+    client_config = {
+        "aws_access_key_id": component.aws_access_key_id,
+        "aws_secret_access_key": component.aws_secret_access_key,
+    }
+
+    if hasattr(component, "aws_region") and component.aws_region:
+        client_config["region_name"] = component.aws_region
+
+    return boto3.client("s3", **client_config)
+
+
+def parse_google_service_account_key(service_account_key: str) -> dict:
+    """Parse Google service account JSON key with multiple fallback strategies.
+
+    This function handles various common formatting issues when users paste
+    service account keys, including:
+    - Control characters
+    - Extra whitespace
+    - Double-encoded JSON strings
+    - Escaped newlines in private_key field
+
+    Args:
+        service_account_key: Service account JSON key as string
+
+    Returns:
+        dict: Parsed service account credentials
+
+    Raises:
+        ValueError: If all parsing strategies fail
+    """
+    credentials_dict = None
+    parse_errors = []
+
+    # Strategy 1: Parse as-is with strict=False to allow control characters
+    try:
+        credentials_dict = json.loads(service_account_key, strict=False)
+    except json.JSONDecodeError as e:
+        parse_errors.append(f"Standard parse: {e!s}")
+
+    # Strategy 2: Strip whitespace and try again
+    if credentials_dict is None:
+        try:
+            cleaned_key = service_account_key.strip()
+            credentials_dict = json.loads(cleaned_key, strict=False)
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Stripped parse: {e!s}")
+
+    # Strategy 3: Check if it's double-encoded (JSON string of a JSON string)
+    if credentials_dict is None:
+        try:
+            decoded_once = json.loads(service_account_key, strict=False)
+            credentials_dict = json.loads(decoded_once, strict=False) if isinstance(decoded_once, str) else decoded_once
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Double-encoded parse: {e!s}")
+
+    # Strategy 4: Try to fix common issues with newlines in the private_key field
+    if credentials_dict is None:
+        try:
+            # Replace literal \n with actual newlines which is common in pasted JSON
+            fixed_key = service_account_key.replace("\\n", "\n")
+            credentials_dict = json.loads(fixed_key, strict=False)
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Newline-fixed parse: {e!s}")
+
+    if credentials_dict is None:
+        error_details = "; ".join(parse_errors)
+        msg = (
+            f"Unable to parse service account key JSON. Tried multiple strategies: {error_details}. "
+            "Please ensure you've copied the entire JSON content from your service account key file. "
+            "The JSON should start with '{' and contain fields like 'type', 'project_id', 'private_key', etc."
+        )
+        raise ValueError(msg)
+
+    return credentials_dict
+
+
+def create_google_drive_service(service_account_key: str, scopes: list[str], *, return_credentials: bool = False):
+    """Create and return a configured Google Drive API service.
+
+    Args:
+        service_account_key: Service account JSON key as string
+        scopes: List of Google API scopes to request
+        return_credentials: If True, return both service and credentials as tuple
+
+    Returns:
+        Google Drive API service instance, or tuple of (service, credentials) if return_credentials=True
+
+    Raises:
+        ImportError: If Google API client libraries are not installed
+        ValueError: If credentials cannot be parsed
+    """
+    try:
+        from google.oauth2 import service_account
+        from googleapiclient.discovery import build
+    except ImportError as e:
+        msg = "Google API client libraries are not installed. Please install them."
+        raise ImportError(msg) from e
+
+    credentials_dict = parse_google_service_account_key(service_account_key)
+
+    credentials = service_account.Credentials.from_service_account_info(credentials_dict, scopes=scopes)
+    service = build("drive", "v3", credentials=credentials)
+
+    if return_credentials:
+        return service, credentials
+    return service
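A short usage sketch for the new helpers, assuming the module is importable from the path shown in the file list and that `boto3` is installed; the component object and credential values below are placeholders, not working secrets:

```python
# Hypothetical caller for the new cloud_storage_utils helpers; the
# credential values are placeholders.
from types import SimpleNamespace

from lfx.base.data.cloud_storage_utils import (
    create_s3_client,
    validate_aws_credentials,
)

component = SimpleNamespace(
    aws_access_key_id="AKIA...",        # placeholder
    aws_secret_access_key="secret...",  # placeholder
    bucket_name="my-bucket",
    aws_region="us-east-1",
)

validate_aws_credentials(component)  # raises ValueError if anything is missing
s3 = create_s3_client(component)     # boto3 client scoped to the given region
```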
lfx/base/data/docling_utils.py
CHANGED
@@ -3,7 +3,7 @@ import signal
 import sys
 import traceback
 from contextlib import suppress
-from
+from functools import lru_cache

 from docling_core.types.doc import DoclingDocument
 from pydantic import BaseModel, SecretStr, TypeAdapter
@@ -12,9 +12,6 @@ from lfx.log.logger import logger
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame

-if TYPE_CHECKING:
-    from langchain_core.language_models.chat_models import BaseChatModel
-

 class DoclingDependencyError(Exception):
     """Custom exception for missing Docling dependencies."""
@@ -152,6 +149,81 @@ def _deserialize_pydantic_model(data: dict):
    return adapter.validate_python(data["config"])


+# Global cache for DocumentConverter instances
+# This cache persists across multiple runs and thread invocations
+@lru_cache(maxsize=4)
+def _get_cached_converter(
+    pipeline: str,
+    ocr_engine: str,
+    *,
+    do_picture_classification: bool,
+    pic_desc_config_hash: str | None,
+):
+    """Create and cache a DocumentConverter instance based on configuration.
+
+    This function uses LRU caching to maintain DocumentConverter instances in memory,
+    eliminating the 15-20 minute model loading time on subsequent runs.
+
+    Args:
+        pipeline: The pipeline type ("standard" or "vlm")
+        ocr_engine: The OCR engine to use
+        do_picture_classification: Whether to enable picture classification
+        pic_desc_config_hash: Hash of the picture description config (for cache key)
+
+    Returns:
+        A cached or newly created DocumentConverter instance
+    """
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
+    from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+    from docling.models.factories import get_ocr_factory
+    from docling.pipeline.vlm_pipeline import VlmPipeline
+
+    logger.info(f"Creating DocumentConverter for pipeline={pipeline}, ocr_engine={ocr_engine}")
+
+    # Configure the standard PDF pipeline
+    def _get_standard_opts() -> PdfPipelineOptions:
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+        if pipeline_options.do_ocr:
+            ocr_factory = get_ocr_factory(
+                allow_external_plugins=False,
+            )
+            ocr_options: OcrOptions = ocr_factory.create_options(
+                kind=ocr_engine,
+            )
+            pipeline_options.ocr_options = ocr_options
+
+        pipeline_options.do_picture_classification = do_picture_classification
+
+        # Note: pic_desc_config_hash is for cache key only
+        # Actual picture description is handled separately (non-cached path)
+        _ = pic_desc_config_hash  # Mark as intentionally unused
+
+        return pipeline_options
+
+    # Configure the VLM pipeline
+    def _get_vlm_opts() -> VlmPipelineOptions:
+        return VlmPipelineOptions()
+
+    if pipeline == "standard":
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=_get_standard_opts(),
+        )
+    elif pipeline == "vlm":
+        pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
+    else:
+        msg = f"Unknown pipeline: {pipeline!r}"
+        raise ValueError(msg)
+
+    format_options: dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
+    }
+
+    return DocumentConverter(format_options=format_options)
+
+
 def docling_worker(
     *,
     file_paths: list[str],
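The caching works because every argument in the `lru_cache` key is hashable (two strings, a bool, and an optional hash string); the expensive converter object itself never enters the key. A stripped-down illustration of the same pattern, with `ExpensiveConverter` standing in for docling's `DocumentConverter`:

```python
# Stripped-down illustration of the lru_cache pattern used above;
# ExpensiveConverter is a stand-in for docling's DocumentConverter.
from functools import lru_cache


class ExpensiveConverter:
    def __init__(self, pipeline: str, ocr_engine: str):
        print(f"loading models for {pipeline}/{ocr_engine} ...")  # the slow step


@lru_cache(maxsize=4)
def get_cached_converter(pipeline: str, ocr_engine: str) -> ExpensiveConverter:
    # All key arguments are hashable, so identical configs hit the cache.
    return ExpensiveConverter(pipeline, ocr_engine)


a = get_cached_converter("standard", "easyocr")  # builds (slow)
b = get_cached_converter("standard", "easyocr")  # cache hit (instant)
assert a is b
```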
@@ -162,7 +234,12 @@ def docling_worker(
    pic_desc_config: dict | None,
    pic_desc_prompt: str,
):
-    """Worker function for processing files with Docling
+    """Worker function for processing files with Docling using threading.
+
+    This function now uses a globally cached DocumentConverter instance,
+    significantly reducing processing time on subsequent runs from 15-20 minutes
+    to just seconds.
+    """
    # Signal handling for graceful shutdown
    shutdown_requested = False

@@ -205,12 +282,12 @@ def docling_worker(
    check_shutdown()

    try:
-        from docling.datamodel.base_models import ConversionStatus, InputFormat
-        from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
-        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-        from docling.models.factories import get_ocr_factory
-        from docling.pipeline.vlm_pipeline import VlmPipeline
-        from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+        from docling.datamodel.base_models import ConversionStatus, InputFormat  # noqa: F401
+        from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions  # noqa: F401
+        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption  # noqa: F401
+        from docling.models.factories import get_ocr_factory  # noqa: F401
+        from docling.pipeline.vlm_pipeline import VlmPipeline  # noqa: F401
+        from langchain_docling.picture_description import PictureDescriptionLangChainOptions  # noqa: F401

        # Check for shutdown after imports
        check_shutdown()
@@ -233,27 +310,34 @@ def docling_worker(
        queue.put({"error": "Worker interrupted during imports", "shutdown": True})
        return

-    #
-
+    # Use cached converter instead of creating new one each time
+    # This is the key optimization that eliminates 15-20 minute model load times
+    def _get_converter() -> DocumentConverter:
        check_shutdown()  # Check before heavy operations

-
-
-        if pipeline_options.do_ocr:
-            ocr_factory = get_ocr_factory(
-                allow_external_plugins=False,
-            )
-
-            ocr_options: OcrOptions = ocr_factory.create_options(
-                kind=ocr_engine,
-            )
-            pipeline_options.ocr_options = ocr_options
-
-        pipeline_options.do_picture_classification = do_picture_classification
-
+        # For now, we don't support pic_desc_config caching due to serialization complexity
+        # This is a known limitation that can be addressed in a future enhancement
        if pic_desc_config:
-
-
+            logger.warning(
+                "Picture description with LLM is not yet supported with cached converters. "
+                "Using non-cached converter for this request."
+            )
+            # Fall back to creating a new converter (old behavior)
+            from docling.datamodel.base_models import InputFormat
+            from docling.datamodel.pipeline_options import PdfPipelineOptions
+            from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+            from docling.models.factories import get_ocr_factory
+            from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+
+            pipeline_options = PdfPipelineOptions()
+            pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+            if pipeline_options.do_ocr:
+                ocr_factory = get_ocr_factory(allow_external_plugins=False)
+                ocr_options = ocr_factory.create_options(kind=ocr_engine)
+                pipeline_options.ocr_options = ocr_options
+
+            pipeline_options.do_picture_classification = do_picture_classification
+            pic_desc_llm = _deserialize_pydantic_model(pic_desc_config)
            logger.info("Docling enabling the picture description stage.")
            pipeline_options.do_picture_description = True
            pipeline_options.allow_external_plugins = True
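The hunk above falls back to an uncached converter whenever `pic_desc_config` is set, since an arbitrary dict cannot serve as an `lru_cache` key. One possible future fix, hinted at by the `pic_desc_config_hash` parameter, is a stable hash of the config; the sketch below is an assumption about such an enhancement, not code from the diff:

```python
# Hypothetical stable hash for a dict config so it could join an lru_cache
# key; not part of the diff, just one possible approach.
import hashlib
import json


def config_hash(config: dict | None) -> str | None:
    if config is None:
        return None
    # sort_keys makes the serialization order-independent.
    canonical = json.dumps(config, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode()).hexdigest()


assert config_hash({"model": "x", "temp": 0}) == config_hash({"temp": 0, "model": "x"})
```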
@@ -261,33 +345,24 @@ def docling_worker(
                llm=pic_desc_llm,
                prompt=pic_desc_prompt,
            )
-            return pipeline_options

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        raise ValueError(msg)
-
-        format_options: dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-
-        return DocumentConverter(format_options=format_options)
+            pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
+            format_options: dict[InputFormat, FormatOption] = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+            return DocumentConverter(format_options=format_options)
+
+        # Use cached converter - this is where the magic happens!
+        # First run: creates and caches converter (15-20 min)
+        # Subsequent runs: reuses cached converter (seconds)
+        pic_desc_config_hash = None  # Will be None since we checked above
+        return _get_cached_converter(
+            pipeline=pipeline,
+            ocr_engine=ocr_engine,
+            do_picture_classification=do_picture_classification,
+            pic_desc_config_hash=pic_desc_config_hash,
+        )

    try:
        # Check for shutdown before creating converter (can be slow)