lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +21 -4
- lfx/base/agents/altk_base_agent.py +393 -0
- lfx/base/agents/altk_tool_wrappers.py +565 -0
- lfx/base/agents/events.py +2 -1
- lfx/base/composio/composio_base.py +159 -224
- lfx/base/data/base_file.py +97 -20
- lfx/base/data/docling_utils.py +61 -10
- lfx/base/data/storage_utils.py +301 -0
- lfx/base/data/utils.py +178 -14
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/base/models/groq_constants.py +74 -58
- lfx/base/models/groq_model_discovery.py +265 -0
- lfx/base/models/model.py +1 -1
- lfx/base/models/model_utils.py +100 -0
- lfx/base/models/openai_constants.py +7 -0
- lfx/base/models/watsonx_constants.py +32 -8
- lfx/base/tools/run_flow.py +601 -129
- lfx/cli/commands.py +9 -4
- lfx/cli/common.py +2 -2
- lfx/cli/run.py +1 -1
- lfx/cli/script_loader.py +53 -11
- lfx/components/Notion/create_page.py +1 -1
- lfx/components/Notion/list_database_properties.py +1 -1
- lfx/components/Notion/list_pages.py +1 -1
- lfx/components/Notion/list_users.py +1 -1
- lfx/components/Notion/page_content_viewer.py +1 -1
- lfx/components/Notion/search.py +1 -1
- lfx/components/Notion/update_page_property.py +1 -1
- lfx/components/__init__.py +19 -5
- lfx/components/{agents → altk}/__init__.py +5 -9
- lfx/components/altk/altk_agent.py +193 -0
- lfx/components/apify/apify_actor.py +1 -1
- lfx/components/composio/__init__.py +70 -18
- lfx/components/composio/apollo_composio.py +11 -0
- lfx/components/composio/bitbucket_composio.py +11 -0
- lfx/components/composio/canva_composio.py +11 -0
- lfx/components/composio/coda_composio.py +11 -0
- lfx/components/composio/composio_api.py +10 -0
- lfx/components/composio/discord_composio.py +1 -1
- lfx/components/composio/elevenlabs_composio.py +11 -0
- lfx/components/composio/exa_composio.py +11 -0
- lfx/components/composio/firecrawl_composio.py +11 -0
- lfx/components/composio/fireflies_composio.py +11 -0
- lfx/components/composio/gmail_composio.py +1 -1
- lfx/components/composio/googlebigquery_composio.py +11 -0
- lfx/components/composio/googlecalendar_composio.py +1 -1
- lfx/components/composio/googledocs_composio.py +1 -1
- lfx/components/composio/googlemeet_composio.py +1 -1
- lfx/components/composio/googlesheets_composio.py +1 -1
- lfx/components/composio/googletasks_composio.py +1 -1
- lfx/components/composio/heygen_composio.py +11 -0
- lfx/components/composio/mem0_composio.py +11 -0
- lfx/components/composio/peopledatalabs_composio.py +11 -0
- lfx/components/composio/perplexityai_composio.py +11 -0
- lfx/components/composio/serpapi_composio.py +11 -0
- lfx/components/composio/slack_composio.py +3 -574
- lfx/components/composio/slackbot_composio.py +1 -1
- lfx/components/composio/snowflake_composio.py +11 -0
- lfx/components/composio/tavily_composio.py +11 -0
- lfx/components/composio/youtube_composio.py +2 -2
- lfx/components/cuga/__init__.py +34 -0
- lfx/components/cuga/cuga_agent.py +730 -0
- lfx/components/data/__init__.py +78 -28
- lfx/components/data_source/__init__.py +58 -0
- lfx/components/{data → data_source}/api_request.py +26 -3
- lfx/components/{data → data_source}/csv_to_data.py +15 -10
- lfx/components/{data → data_source}/json_to_data.py +15 -8
- lfx/components/{data → data_source}/news_search.py +1 -1
- lfx/components/{data → data_source}/rss.py +1 -1
- lfx/components/{data → data_source}/sql_executor.py +1 -1
- lfx/components/{data → data_source}/url.py +1 -1
- lfx/components/{data → data_source}/web_search.py +1 -1
- lfx/components/datastax/astradb_cql.py +1 -1
- lfx/components/datastax/astradb_graph.py +1 -1
- lfx/components/datastax/astradb_tool.py +1 -1
- lfx/components/datastax/astradb_vectorstore.py +1 -1
- lfx/components/datastax/hcd.py +1 -1
- lfx/components/deactivated/json_document_builder.py +1 -1
- lfx/components/docling/__init__.py +0 -3
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/elastic/elasticsearch.py +1 -1
- lfx/components/files_and_knowledge/__init__.py +47 -0
- lfx/components/{data → files_and_knowledge}/directory.py +1 -1
- lfx/components/{data → files_and_knowledge}/file.py +304 -24
- lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
- lfx/components/{data → files_and_knowledge}/save_file.py +218 -31
- lfx/components/flow_controls/__init__.py +58 -0
- lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
- lfx/components/{logic → flow_controls}/loop.py +43 -9
- lfx/components/flow_controls/run_flow.py +108 -0
- lfx/components/glean/glean_search_api.py +1 -1
- lfx/components/groq/groq.py +35 -28
- lfx/components/helpers/__init__.py +102 -0
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/__init__.py +3 -1
- lfx/components/input_output/chat.py +4 -3
- lfx/components/input_output/chat_output.py +10 -4
- lfx/components/input_output/text.py +1 -1
- lfx/components/input_output/text_output.py +1 -1
- lfx/components/{data → input_output}/webhook.py +1 -1
- lfx/components/knowledge_bases/__init__.py +59 -4
- lfx/components/langchain_utilities/character.py +1 -1
- lfx/components/langchain_utilities/csv_agent.py +84 -16
- lfx/components/langchain_utilities/json_agent.py +67 -12
- lfx/components/langchain_utilities/language_recursive.py +1 -1
- lfx/components/llm_operations/__init__.py +46 -0
- lfx/components/{processing → llm_operations}/batch_run.py +17 -8
- lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
- lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
- lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
- lfx/components/{processing → llm_operations}/structured_output.py +1 -1
- lfx/components/logic/__init__.py +126 -0
- lfx/components/mem0/mem0_chat_memory.py +11 -0
- lfx/components/models/__init__.py +64 -9
- lfx/components/models_and_agents/__init__.py +49 -0
- lfx/components/{agents → models_and_agents}/agent.py +6 -4
- lfx/components/models_and_agents/embedding_model.py +353 -0
- lfx/components/models_and_agents/language_model.py +398 -0
- lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
- lfx/components/{helpers → models_and_agents}/memory.py +1 -1
- lfx/components/nvidia/system_assist.py +1 -1
- lfx/components/olivya/olivya.py +1 -1
- lfx/components/ollama/ollama.py +24 -5
- lfx/components/processing/__init__.py +9 -60
- lfx/components/processing/converter.py +1 -1
- lfx/components/processing/dataframe_operations.py +1 -1
- lfx/components/processing/parse_json_data.py +2 -2
- lfx/components/processing/parser.py +1 -1
- lfx/components/processing/split_text.py +1 -1
- lfx/components/qdrant/qdrant.py +1 -1
- lfx/components/redis/redis.py +1 -1
- lfx/components/twelvelabs/split_video.py +10 -0
- lfx/components/twelvelabs/video_file.py +12 -0
- lfx/components/utilities/__init__.py +43 -0
- lfx/components/{helpers → utilities}/calculator_core.py +1 -1
- lfx/components/{helpers → utilities}/current_date.py +1 -1
- lfx/components/{processing → utilities}/python_repl_core.py +1 -1
- lfx/components/vectorstores/local_db.py +9 -0
- lfx/components/youtube/youtube_transcripts.py +118 -30
- lfx/custom/custom_component/component.py +57 -1
- lfx/custom/custom_component/custom_component.py +68 -6
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/edge/base.py +43 -20
- lfx/graph/state/model.py +15 -2
- lfx/graph/utils.py +6 -0
- lfx/graph/vertex/param_handler.py +10 -7
- lfx/helpers/__init__.py +12 -0
- lfx/helpers/flow.py +117 -0
- lfx/inputs/input_mixin.py +24 -1
- lfx/inputs/inputs.py +13 -1
- lfx/interface/components.py +161 -83
- lfx/log/logger.py +5 -3
- lfx/schema/image.py +2 -12
- lfx/services/database/__init__.py +5 -0
- lfx/services/database/service.py +25 -0
- lfx/services/deps.py +87 -22
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +24 -10
- lfx/services/mcp_composer/service.py +1029 -162
- lfx/services/session.py +5 -0
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +56 -30
- lfx/services/settings/constants.py +8 -0
- lfx/services/storage/local.py +108 -46
- lfx/services/storage/service.py +171 -29
- lfx/template/field/base.py +3 -0
- lfx/utils/image.py +29 -11
- lfx/utils/ssrf_protection.py +384 -0
- lfx/utils/validate_cloud.py +26 -0
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +38 -22
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +189 -160
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +1 -1
- lfx/components/agents/altk_agent.py +0 -366
- lfx/components/agents/cuga_agent.py +0 -1013
- lfx/components/docling/docling_remote_vlm.py +0 -284
- lfx/components/logic/run_flow.py +0 -71
- lfx/components/models/embedding_model.py +0 -195
- lfx/components/models/language_model.py +0 -144
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- /lfx/components/{data → data_source}/mock_data.py +0 -0
- /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
- /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
- /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
- /lfx/components/{logic → flow_controls}/listen.py +0 -0
- /lfx/components/{logic → flow_controls}/notify.py +0 -0
- /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
- /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
- /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
- /lfx/components/{helpers → processing}/create_list.py +0 -0
- /lfx/components/{helpers → processing}/output_parser.py +0 -0
- /lfx/components/{helpers → processing}/store_message.py +0 -0
- /lfx/components/{helpers → utilities}/id_generator.py +0 -0
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/components/{data → files_and_knowledge}/file.py

@@ -10,34 +10,44 @@ Notes:
 
 from __future__ import annotations
 
+import contextlib
 import json
 import subprocess
 import sys
 import textwrap
 from copy import deepcopy
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Any
 
 from lfx.base.data.base_file import BaseFileComponent
+from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
 from lfx.io import BoolInput, FileInput, IntInput, Output
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
+from lfx.services.deps import get_settings_service, get_storage_service
+from lfx.utils.async_helpers import run_until_complete
 
 
 class FileComponent(BaseFileComponent):
     """File component with optional Docling processing (isolated in a subprocess)."""
 
     display_name = "Read File"
-    description
-
+    # description is now a dynamic property - see get_tool_description()
+    _base_description = "Loads content from one or more files."
+    documentation: str = "https://docs.langflow.org/read-file"
     icon = "file-text"
     name = "File"
+    add_tool_output = True  # Enable tool mode toggle without requiring tool_mode inputs
 
-    #
-
-
+    # Extensions that can be processed without Docling (using standard text parsing)
+    TEXT_EXTENSIONS = TEXT_FILE_TYPES
+
+    # Extensions that require Docling for processing (images, advanced office formats, etc.)
+    DOCLING_ONLY_EXTENSIONS = [
         "adoc",
         "asciidoc",
         "asc",
@@ -61,6 +71,12 @@ class FileComponent(BaseFileComponent):
         "webp",
     ]
 
+    # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.
+    VALID_EXTENSIONS = [
+        *TEXT_EXTENSIONS,
+        *DOCLING_ONLY_EXTENSIONS,
+    ]
+
     # Fixed export settings used when markdown export is requested.
     EXPORT_FORMAT = "Markdown"
     IMAGE_MODE = "placeholder"

@@ -70,10 +86,24 @@ class FileComponent(BaseFileComponent):
     for input_item in _base_inputs:
         if isinstance(input_item, FileInput) and input_item.name == "path":
             input_item.real_time_refresh = True
+            input_item.tool_mode = False  # Disable tool mode for file upload input
+            input_item.required = False  # Make it optional so it doesn't error in tool mode
             break
 
     inputs = [
         *_base_inputs,
+        StrInput(
+            name="file_path_str",
+            display_name="File Path",
+            info=(
+                "Path to the file to read. Used when component is called as a tool. "
+                "If not provided, will use the uploaded file from 'path' input."
+            ),
+            show=False,
+            advanced=True,
+            tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
+            required=False,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",

@@ -152,9 +182,87 @@ class FileComponent(BaseFileComponent):
     ]
 
     outputs = [
-        Output(display_name="Raw Content", name="message", method="load_files_message"),
+        Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
     ]
 
+    # ------------------------------ Tool description with file names --------------
+
+    def get_tool_description(self) -> str:
+        """Return a dynamic description that includes the names of uploaded files.
+
+        This helps the Agent understand which files are available to read.
+        """
+        base_description = "Loads and returns the content from uploaded files."
+
+        # Get the list of uploaded file paths
+        file_paths = getattr(self, "path", None)
+        if not file_paths:
+            return base_description
+
+        # Ensure it's a list
+        if not isinstance(file_paths, list):
+            file_paths = [file_paths]
+
+        # Extract just the file names from the paths
+        file_names = []
+        for fp in file_paths:
+            if fp:
+                name = Path(fp).name
+                file_names.append(name)
+
+        if file_names:
+            files_str = ", ".join(file_names)
+            return f"{base_description} Available files: {files_str}. Call this tool to read these files."
+
+        return base_description
+
+    @property
+    def description(self) -> str:
+        """Dynamic description property that includes uploaded file names."""
+        return self.get_tool_description()
+
+    async def _get_tools(self) -> list:
+        """Override to create a tool without parameters.
+
+        The Read File component should use the files already uploaded via UI,
+        not accept file paths from the Agent (which wouldn't know the internal paths).
+        """
+        from langchain_core.tools import StructuredTool
+        from pydantic import BaseModel
+
+        # Empty schema - no parameters needed
+        class EmptySchema(BaseModel):
+            """No parameters required - uses pre-uploaded files."""
+
+        async def read_files_tool() -> str:
+            """Read the content of uploaded files."""
+            try:
+                result = self.load_files_message()
+                if hasattr(result, "get_text"):
+                    return result.get_text()
+                if hasattr(result, "text"):
+                    return result.text
+                return str(result)
+            except (FileNotFoundError, ValueError, OSError, RuntimeError) as e:
+                return f"Error reading files: {e}"
+
+        description = self.get_tool_description()
+
+        tool = StructuredTool(
+            name="load_files_message",
+            description=description,
+            coroutine=read_files_tool,
+            args_schema=EmptySchema,
+            handle_tool_error=True,
+            tags=["load_files_message"],
+            metadata={
+                "display_name": "Read File",
+                "display_description": description,
+            },
+        )
+
+        return [tool]
+
     # ------------------------------ UI helpers --------------------------------------
 
     def _path_value(self, template: dict) -> list[str]:
@@ -213,39 +321,84 @@ class FileComponent(BaseFileComponent):
             file_path = paths[0] if field_name == "path" else frontend_node["template"]["path"]["file_path"][0]
             if file_path.endswith((".csv", ".xlsx", ".parquet")):
                 frontend_node["outputs"].append(
-                    Output(
+                    Output(
+                        display_name="Structured Content",
+                        name="dataframe",
+                        method="load_files_structured",
+                        tool_mode=True,
+                    ),
                 )
             elif file_path.endswith(".json"):
                 frontend_node["outputs"].append(
-                    Output(display_name="Structured Content", name="json", method="load_files_json"),
+                    Output(display_name="Structured Content", name="json", method="load_files_json", tool_mode=True),
                 )
 
             advanced_mode = frontend_node.get("template", {}).get("advanced_mode", {}).get("value", False)
             if advanced_mode:
                 frontend_node["outputs"].append(
-                    Output(
+                    Output(
+                        display_name="Structured Output",
+                        name="advanced_dataframe",
+                        method="load_files_dataframe",
+                        tool_mode=True,
+                    ),
                 )
                 frontend_node["outputs"].append(
-                    Output(
+                    Output(
+                        display_name="Markdown", name="advanced_markdown", method="load_files_markdown", tool_mode=True
+                    ),
                 )
                 frontend_node["outputs"].append(
-                    Output(display_name="File Path", name="path", method="load_files_path"),
+                    Output(display_name="File Path", name="path", method="load_files_path", tool_mode=True),
                 )
             else:
                 frontend_node["outputs"].append(
-                    Output(display_name="Raw Content", name="message", method="load_files_message"),
+                    Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
                 )
                 frontend_node["outputs"].append(
-                    Output(display_name="File Path", name="path", method="load_files_path"),
+                    Output(display_name="File Path", name="path", method="load_files_path", tool_mode=True),
                 )
         else:
             # Multiple files => DataFrame output; advanced parser disabled
-            frontend_node["outputs"].append(
+            frontend_node["outputs"].append(
+                Output(display_name="Files", name="dataframe", method="load_files", tool_mode=True)
+            )
 
         return frontend_node
 
     # ------------------------------ Core processing ----------------------------------
 
+    def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
+        """Override to handle file_path_str input from tool mode.
+
+        When called as a tool, the file_path_str parameter can be set.
+        If not provided, it will fall back to using the path FileInput (uploaded file).
+        Priority:
+        1. file_path_str (if provided by the tool call)
+        2. path (uploaded file from UI)
+        """
+        # Check if file_path_str is provided (from tool mode)
+        file_path_str = getattr(self, "file_path_str", None)
+        if file_path_str:
+            # Use the string path from tool mode
+            from pathlib import Path
+
+            from lfx.schema.data import Data
+
+            resolved_path = Path(self.resolve_path(file_path_str))
+            if not resolved_path.exists():
+                msg = f"File or directory not found: {file_path_str}"
+                self.log(msg)
+                if not self.silent_errors:
+                    raise ValueError(msg)
+                return []
+
+            data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(resolved_path)})
+            return [BaseFileComponent.BaseFile(data_obj, resolved_path, delete_after_processing=False)]
+
+        # Otherwise use the default implementation (uses path FileInput)
+        return super()._validate_and_resolve_paths()
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (

@@ -282,17 +435,76 @@ class FileComponent(BaseFileComponent):
         )
         return file_path.lower().endswith(docling_exts)
 
+    async def _get_local_file_for_docling(self, file_path: str) -> tuple[str, bool]:
+        """Get a local file path for Docling processing, downloading from S3 if needed.
+
+        Args:
+            file_path: Either a local path or S3 key (format "flow_id/filename")
+
+        Returns:
+            tuple[str, bool]: (local_path, should_delete) where should_delete indicates
+                if this is a temporary file that should be cleaned up
+        """
+        settings = get_settings_service().settings
+        if settings.storage_type == "local":
+            return file_path, False
+
+        # S3 storage - download to temp file
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        storage_service = get_storage_service()
+        flow_id, filename = parsed
+
+        # Get file content from S3
+        content = await storage_service.get_file(flow_id, filename)
+
+        suffix = Path(filename).suffix
+        with NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp_file:
+            tmp_file.write(content)
+            temp_path = tmp_file.name
+
+        return temp_path, True
+
     def _process_docling_in_subprocess(self, file_path: str) -> Data | None:
         """Run Docling in a separate OS process and map the result to a Data object.
 
         We avoid multiprocessing pickling by launching `python -c "<script>"` and
         passing JSON config via stdin. The child prints a JSON result to stdout.
+
+        For S3 storage, the file is downloaded to a temp file first.
         """
         if not file_path:
             return None
 
+        settings = get_settings_service().settings
+        if settings.storage_type == "s3":
+            local_path, should_delete = run_until_complete(self._get_local_file_for_docling(file_path))
+        else:
+            local_path = file_path
+            should_delete = False
+
+        try:
+            return self._process_docling_subprocess_impl(local_path, file_path)
+        finally:
+            # Clean up temp file if we created one
+            if should_delete:
+                with contextlib.suppress(Exception):
+                    Path(local_path).unlink()  # Ignore cleanup errors
+
+    def _process_docling_subprocess_impl(self, local_file_path: str, original_file_path: str) -> Data | None:
+        """Implementation of Docling subprocess processing.
+
+        Args:
+            local_file_path: Path to local file to process
+            original_file_path: Original file path to include in metadata
+        Returns:
+            Data object with processed content
+        """
         args: dict[str, Any] = {
-            "file_path":
+            "file_path": local_file_path,
             "markdown": bool(self.markdown),
             "image_mode": str(self.IMAGE_MODE),
             "md_image_placeholder": str(self.md_image_placeholder),
@@ -303,7 +515,7 @@ class FileComponent(BaseFileComponent):
             ),
         }
 
-        self.log(f"Starting Docling subprocess for file: {
+        self.log(f"Starting Docling subprocess for file: {local_file_path}")
         self.log(args)
 
         # Child script for isolating the docling processing

@@ -496,14 +708,17 @@ class FileComponent(BaseFileComponent):
 
         if not proc.stdout:
             err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
-            return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path":
+            return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})
 
         try:
            result = json.loads(proc.stdout.decode("utf-8"))
        except Exception as e:  # noqa: BLE001
            err_msg = proc.stderr.decode("utf-8", errors="replace")
            return Data(
-                data={
+                data={
+                    "error": f"Invalid JSON from Docling subprocess: {e}. stderr={err_msg}",
+                    "file_path": original_file_path,
+                },
            )
 
        if not result.get("ok"):

@@ -533,6 +748,39 @@ class FileComponent(BaseFileComponent):
             msg = "No files to process."
             raise ValueError(msg)
 
+        # Validate image files to detect content/extension mismatches
+        # This prevents API errors like "Image does not match the provided media type"
+        image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        for file in file_list:
+            extension = file.path.suffix[1:].lower()
+            if extension in image_extensions:
+                # file.path is already resolved, read bytes directly
+                try:
+                    content = file.path.read_bytes()
+                    is_valid, error_msg = validate_image_content_type(
+                        str(file.path),
+                        content=content,
+                    )
+                    if not is_valid:
+                        self.log(error_msg)
+                        if not self.silent_errors:
+                            raise ValueError(error_msg)
+                except OSError as e:
+                    self.log(f"Could not read file for validation: {e}")
+                    # Continue - let it fail later with better error
+
+        # Validate that files requiring Docling are only processed when advanced mode is enabled
+        if not self.advanced_mode:
+            for file in file_list:
+                extension = file.path.suffix[1:].lower()
+                if extension in self.DOCLING_ONLY_EXTENSIONS:
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                    )
+                    self.log(msg)
+                    raise ValueError(msg)
+
         def process_file_standard(file_path: str, *, silent_errors: bool = False) -> Data | None:
             try:
                 return parse_text_file_to_data(file_path, silent_errors=silent_errors)

@@ -559,7 +807,8 @@ class FileComponent(BaseFileComponent):
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
             doc_rows = payload.get("doc")
-            if isinstance(doc_rows, list):
+            if isinstance(doc_rows, list) and doc_rows:
+                # Non-empty list of structured rows
                 rows: list[Data | None] = [
                     Data(
                         data={

@@ -570,6 +819,19 @@ class FileComponent(BaseFileComponent):
                     for item in doc_rows
                 ]
                 final_return.extend(self.rollup_data(file_list, rows))
+            elif isinstance(doc_rows, list) and not doc_rows:
+                # Empty list - file was processed but no text content found
+                # Create a Data object indicating no content was extracted
+                self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                empty_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "text": "(No text content extracted from image)",
+                        "info": "Image processed successfully but contained no extractable text",
+                        **{k: v for k, v in payload.items() if k != "doc"},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
                 final_return.extend(self.rollup_data(file_list, [advanced_data]))

@@ -577,6 +839,7 @@ class FileComponent(BaseFileComponent):
 
         # Standard multi-file (or single non-advanced) path
         concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)
+
         file_paths = [str(f.path) for f in file_list]
         self.log(f"Starting parallel processing of {len(file_paths)} files with concurrency: {concurrency}.")
         my_data = parallel_load_data(

@@ -592,13 +855,17 @@ class FileComponent(BaseFileComponent):
     def load_files_helper(self) -> DataFrame:
         result = self.load_files()
 
-        #
-        if
-        if hasattr(result, "error"):
-            raise ValueError(result.error[0])
+        # Result is a DataFrame - check if it has any rows
+        if result.empty:
             msg = "Could not extract content from the provided file(s)."
             raise ValueError(msg)
 
+        # Check for error column with error messages
+        if "error" in result.columns:
+            errors = result["error"].dropna().tolist()
+            if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                raise ValueError(errors[0])
+
         return result
 
     def load_files_dataframe(self) -> DataFrame:
@@ -610,4 +877,17 @@ class FileComponent(BaseFileComponent):
         """Load files using advanced Docling processing and export to Markdown format."""
         self.markdown = True
         result = self.load_files_helper()
-
+
+        # Result is a DataFrame - check for text or exported_content columns
+        if "text" in result.columns and not result["text"].isna().all():
+            text_values = result["text"].dropna().tolist()
+            if text_values:
+                return Message(text=str(text_values[0]))
+
+        if "exported_content" in result.columns and not result["exported_content"].isna().all():
+            content_values = result["exported_content"].dropna().tolist()
+            if content_values:
+                return Message(text=str(content_values[0]))
+
+        # Return empty message with info that no text was found
+        return Message(text="(No text content extracted from file)")
lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py

@@ -235,8 +235,8 @@ class KnowledgeRetrievalComponent(Component):
 
         # Only proceed if we have valid document IDs
         if doc_ids:
-            # Access underlying
-            collection = chroma.
+            # Access underlying collection to get embeddings
+            collection = chroma._collection  # noqa: SLF001
             embeddings_result = collection.get(where={"_id": {"$in": doc_ids}}, include=["metadatas", "embeddings"])
 
             # Create a mapping from document ID to embedding