lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +21 -4
  3. lfx/base/agents/altk_base_agent.py +393 -0
  4. lfx/base/agents/altk_tool_wrappers.py +565 -0
  5. lfx/base/agents/events.py +2 -1
  6. lfx/base/composio/composio_base.py +159 -224
  7. lfx/base/data/base_file.py +97 -20
  8. lfx/base/data/docling_utils.py +61 -10
  9. lfx/base/data/storage_utils.py +301 -0
  10. lfx/base/data/utils.py +178 -14
  11. lfx/base/mcp/util.py +2 -2
  12. lfx/base/models/anthropic_constants.py +21 -12
  13. lfx/base/models/groq_constants.py +74 -58
  14. lfx/base/models/groq_model_discovery.py +265 -0
  15. lfx/base/models/model.py +1 -1
  16. lfx/base/models/model_utils.py +100 -0
  17. lfx/base/models/openai_constants.py +7 -0
  18. lfx/base/models/watsonx_constants.py +32 -8
  19. lfx/base/tools/run_flow.py +601 -129
  20. lfx/cli/commands.py +9 -4
  21. lfx/cli/common.py +2 -2
  22. lfx/cli/run.py +1 -1
  23. lfx/cli/script_loader.py +53 -11
  24. lfx/components/Notion/create_page.py +1 -1
  25. lfx/components/Notion/list_database_properties.py +1 -1
  26. lfx/components/Notion/list_pages.py +1 -1
  27. lfx/components/Notion/list_users.py +1 -1
  28. lfx/components/Notion/page_content_viewer.py +1 -1
  29. lfx/components/Notion/search.py +1 -1
  30. lfx/components/Notion/update_page_property.py +1 -1
  31. lfx/components/__init__.py +19 -5
  32. lfx/components/{agents → altk}/__init__.py +5 -9
  33. lfx/components/altk/altk_agent.py +193 -0
  34. lfx/components/apify/apify_actor.py +1 -1
  35. lfx/components/composio/__init__.py +70 -18
  36. lfx/components/composio/apollo_composio.py +11 -0
  37. lfx/components/composio/bitbucket_composio.py +11 -0
  38. lfx/components/composio/canva_composio.py +11 -0
  39. lfx/components/composio/coda_composio.py +11 -0
  40. lfx/components/composio/composio_api.py +10 -0
  41. lfx/components/composio/discord_composio.py +1 -1
  42. lfx/components/composio/elevenlabs_composio.py +11 -0
  43. lfx/components/composio/exa_composio.py +11 -0
  44. lfx/components/composio/firecrawl_composio.py +11 -0
  45. lfx/components/composio/fireflies_composio.py +11 -0
  46. lfx/components/composio/gmail_composio.py +1 -1
  47. lfx/components/composio/googlebigquery_composio.py +11 -0
  48. lfx/components/composio/googlecalendar_composio.py +1 -1
  49. lfx/components/composio/googledocs_composio.py +1 -1
  50. lfx/components/composio/googlemeet_composio.py +1 -1
  51. lfx/components/composio/googlesheets_composio.py +1 -1
  52. lfx/components/composio/googletasks_composio.py +1 -1
  53. lfx/components/composio/heygen_composio.py +11 -0
  54. lfx/components/composio/mem0_composio.py +11 -0
  55. lfx/components/composio/peopledatalabs_composio.py +11 -0
  56. lfx/components/composio/perplexityai_composio.py +11 -0
  57. lfx/components/composio/serpapi_composio.py +11 -0
  58. lfx/components/composio/slack_composio.py +3 -574
  59. lfx/components/composio/slackbot_composio.py +1 -1
  60. lfx/components/composio/snowflake_composio.py +11 -0
  61. lfx/components/composio/tavily_composio.py +11 -0
  62. lfx/components/composio/youtube_composio.py +2 -2
  63. lfx/components/cuga/__init__.py +34 -0
  64. lfx/components/cuga/cuga_agent.py +730 -0
  65. lfx/components/data/__init__.py +78 -28
  66. lfx/components/data_source/__init__.py +58 -0
  67. lfx/components/{data → data_source}/api_request.py +26 -3
  68. lfx/components/{data → data_source}/csv_to_data.py +15 -10
  69. lfx/components/{data → data_source}/json_to_data.py +15 -8
  70. lfx/components/{data → data_source}/news_search.py +1 -1
  71. lfx/components/{data → data_source}/rss.py +1 -1
  72. lfx/components/{data → data_source}/sql_executor.py +1 -1
  73. lfx/components/{data → data_source}/url.py +1 -1
  74. lfx/components/{data → data_source}/web_search.py +1 -1
  75. lfx/components/datastax/astradb_cql.py +1 -1
  76. lfx/components/datastax/astradb_graph.py +1 -1
  77. lfx/components/datastax/astradb_tool.py +1 -1
  78. lfx/components/datastax/astradb_vectorstore.py +1 -1
  79. lfx/components/datastax/hcd.py +1 -1
  80. lfx/components/deactivated/json_document_builder.py +1 -1
  81. lfx/components/docling/__init__.py +0 -3
  82. lfx/components/docling/chunk_docling_document.py +3 -1
  83. lfx/components/docling/export_docling_document.py +3 -1
  84. lfx/components/elastic/elasticsearch.py +1 -1
  85. lfx/components/files_and_knowledge/__init__.py +47 -0
  86. lfx/components/{data → files_and_knowledge}/directory.py +1 -1
  87. lfx/components/{data → files_and_knowledge}/file.py +304 -24
  88. lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
  89. lfx/components/{data → files_and_knowledge}/save_file.py +218 -31
  90. lfx/components/flow_controls/__init__.py +58 -0
  91. lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
  92. lfx/components/{logic → flow_controls}/loop.py +43 -9
  93. lfx/components/flow_controls/run_flow.py +108 -0
  94. lfx/components/glean/glean_search_api.py +1 -1
  95. lfx/components/groq/groq.py +35 -28
  96. lfx/components/helpers/__init__.py +102 -0
  97. lfx/components/ibm/watsonx.py +7 -1
  98. lfx/components/input_output/__init__.py +3 -1
  99. lfx/components/input_output/chat.py +4 -3
  100. lfx/components/input_output/chat_output.py +10 -4
  101. lfx/components/input_output/text.py +1 -1
  102. lfx/components/input_output/text_output.py +1 -1
  103. lfx/components/{data → input_output}/webhook.py +1 -1
  104. lfx/components/knowledge_bases/__init__.py +59 -4
  105. lfx/components/langchain_utilities/character.py +1 -1
  106. lfx/components/langchain_utilities/csv_agent.py +84 -16
  107. lfx/components/langchain_utilities/json_agent.py +67 -12
  108. lfx/components/langchain_utilities/language_recursive.py +1 -1
  109. lfx/components/llm_operations/__init__.py +46 -0
  110. lfx/components/{processing → llm_operations}/batch_run.py +17 -8
  111. lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
  112. lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
  113. lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
  114. lfx/components/{processing → llm_operations}/structured_output.py +1 -1
  115. lfx/components/logic/__init__.py +126 -0
  116. lfx/components/mem0/mem0_chat_memory.py +11 -0
  117. lfx/components/models/__init__.py +64 -9
  118. lfx/components/models_and_agents/__init__.py +49 -0
  119. lfx/components/{agents → models_and_agents}/agent.py +6 -4
  120. lfx/components/models_and_agents/embedding_model.py +353 -0
  121. lfx/components/models_and_agents/language_model.py +398 -0
  122. lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
  123. lfx/components/{helpers → models_and_agents}/memory.py +1 -1
  124. lfx/components/nvidia/system_assist.py +1 -1
  125. lfx/components/olivya/olivya.py +1 -1
  126. lfx/components/ollama/ollama.py +24 -5
  127. lfx/components/processing/__init__.py +9 -60
  128. lfx/components/processing/converter.py +1 -1
  129. lfx/components/processing/dataframe_operations.py +1 -1
  130. lfx/components/processing/parse_json_data.py +2 -2
  131. lfx/components/processing/parser.py +1 -1
  132. lfx/components/processing/split_text.py +1 -1
  133. lfx/components/qdrant/qdrant.py +1 -1
  134. lfx/components/redis/redis.py +1 -1
  135. lfx/components/twelvelabs/split_video.py +10 -0
  136. lfx/components/twelvelabs/video_file.py +12 -0
  137. lfx/components/utilities/__init__.py +43 -0
  138. lfx/components/{helpers → utilities}/calculator_core.py +1 -1
  139. lfx/components/{helpers → utilities}/current_date.py +1 -1
  140. lfx/components/{processing → utilities}/python_repl_core.py +1 -1
  141. lfx/components/vectorstores/local_db.py +9 -0
  142. lfx/components/youtube/youtube_transcripts.py +118 -30
  143. lfx/custom/custom_component/component.py +57 -1
  144. lfx/custom/custom_component/custom_component.py +68 -6
  145. lfx/custom/directory_reader/directory_reader.py +5 -2
  146. lfx/graph/edge/base.py +43 -20
  147. lfx/graph/state/model.py +15 -2
  148. lfx/graph/utils.py +6 -0
  149. lfx/graph/vertex/param_handler.py +10 -7
  150. lfx/helpers/__init__.py +12 -0
  151. lfx/helpers/flow.py +117 -0
  152. lfx/inputs/input_mixin.py +24 -1
  153. lfx/inputs/inputs.py +13 -1
  154. lfx/interface/components.py +161 -83
  155. lfx/log/logger.py +5 -3
  156. lfx/schema/image.py +2 -12
  157. lfx/services/database/__init__.py +5 -0
  158. lfx/services/database/service.py +25 -0
  159. lfx/services/deps.py +87 -22
  160. lfx/services/interfaces.py +5 -0
  161. lfx/services/manager.py +24 -10
  162. lfx/services/mcp_composer/service.py +1029 -162
  163. lfx/services/session.py +5 -0
  164. lfx/services/settings/auth.py +18 -11
  165. lfx/services/settings/base.py +56 -30
  166. lfx/services/settings/constants.py +8 -0
  167. lfx/services/storage/local.py +108 -46
  168. lfx/services/storage/service.py +171 -29
  169. lfx/template/field/base.py +3 -0
  170. lfx/utils/image.py +29 -11
  171. lfx/utils/ssrf_protection.py +384 -0
  172. lfx/utils/validate_cloud.py +26 -0
  173. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +38 -22
  174. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +189 -160
  175. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +1 -1
  176. lfx/components/agents/altk_agent.py +0 -366
  177. lfx/components/agents/cuga_agent.py +0 -1013
  178. lfx/components/docling/docling_remote_vlm.py +0 -284
  179. lfx/components/logic/run_flow.py +0 -71
  180. lfx/components/models/embedding_model.py +0 -195
  181. lfx/components/models/language_model.py +0 -144
  182. lfx/components/processing/dataframe_to_toolset.py +0 -259
  183. /lfx/components/{data → data_source}/mock_data.py +0 -0
  184. /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
  185. /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
  186. /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
  187. /lfx/components/{logic → flow_controls}/listen.py +0 -0
  188. /lfx/components/{logic → flow_controls}/notify.py +0 -0
  189. /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
  190. /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
  191. /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
  192. /lfx/components/{helpers → processing}/create_list.py +0 -0
  193. /lfx/components/{helpers → processing}/output_parser.py +0 -0
  194. /lfx/components/{helpers → processing}/store_message.py +0 -0
  195. /lfx/components/{helpers → utilities}/id_generator.py +0 -0
  196. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/components/{data → files_and_knowledge}/file.py
@@ -10,34 +10,44 @@ Notes:
 
 from __future__ import annotations
 
+import contextlib
 import json
 import subprocess
 import sys
 import textwrap
 from copy import deepcopy
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Any
 
 from lfx.base.data.base_file import BaseFileComponent
+from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
 from lfx.io import BoolInput, FileInput, IntInput, Output
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
+from lfx.services.deps import get_settings_service, get_storage_service
+from lfx.utils.async_helpers import run_until_complete
 
 
 class FileComponent(BaseFileComponent):
     """File component with optional Docling processing (isolated in a subprocess)."""
 
     display_name = "Read File"
-    description = "Loads content from one or more files."
-    documentation: str = "https://docs.langflow.org/components-data#file"
+    # description is now a dynamic property - see get_tool_description()
+    _base_description = "Loads content from one or more files."
+    documentation: str = "https://docs.langflow.org/read-file"
     icon = "file-text"
     name = "File"
+    add_tool_output = True  # Enable tool mode toggle without requiring tool_mode inputs
 
-    # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.
-    VALID_EXTENSIONS = [
-        *TEXT_FILE_TYPES,
+    # Extensions that can be processed without Docling (using standard text parsing)
+    TEXT_EXTENSIONS = TEXT_FILE_TYPES
+
+    # Extensions that require Docling for processing (images, advanced office formats, etc.)
+    DOCLING_ONLY_EXTENSIONS = [
         "adoc",
         "asciidoc",
         "asc",
@@ -61,6 +71,12 @@ class FileComponent(BaseFileComponent):
         "webp",
     ]
 
+    # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.
+    VALID_EXTENSIONS = [
+        *TEXT_EXTENSIONS,
+        *DOCLING_ONLY_EXTENSIONS,
+    ]
+
     # Fixed export settings used when markdown export is requested.
     EXPORT_FORMAT = "Markdown"
     IMAGE_MODE = "placeholder"
@@ -70,10 +86,24 @@
     for input_item in _base_inputs:
         if isinstance(input_item, FileInput) and input_item.name == "path":
             input_item.real_time_refresh = True
+            input_item.tool_mode = False  # Disable tool mode for file upload input
+            input_item.required = False  # Make it optional so it doesn't error in tool mode
             break
 
     inputs = [
         *_base_inputs,
+        StrInput(
+            name="file_path_str",
+            display_name="File Path",
+            info=(
+                "Path to the file to read. Used when component is called as a tool. "
+                "If not provided, will use the uploaded file from 'path' input."
+            ),
+            show=False,
+            advanced=True,
+            tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
+            required=False,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",
@@ -152,9 +182,87 @@ class FileComponent(BaseFileComponent):
     ]
 
     outputs = [
-        Output(display_name="Raw Content", name="message", method="load_files_message"),
+        Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
     ]
 
+    # ------------------------------ Tool description with file names --------------
+
+    def get_tool_description(self) -> str:
+        """Return a dynamic description that includes the names of uploaded files.
+
+        This helps the Agent understand which files are available to read.
+        """
+        base_description = "Loads and returns the content from uploaded files."
+
+        # Get the list of uploaded file paths
+        file_paths = getattr(self, "path", None)
+        if not file_paths:
+            return base_description
+
+        # Ensure it's a list
+        if not isinstance(file_paths, list):
+            file_paths = [file_paths]
+
+        # Extract just the file names from the paths
+        file_names = []
+        for fp in file_paths:
+            if fp:
+                name = Path(fp).name
+                file_names.append(name)
+
+        if file_names:
+            files_str = ", ".join(file_names)
+            return f"{base_description} Available files: {files_str}. Call this tool to read these files."
+
+        return base_description
+
+    @property
+    def description(self) -> str:
+        """Dynamic description property that includes uploaded file names."""
+        return self.get_tool_description()
+
+    async def _get_tools(self) -> list:
+        """Override to create a tool without parameters.
+
+        The Read File component should use the files already uploaded via UI,
+        not accept file paths from the Agent (which wouldn't know the internal paths).
+        """
+        from langchain_core.tools import StructuredTool
+        from pydantic import BaseModel
+
+        # Empty schema - no parameters needed
+        class EmptySchema(BaseModel):
+            """No parameters required - uses pre-uploaded files."""
+
+        async def read_files_tool() -> str:
+            """Read the content of uploaded files."""
+            try:
+                result = self.load_files_message()
+                if hasattr(result, "get_text"):
+                    return result.get_text()
+                if hasattr(result, "text"):
+                    return result.text
+                return str(result)
+            except (FileNotFoundError, ValueError, OSError, RuntimeError) as e:
+                return f"Error reading files: {e}"
+
+        description = self.get_tool_description()
+
+        tool = StructuredTool(
+            name="load_files_message",
+            description=description,
+            coroutine=read_files_tool,
+            args_schema=EmptySchema,
+            handle_tool_error=True,
+            tags=["load_files_message"],
+            metadata={
+                "display_name": "Read File",
+                "display_description": description,
+            },
+        )
+
+        return [tool]
+
     # ------------------------------ UI helpers --------------------------------------
 
     def _path_value(self, template: dict) -> list[str]:
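For context on the `_get_tools` override above: it registers a coroutine-backed StructuredTool whose args schema declares no fields, so an agent invokes it with an empty argument dict and the component reads whatever files were uploaded in the UI. A minimal, self-contained sketch of that zero-argument tool pattern (the names and return value here are illustrative, not lfx code):

    import asyncio

    from langchain_core.tools import StructuredTool
    from pydantic import BaseModel


    class EmptySchema(BaseModel):
        """No fields: the coroutine relies on pre-configured state."""


    async def read_files() -> str:
        # Hypothetical stand-in for read_files_tool in the diff above
        return "contents of the uploaded files"


    tool = StructuredTool(
        name="load_files_message",
        description="Reads pre-uploaded files.",
        coroutine=read_files,
        args_schema=EmptySchema,
    )

    print(asyncio.run(tool.ainvoke({})))  # empty dict: the schema declares no fields

Supplying only `coroutine=` (no `func=`) makes the tool async-only, so it has to be awaited via `ainvoke` as shown.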
@@ -213,39 +321,84 @@
             file_path = paths[0] if field_name == "path" else frontend_node["template"]["path"]["file_path"][0]
             if file_path.endswith((".csv", ".xlsx", ".parquet")):
                 frontend_node["outputs"].append(
-                    Output(display_name="Structured Content", name="dataframe", method="load_files_structured"),
+                    Output(
+                        display_name="Structured Content",
+                        name="dataframe",
+                        method="load_files_structured",
+                        tool_mode=True,
+                    ),
                 )
             elif file_path.endswith(".json"):
                 frontend_node["outputs"].append(
-                    Output(display_name="Structured Content", name="json", method="load_files_json"),
+                    Output(display_name="Structured Content", name="json", method="load_files_json", tool_mode=True),
                 )
 
             advanced_mode = frontend_node.get("template", {}).get("advanced_mode", {}).get("value", False)
             if advanced_mode:
                 frontend_node["outputs"].append(
-                    Output(display_name="Structured Output", name="advanced_dataframe", method="load_files_dataframe"),
+                    Output(
+                        display_name="Structured Output",
+                        name="advanced_dataframe",
+                        method="load_files_dataframe",
+                        tool_mode=True,
+                    ),
                 )
                 frontend_node["outputs"].append(
-                    Output(display_name="Markdown", name="advanced_markdown", method="load_files_markdown"),
+                    Output(
+                        display_name="Markdown", name="advanced_markdown", method="load_files_markdown", tool_mode=True
+                    ),
                 )
                 frontend_node["outputs"].append(
-                    Output(display_name="File Path", name="path", method="load_files_path"),
+                    Output(display_name="File Path", name="path", method="load_files_path", tool_mode=True),
                 )
             else:
                 frontend_node["outputs"].append(
-                    Output(display_name="Raw Content", name="message", method="load_files_message"),
+                    Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
                 )
                 frontend_node["outputs"].append(
-                    Output(display_name="File Path", name="path", method="load_files_path"),
+                    Output(display_name="File Path", name="path", method="load_files_path", tool_mode=True),
                 )
         else:
             # Multiple files => DataFrame output; advanced parser disabled
-            frontend_node["outputs"].append(Output(display_name="Files", name="dataframe", method="load_files"))
+            frontend_node["outputs"].append(
+                Output(display_name="Files", name="dataframe", method="load_files", tool_mode=True)
+            )
 
         return frontend_node
 
     # ------------------------------ Core processing ----------------------------------
 
+    def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
+        """Override to handle file_path_str input from tool mode.
+
+        When called as a tool, the file_path_str parameter can be set.
+        If not provided, it will fall back to using the path FileInput (uploaded file).
+        Priority:
+        1. file_path_str (if provided by the tool call)
+        2. path (uploaded file from UI)
+        """
+        # Check if file_path_str is provided (from tool mode)
+        file_path_str = getattr(self, "file_path_str", None)
+        if file_path_str:
+            # Use the string path from tool mode
+            from pathlib import Path
+
+            from lfx.schema.data import Data
+
+            resolved_path = Path(self.resolve_path(file_path_str))
+            if not resolved_path.exists():
+                msg = f"File or directory not found: {file_path_str}"
+                self.log(msg)
+                if not self.silent_errors:
+                    raise ValueError(msg)
+                return []
+
+            data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(resolved_path)})
+            return [BaseFileComponent.BaseFile(data_obj, resolved_path, delete_after_processing=False)]
+
+        # Otherwise use the default implementation (uses path FileInput)
+        return super()._validate_and_resolve_paths()
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (
@@ -282,17 +435,76 @@
         )
         return file_path.lower().endswith(docling_exts)
 
+    async def _get_local_file_for_docling(self, file_path: str) -> tuple[str, bool]:
+        """Get a local file path for Docling processing, downloading from S3 if needed.
+
+        Args:
+            file_path: Either a local path or S3 key (format "flow_id/filename")
+
+        Returns:
+            tuple[str, bool]: (local_path, should_delete) where should_delete indicates
+                if this is a temporary file that should be cleaned up
+        """
+        settings = get_settings_service().settings
+        if settings.storage_type == "local":
+            return file_path, False
+
+        # S3 storage - download to temp file
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        storage_service = get_storage_service()
+        flow_id, filename = parsed
+
+        # Get file content from S3
+        content = await storage_service.get_file(flow_id, filename)
+
+        suffix = Path(filename).suffix
+        with NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp_file:
+            tmp_file.write(content)
+            temp_path = tmp_file.name
+
+        return temp_path, True
+
     def _process_docling_in_subprocess(self, file_path: str) -> Data | None:
         """Run Docling in a separate OS process and map the result to a Data object.
 
         We avoid multiprocessing pickling by launching `python -c "<script>"` and
         passing JSON config via stdin. The child prints a JSON result to stdout.
+
+        For S3 storage, the file is downloaded to a temp file first.
         """
         if not file_path:
             return None
 
+        settings = get_settings_service().settings
+        if settings.storage_type == "s3":
+            local_path, should_delete = run_until_complete(self._get_local_file_for_docling(file_path))
+        else:
+            local_path = file_path
+            should_delete = False
+
+        try:
+            return self._process_docling_subprocess_impl(local_path, file_path)
+        finally:
+            # Clean up temp file if we created one
+            if should_delete:
+                with contextlib.suppress(Exception):
+                    Path(local_path).unlink()  # Ignore cleanup errors
+
+    def _process_docling_subprocess_impl(self, local_file_path: str, original_file_path: str) -> Data | None:
+        """Implementation of Docling subprocess processing.
+
+        Args:
+            local_file_path: Path to local file to process
+            original_file_path: Original file path to include in metadata
+        Returns:
+            Data object with processed content
+        """
         args: dict[str, Any] = {
-            "file_path": file_path,
+            "file_path": local_file_path,
             "markdown": bool(self.markdown),
             "image_mode": str(self.IMAGE_MODE),
             "md_image_placeholder": str(self.md_image_placeholder),
@@ -303,7 +515,7 @@ class FileComponent(BaseFileComponent):
             ),
         }
 
-        self.log(f"Starting Docling subprocess for file: {file_path}")
+        self.log(f"Starting Docling subprocess for file: {local_file_path}")
         self.log(args)
 
         # Child script for isolating the docling processing
@@ -496,14 +708,17 @@
 
         if not proc.stdout:
             err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
-            return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": file_path})
+            return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})
 
         try:
             result = json.loads(proc.stdout.decode("utf-8"))
         except Exception as e:  # noqa: BLE001
             err_msg = proc.stderr.decode("utf-8", errors="replace")
             return Data(
-                data={"error": f"Invalid JSON from Docling subprocess: {e}. stderr={err_msg}", "file_path": file_path},
+                data={
+                    "error": f"Invalid JSON from Docling subprocess: {e}. stderr={err_msg}",
+                    "file_path": original_file_path,
+                },
             )
 
         if not result.get("ok"):
@@ -533,6 +748,39 @@
             msg = "No files to process."
             raise ValueError(msg)
 
+        # Validate image files to detect content/extension mismatches
+        # This prevents API errors like "Image does not match the provided media type"
+        image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        for file in file_list:
+            extension = file.path.suffix[1:].lower()
+            if extension in image_extensions:
+                # file.path is already resolved, read bytes directly
+                try:
+                    content = file.path.read_bytes()
+                    is_valid, error_msg = validate_image_content_type(
+                        str(file.path),
+                        content=content,
+                    )
+                    if not is_valid:
+                        self.log(error_msg)
+                        if not self.silent_errors:
+                            raise ValueError(error_msg)
+                except OSError as e:
+                    self.log(f"Could not read file for validation: {e}")
+                    # Continue - let it fail later with better error
+
+        # Validate that files requiring Docling are only processed when advanced mode is enabled
+        if not self.advanced_mode:
+            for file in file_list:
+                extension = file.path.suffix[1:].lower()
+                if extension in self.DOCLING_ONLY_EXTENSIONS:
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                    )
+                    self.log(msg)
+                    raise ValueError(msg)
+
         def process_file_standard(file_path: str, *, silent_errors: bool = False) -> Data | None:
             try:
                 return parse_text_file_to_data(file_path, silent_errors=silent_errors)
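The `validate_image_content_type` call above comes from the new `lfx/base/data/storage_utils.py`, and this diff shows its return shape: an `(is_valid, error_msg)` tuple. The general technique behind a check like this is magic-byte sniffing: compare the file's leading bytes against known image signatures and reject files whose content disagrees with their extension. A generic sketch of such a check, assuming nothing about the actual lfx implementation:

    from pathlib import Path

    # Leading-byte signatures for a few common image formats
    MAGIC_BYTES = {
        b"\x89PNG\r\n\x1a\n": "png",
        b"\xff\xd8\xff": "jpeg",
        b"GIF87a": "gif",
        b"GIF89a": "gif",
    }


    def sniff_image_type(content: bytes) -> str | None:
        return next((kind for magic, kind in MAGIC_BYTES.items() if content.startswith(magic)), None)


    def check_image_matches_extension(path: str, content: bytes) -> tuple[bool, str | None]:
        ext = Path(path).suffix[1:].lower()
        ext = "jpeg" if ext == "jpg" else ext
        actual = sniff_image_type(content)
        if actual is not None and actual != ext:
            return False, f"{path}: extension says '{ext}' but content looks like '{actual}'"
        return True, None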
@@ -559,7 +807,8 @@
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
             doc_rows = payload.get("doc")
-            if isinstance(doc_rows, list):
+            if isinstance(doc_rows, list) and doc_rows:
+                # Non-empty list of structured rows
                 rows: list[Data | None] = [
                     Data(
                         data={
@@ -570,6 +819,19 @@
                     for item in doc_rows
                 ]
                 final_return.extend(self.rollup_data(file_list, rows))
+            elif isinstance(doc_rows, list) and not doc_rows:
+                # Empty list - file was processed but no text content found
+                # Create a Data object indicating no content was extracted
+                self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                empty_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "text": "(No text content extracted from image)",
+                        "info": "Image processed successfully but contained no extractable text",
+                        **{k: v for k, v in payload.items() if k != "doc"},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
                 final_return.extend(self.rollup_data(file_list, [advanced_data]))
@@ -577,6 +839,7 @@
 
         # Standard multi-file (or single non-advanced) path
         concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)
+
        file_paths = [str(f.path) for f in file_list]
         self.log(f"Starting parallel processing of {len(file_paths)} files with concurrency: {concurrency}.")
         my_data = parallel_load_data(
@@ -592,13 +855,17 @@
     def load_files_helper(self) -> DataFrame:
         result = self.load_files()
 
-        # Error condition - raise error if no text and an error is present
-        if not hasattr(result, "text"):
-            if hasattr(result, "error"):
-                raise ValueError(result.error[0])
+        # Result is a DataFrame - check if it has any rows
+        if result.empty:
             msg = "Could not extract content from the provided file(s)."
             raise ValueError(msg)
 
+        # Check for error column with error messages
+        if "error" in result.columns:
+            errors = result["error"].dropna().tolist()
+            if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                raise ValueError(errors[0])
+
         return result
 
     def load_files_dataframe(self) -> DataFrame:
@@ -610,4 +877,17 @@
         """Load files using advanced Docling processing and export to Markdown format."""
         self.markdown = True
         result = self.load_files_helper()
-        return Message(text=str(result.text[0]))
+
+        # Result is a DataFrame - check for text or exported_content columns
+        if "text" in result.columns and not result["text"].isna().all():
+            text_values = result["text"].dropna().tolist()
+            if text_values:
+                return Message(text=str(text_values[0]))
+
+        if "exported_content" in result.columns and not result["exported_content"].isna().all():
+            content_values = result["exported_content"].dropna().tolist()
+            if content_values:
+                return Message(text=str(content_values[0]))
+
+        # Return empty message with info that no text was found
+        return Message(text="(No text content extracted from file)")
lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py
@@ -235,8 +235,8 @@ class KnowledgeRetrievalComponent(Component):
 
         # Only proceed if we have valid document IDs
         if doc_ids:
-            # Access underlying client to get embeddings
-            collection = chroma._client.get_collection(name=self.knowledge_base)
+            # Access underlying collection to get embeddings
+            collection = chroma._collection  # noqa: SLF001
             embeddings_result = collection.get(where={"_id": {"$in": doc_ids}}, include=["metadatas", "embeddings"])
 
             # Create a mapping from document ID to embedding