lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +21 -4
  3. lfx/base/agents/altk_base_agent.py +393 -0
  4. lfx/base/agents/altk_tool_wrappers.py +565 -0
  5. lfx/base/agents/events.py +2 -1
  6. lfx/base/composio/composio_base.py +159 -224
  7. lfx/base/data/base_file.py +97 -20
  8. lfx/base/data/docling_utils.py +61 -10
  9. lfx/base/data/storage_utils.py +301 -0
  10. lfx/base/data/utils.py +178 -14
  11. lfx/base/mcp/util.py +2 -2
  12. lfx/base/models/anthropic_constants.py +21 -12
  13. lfx/base/models/groq_constants.py +74 -58
  14. lfx/base/models/groq_model_discovery.py +265 -0
  15. lfx/base/models/model.py +1 -1
  16. lfx/base/models/model_utils.py +100 -0
  17. lfx/base/models/openai_constants.py +7 -0
  18. lfx/base/models/watsonx_constants.py +32 -8
  19. lfx/base/tools/run_flow.py +601 -129
  20. lfx/cli/commands.py +9 -4
  21. lfx/cli/common.py +2 -2
  22. lfx/cli/run.py +1 -1
  23. lfx/cli/script_loader.py +53 -11
  24. lfx/components/Notion/create_page.py +1 -1
  25. lfx/components/Notion/list_database_properties.py +1 -1
  26. lfx/components/Notion/list_pages.py +1 -1
  27. lfx/components/Notion/list_users.py +1 -1
  28. lfx/components/Notion/page_content_viewer.py +1 -1
  29. lfx/components/Notion/search.py +1 -1
  30. lfx/components/Notion/update_page_property.py +1 -1
  31. lfx/components/__init__.py +19 -5
  32. lfx/components/{agents → altk}/__init__.py +5 -9
  33. lfx/components/altk/altk_agent.py +193 -0
  34. lfx/components/apify/apify_actor.py +1 -1
  35. lfx/components/composio/__init__.py +70 -18
  36. lfx/components/composio/apollo_composio.py +11 -0
  37. lfx/components/composio/bitbucket_composio.py +11 -0
  38. lfx/components/composio/canva_composio.py +11 -0
  39. lfx/components/composio/coda_composio.py +11 -0
  40. lfx/components/composio/composio_api.py +10 -0
  41. lfx/components/composio/discord_composio.py +1 -1
  42. lfx/components/composio/elevenlabs_composio.py +11 -0
  43. lfx/components/composio/exa_composio.py +11 -0
  44. lfx/components/composio/firecrawl_composio.py +11 -0
  45. lfx/components/composio/fireflies_composio.py +11 -0
  46. lfx/components/composio/gmail_composio.py +1 -1
  47. lfx/components/composio/googlebigquery_composio.py +11 -0
  48. lfx/components/composio/googlecalendar_composio.py +1 -1
  49. lfx/components/composio/googledocs_composio.py +1 -1
  50. lfx/components/composio/googlemeet_composio.py +1 -1
  51. lfx/components/composio/googlesheets_composio.py +1 -1
  52. lfx/components/composio/googletasks_composio.py +1 -1
  53. lfx/components/composio/heygen_composio.py +11 -0
  54. lfx/components/composio/mem0_composio.py +11 -0
  55. lfx/components/composio/peopledatalabs_composio.py +11 -0
  56. lfx/components/composio/perplexityai_composio.py +11 -0
  57. lfx/components/composio/serpapi_composio.py +11 -0
  58. lfx/components/composio/slack_composio.py +3 -574
  59. lfx/components/composio/slackbot_composio.py +1 -1
  60. lfx/components/composio/snowflake_composio.py +11 -0
  61. lfx/components/composio/tavily_composio.py +11 -0
  62. lfx/components/composio/youtube_composio.py +2 -2
  63. lfx/components/cuga/__init__.py +34 -0
  64. lfx/components/cuga/cuga_agent.py +730 -0
  65. lfx/components/data/__init__.py +78 -28
  66. lfx/components/data_source/__init__.py +58 -0
  67. lfx/components/{data → data_source}/api_request.py +26 -3
  68. lfx/components/{data → data_source}/csv_to_data.py +15 -10
  69. lfx/components/{data → data_source}/json_to_data.py +15 -8
  70. lfx/components/{data → data_source}/news_search.py +1 -1
  71. lfx/components/{data → data_source}/rss.py +1 -1
  72. lfx/components/{data → data_source}/sql_executor.py +1 -1
  73. lfx/components/{data → data_source}/url.py +1 -1
  74. lfx/components/{data → data_source}/web_search.py +1 -1
  75. lfx/components/datastax/astradb_cql.py +1 -1
  76. lfx/components/datastax/astradb_graph.py +1 -1
  77. lfx/components/datastax/astradb_tool.py +1 -1
  78. lfx/components/datastax/astradb_vectorstore.py +1 -1
  79. lfx/components/datastax/hcd.py +1 -1
  80. lfx/components/deactivated/json_document_builder.py +1 -1
  81. lfx/components/docling/__init__.py +0 -3
  82. lfx/components/docling/chunk_docling_document.py +3 -1
  83. lfx/components/docling/export_docling_document.py +3 -1
  84. lfx/components/elastic/elasticsearch.py +1 -1
  85. lfx/components/files_and_knowledge/__init__.py +47 -0
  86. lfx/components/{data → files_and_knowledge}/directory.py +1 -1
  87. lfx/components/{data → files_and_knowledge}/file.py +304 -24
  88. lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
  89. lfx/components/{data → files_and_knowledge}/save_file.py +218 -31
  90. lfx/components/flow_controls/__init__.py +58 -0
  91. lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
  92. lfx/components/{logic → flow_controls}/loop.py +43 -9
  93. lfx/components/flow_controls/run_flow.py +108 -0
  94. lfx/components/glean/glean_search_api.py +1 -1
  95. lfx/components/groq/groq.py +35 -28
  96. lfx/components/helpers/__init__.py +102 -0
  97. lfx/components/ibm/watsonx.py +7 -1
  98. lfx/components/input_output/__init__.py +3 -1
  99. lfx/components/input_output/chat.py +4 -3
  100. lfx/components/input_output/chat_output.py +10 -4
  101. lfx/components/input_output/text.py +1 -1
  102. lfx/components/input_output/text_output.py +1 -1
  103. lfx/components/{data → input_output}/webhook.py +1 -1
  104. lfx/components/knowledge_bases/__init__.py +59 -4
  105. lfx/components/langchain_utilities/character.py +1 -1
  106. lfx/components/langchain_utilities/csv_agent.py +84 -16
  107. lfx/components/langchain_utilities/json_agent.py +67 -12
  108. lfx/components/langchain_utilities/language_recursive.py +1 -1
  109. lfx/components/llm_operations/__init__.py +46 -0
  110. lfx/components/{processing → llm_operations}/batch_run.py +17 -8
  111. lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
  112. lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
  113. lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
  114. lfx/components/{processing → llm_operations}/structured_output.py +1 -1
  115. lfx/components/logic/__init__.py +126 -0
  116. lfx/components/mem0/mem0_chat_memory.py +11 -0
  117. lfx/components/models/__init__.py +64 -9
  118. lfx/components/models_and_agents/__init__.py +49 -0
  119. lfx/components/{agents → models_and_agents}/agent.py +6 -4
  120. lfx/components/models_and_agents/embedding_model.py +353 -0
  121. lfx/components/models_and_agents/language_model.py +398 -0
  122. lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
  123. lfx/components/{helpers → models_and_agents}/memory.py +1 -1
  124. lfx/components/nvidia/system_assist.py +1 -1
  125. lfx/components/olivya/olivya.py +1 -1
  126. lfx/components/ollama/ollama.py +24 -5
  127. lfx/components/processing/__init__.py +9 -60
  128. lfx/components/processing/converter.py +1 -1
  129. lfx/components/processing/dataframe_operations.py +1 -1
  130. lfx/components/processing/parse_json_data.py +2 -2
  131. lfx/components/processing/parser.py +1 -1
  132. lfx/components/processing/split_text.py +1 -1
  133. lfx/components/qdrant/qdrant.py +1 -1
  134. lfx/components/redis/redis.py +1 -1
  135. lfx/components/twelvelabs/split_video.py +10 -0
  136. lfx/components/twelvelabs/video_file.py +12 -0
  137. lfx/components/utilities/__init__.py +43 -0
  138. lfx/components/{helpers → utilities}/calculator_core.py +1 -1
  139. lfx/components/{helpers → utilities}/current_date.py +1 -1
  140. lfx/components/{processing → utilities}/python_repl_core.py +1 -1
  141. lfx/components/vectorstores/local_db.py +9 -0
  142. lfx/components/youtube/youtube_transcripts.py +118 -30
  143. lfx/custom/custom_component/component.py +57 -1
  144. lfx/custom/custom_component/custom_component.py +68 -6
  145. lfx/custom/directory_reader/directory_reader.py +5 -2
  146. lfx/graph/edge/base.py +43 -20
  147. lfx/graph/state/model.py +15 -2
  148. lfx/graph/utils.py +6 -0
  149. lfx/graph/vertex/param_handler.py +10 -7
  150. lfx/helpers/__init__.py +12 -0
  151. lfx/helpers/flow.py +117 -0
  152. lfx/inputs/input_mixin.py +24 -1
  153. lfx/inputs/inputs.py +13 -1
  154. lfx/interface/components.py +161 -83
  155. lfx/log/logger.py +5 -3
  156. lfx/schema/image.py +2 -12
  157. lfx/services/database/__init__.py +5 -0
  158. lfx/services/database/service.py +25 -0
  159. lfx/services/deps.py +87 -22
  160. lfx/services/interfaces.py +5 -0
  161. lfx/services/manager.py +24 -10
  162. lfx/services/mcp_composer/service.py +1029 -162
  163. lfx/services/session.py +5 -0
  164. lfx/services/settings/auth.py +18 -11
  165. lfx/services/settings/base.py +56 -30
  166. lfx/services/settings/constants.py +8 -0
  167. lfx/services/storage/local.py +108 -46
  168. lfx/services/storage/service.py +171 -29
  169. lfx/template/field/base.py +3 -0
  170. lfx/utils/image.py +29 -11
  171. lfx/utils/ssrf_protection.py +384 -0
  172. lfx/utils/validate_cloud.py +26 -0
  173. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +38 -22
  174. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +189 -160
  175. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +1 -1
  176. lfx/components/agents/altk_agent.py +0 -366
  177. lfx/components/agents/cuga_agent.py +0 -1013
  178. lfx/components/docling/docling_remote_vlm.py +0 -284
  179. lfx/components/logic/run_flow.py +0 -71
  180. lfx/components/models/embedding_model.py +0 -195
  181. lfx/components/models/language_model.py +0 -144
  182. lfx/components/processing/dataframe_to_toolset.py +0 -259
  183. /lfx/components/{data → data_source}/mock_data.py +0 -0
  184. /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
  185. /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
  186. /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
  187. /lfx/components/{logic → flow_controls}/listen.py +0 -0
  188. /lfx/components/{logic → flow_controls}/notify.py +0 -0
  189. /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
  190. /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
  191. /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
  192. /lfx/components/{helpers → processing}/create_list.py +0 -0
  193. /lfx/components/{helpers → processing}/output_parser.py +0 -0
  194. /lfx/components/{helpers → processing}/store_message.py +0 -0
  195. /lfx/components/{helpers → utilities}/id_generator.py +0 -0
  196. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/base/data/base_file.py

@@ -2,6 +2,7 @@ import ast
 import shutil
 import tarfile
 from abc import ABC, abstractmethod
+from io import BytesIO
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any
@@ -10,11 +11,14 @@ from zipfile import ZipFile, is_zipfile
 import orjson
 import pandas as pd
 
+from lfx.base.data.storage_utils import get_file_size, read_file_bytes
 from lfx.custom.custom_component.component import Component
 from lfx.io import BoolInput, FileInput, HandleInput, Output, StrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame
 from lfx.schema.message import Message
+from lfx.services.deps import get_settings_service
+from lfx.utils.async_helpers import run_until_complete
 from lfx.utils.helpers import build_content_type_from_extension
 
 if TYPE_CHECKING:
@@ -27,6 +31,8 @@ class BaseFileComponent(Component, ABC):
     This class provides common functionality for resolving, validating, and
     processing file paths. Child classes must define valid file extensions
     and implement the `process_files` method.
+
+    # TODO: May want to subclass for local and remote files
     """
 
     class BaseFile:
@@ -251,12 +257,25 @@ class BaseFileComponent(Component, ABC):
 
         file_path = data_item.file_path
         file_path_obj = Path(file_path)
-        file_size_stat = file_path_obj.stat()
         filename = file_path_obj.name
 
+        settings = get_settings_service().settings
+        if settings.storage_type == "s3":
+            try:
+                file_size = get_file_size(file_path)
+            except (FileNotFoundError, ValueError):
+                # If we can't get file size, set to 0 or omit
+                file_size = 0
+        else:
+            try:
+                file_size_stat = file_path_obj.stat()
+                file_size = file_size_stat.st_size
+            except OSError:
+                file_size = 0
+
         # Basic file metadata
         metadata["filename"] = filename
-        metadata["file_size"] = file_size_stat.st_size
+        metadata["file_size"] = file_size
 
         # Add MIME type from extension
         extension = filename.split(".")[-1]
@@ -321,7 +340,16 @@ class BaseFileComponent(Component, ABC):
             Message: Message containing file paths
         """
         files = self._validate_and_resolve_paths()
-        paths = [file.path.as_posix() for file in files if file.path.exists()]
+        settings = get_settings_service().settings
+
+        # For S3 storage, paths are virtual storage keys that don't exist on the local filesystem.
+        # Skip the exists() check for S3 files to preserve them in the output.
+        # Validation of S3 file existence is deferred until file processing (see _validate_and_resolve_paths).
+        # If a file was removed from S3, it will fail when attempting to read/process it later.
+        if settings.storage_type == "s3":
+            paths = [file.path.as_posix() for file in files]
+        else:
+            paths = [file.path.as_posix() for file in files if file.path.exists()]
 
         return Message(text="\n".join(paths) if paths else "")
 
@@ -329,7 +357,29 @@ class BaseFileComponent(Component, ABC):
         if not file_path:
             return None
 
-        # Map file extensions to pandas read functions with type annotation
+        # Get file extension in lowercase
+        ext = Path(file_path).suffix.lower()
+
+        settings = get_settings_service().settings
+
+        # For S3 storage, download file bytes first
+        if settings.storage_type == "s3":
+            # Download file content from S3
+            content = run_until_complete(read_file_bytes(file_path))
+
+            # Map file extensions to pandas read functions that support BytesIO
+            if ext == ".csv":
+                result = pd.read_csv(BytesIO(content))
+            elif ext == ".xlsx":
+                result = pd.read_excel(BytesIO(content))
+            elif ext == ".parquet":
+                result = pd.read_parquet(BytesIO(content))
+            else:
+                return None
+
+            return result.to_dict("records")
+
+        # Local storage - read directly from filesystem
         file_readers: dict[str, Callable[[str], pd.DataFrame]] = {
             ".csv": pd.read_csv,
             ".xlsx": pd.read_excel,
@@ -337,9 +387,6 @@ class BaseFileComponent(Component, ABC):
             # TODO: sqlite and json support?
        }
 
-        # Get file extension in lowercase
-        ext = Path(file_path).suffix.lower()
-
         # Get the appropriate reader function or None
         reader = file_readers.get(ext)
 
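The S3 branch above never touches the local filesystem: `read_file_bytes` returns raw bytes, and the pandas readers accept any file-like object, which is why the download is wrapped in `BytesIO`. A minimal sketch of that pattern in isolation (the inline CSV bytes stand in for an S3 download and are purely illustrative):

```python
from io import BytesIO

import pandas as pd

# Stand-in for bytes fetched from object storage, e.g.:
#   content = run_until_complete(read_file_bytes("flow_id/data.csv"))
content = b"name,score\nalice,1\nbob,2\n"

# pandas readers accept file-like objects, so no temporary file is needed
result = pd.read_csv(BytesIO(content))
print(result.to_dict("records"))
# [{'name': 'alice', 'score': 1}, {'name': 'bob', 'score': 2}]
```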
@@ -558,16 +605,38 @@ class BaseFileComponent(Component, ABC):
         resolved_files = []
 
         def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
-            resolved_path = Path(self.resolve_path(str(path)))
+            path_str = str(path)
+            settings = get_settings_service().settings
+
+            # When using object storage (S3), file paths are storage keys (e.g., "<flow_id>/<filename>")
+            # that don't exist on the local filesystem. We defer validation until file processing.
+            # For local storage, validate the file exists immediately to fail fast.
+            if settings.storage_type == "s3":
+                resolved_files.append(
+                    BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
+                )
+            else:
+                # Check if path looks like a storage path (flow_id/filename format)
+                # If so, use get_full_path to resolve it to the actual storage location
+                if "/" in path_str and not Path(path_str).is_absolute():
+                    try:
+                        resolved_path = Path(self.get_full_path(path_str))
+                        self.log(f"Resolved storage path '{path_str}' to '{resolved_path}'")
+                    except (ValueError, AttributeError) as e:
+                        # Fallback to resolve_path if get_full_path fails
+                        self.log(f"get_full_path failed for '{path_str}': {e}, falling back to resolve_path")
+                        resolved_path = Path(self.resolve_path(path_str))
+                else:
+                    resolved_path = Path(self.resolve_path(path_str))
 
-            if not resolved_path.exists():
-                msg = f"File or directory not found: {path}"
-                self.log(msg)
-                if not self.silent_errors:
-                    raise ValueError(msg)
-            resolved_files.append(
-                BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
-            )
+                if not resolved_path.exists():
+                    msg = f"File not found: '{path}' (resolved to: '{resolved_path}'). Please upload the file again."
+                    self.log(msg)
+                    if not self.silent_errors:
+                        raise ValueError(msg)
+                resolved_files.append(
+                    BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
+                )
 
         file_path = self._file_path_as_list()
 
@@ -707,7 +776,7 @@ class BaseFileComponent(Component, ABC):
             raise ValueError(msg)
 
     def _filter_and_mark_files(self, files: list[BaseFile]) -> list[BaseFile]:
-        """Validate file types and mark files for removal.
+        """Validate file types and filter out invalid files.
 
         Args:
             files (list[BaseFile]): List of BaseFile instances.
@@ -718,18 +787,26 @@ class BaseFileComponent(Component, ABC):
         Raises:
             ValueError: If unsupported files are encountered and `ignore_unsupported_extensions` is False.
         """
+        settings = get_settings_service().settings
+        is_s3_storage = settings.storage_type == "s3"
         final_files = []
         ignored_files = []
 
         for file in files:
-            if not file.path.is_file():
+            # For local storage, verify the path is actually a file
+            # For S3 storage, paths are virtual keys that don't exist locally
+            if not is_s3_storage and not file.path.is_file():
                 self.log(f"Not a file: {file.path.name}")
                 continue
 
-            if file.path.suffix[1:].lower() not in self.valid_extensions:
-                if self.ignore_unsupported_extensions:
+            # Validate file extension
+            extension = file.path.suffix[1:].lower() if file.path.suffix else ""
+            if extension not in self.valid_extensions:
+                # For local storage, optionally ignore unsupported extensions
+                if not is_s3_storage and self.ignore_unsupported_extensions:
                     ignored_files.append(file.path.name)
                     continue
+
                 msg = f"Unsupported file extension: {file.path.suffix}"
                 self.log(msg)
                 if not self.silent_errors:
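The local-storage branch of `add_file` decides between `get_full_path` and `resolve_path` with the `"/" in path_str and not Path(path_str).is_absolute()` check. How those two pathlib predicates classify the path shapes involved (POSIX semantics; file names are illustrative):

```python
from pathlib import Path

def looks_like_storage_key(path_str: str) -> bool:
    # Same heuristic as add_file: a relative path containing a separator
    return "/" in path_str and not Path(path_str).is_absolute()

print(looks_like_storage_key("a1b2c3/report.csv"))  # True  -> try get_full_path first
print(looks_like_storage_key("/tmp/report.csv"))    # False -> absolute, use resolve_path
print(looks_like_storage_key("report.csv"))         # False -> bare name, use resolve_path
```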
lfx/base/data/docling_utils.py

@@ -25,21 +25,72 @@ class DoclingDependencyError(Exception):
         super().__init__(f"{dependency_name} is not correctly installed. {install_command}")
 
 
-def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
+def extract_docling_documents(
+    data_inputs: Data | list[Data] | DataFrame, doc_key: str
+) -> tuple[list[DoclingDocument], str | None]:
+    """Extract DoclingDocument objects from data inputs.
+
+    Args:
+        data_inputs: The data inputs containing DoclingDocument objects
+        doc_key: The key/column name to look for DoclingDocument objects
+
+    Returns:
+        A tuple of (documents, warning_message) where warning_message is None if no warning
+
+    Raises:
+        TypeError: If the data cannot be extracted or is invalid
+    """
     documents: list[DoclingDocument] = []
+    warning_message: str | None = None
+
     if isinstance(data_inputs, DataFrame):
         if not len(data_inputs):
             msg = "DataFrame is empty"
             raise TypeError(msg)
 
-        if doc_key not in data_inputs.columns:
-            msg = f"Column '{doc_key}' not found in DataFrame"
-            raise TypeError(msg)
-        try:
-            documents = data_inputs[doc_key].tolist()
-        except Exception as e:
-            msg = f"Error extracting DoclingDocument from DataFrame: {e}"
-            raise TypeError(msg) from e
+        # Primary: Check for exact column name match
+        if doc_key in data_inputs.columns:
+            try:
+                documents = data_inputs[doc_key].tolist()
+            except Exception as e:
+                msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                raise TypeError(msg) from e
+        else:
+            # Fallback: Search all columns for DoclingDocument objects
+            found_column = None
+            for col in data_inputs.columns:
+                try:
+                    # Check if this column contains DoclingDocument objects
+                    sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                    if sample is not None and isinstance(sample, DoclingDocument):
+                        found_column = col
+                        break
+                except (IndexError, AttributeError):
+                    continue
+
+            if found_column:
+                warning_message = (
+                    f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                    f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                )
+                logger.warning(warning_message)
+                try:
+                    documents = data_inputs[found_column].tolist()
+                except Exception as e:
+                    msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                    raise TypeError(msg) from e
+            else:
+                # Provide helpful error message
+                available_columns = list(data_inputs.columns)
+                msg = (
+                    f"Column '{doc_key}' not found in DataFrame. "
+                    f"Available columns: {available_columns}. "
+                    f"\n\nPossible solutions:\n"
+                    f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                    f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                    f"3. If using VLM pipeline, try using the standard pipeline"
+                )
+                raise TypeError(msg)
     else:
         if not data_inputs:
             msg = "No data inputs provided"
@@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
         except AttributeError as e:
             msg = f"Invalid input type in collection: {e}"
             raise TypeError(msg) from e
-    return documents
+    return documents, warning_message
 
 
 def _unwrap_secrets(obj):
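Because `extract_docling_documents` now returns a `(documents, warning_message)` tuple rather than a bare list, every call site has to unpack it. A hedged sketch of an adapted caller (how the warning is surfaced is an assumption; the diff only defines the new return shape):

```python
from lfx.base.data.docling_utils import extract_docling_documents

def get_documents(data_inputs, doc_key: str):
    # Old-style `documents = extract_docling_documents(...)` would now bind a tuple.
    documents, warning = extract_docling_documents(data_inputs, doc_key)
    if warning:
        # e.g., forward the column-fallback warning to component status or logs
        print(f"docling: {warning}")
    return documents
```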
lfx/base/data/storage_utils.py (new file)

@@ -0,0 +1,301 @@
+"""Storage-aware file utilities for components.
+
+This module provides utilities that work with both local files and remote files
+stored in the storage service.
+
+TODO: Can abstract these into the storage service interface and update
+implementations.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from lfx.services.deps import get_settings_service, get_storage_service
+from lfx.utils.async_helpers import run_until_complete
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from lfx.services.storage.service import StorageService
+
+# Constants for path parsing
+EXPECTED_PATH_PARTS = 2  # Path format: "flow_id/filename"
+
+
+def parse_storage_path(path: str) -> tuple[str, str] | None:
+    """Parse a storage service path into flow_id and filename.
+
+    Storage service paths follow the format: flow_id/filename
+    This should only be called when storage_type == "s3".
+
+    Args:
+        path: The storage service path in format "flow_id/filename"
+
+    Returns:
+        tuple[str, str] | None: (flow_id, filename) or None if invalid format
+    """
+    if not path or "/" not in path:
+        return None
+
+    parts = path.split("/", 1)
+    if len(parts) != EXPECTED_PATH_PARTS or not parts[0] or not parts[1]:
+        return None
+
+    return parts[0], parts[1]
+
+
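Because the key is split with `split("/", 1)`, only the first separator matters: everything after it stays in the filename part. Some illustrative inputs (the IDs are made up):

```python
from lfx.base.data.storage_utils import parse_storage_path

print(parse_storage_path("a1b2c3/report.csv"))      # ('a1b2c3', 'report.csv')
print(parse_storage_path("a1b2c3/sub/report.csv"))  # ('a1b2c3', 'sub/report.csv')
print(parse_storage_path("report.csv"))             # None (no separator)
print(parse_storage_path("/report.csv"))            # None (empty flow_id)
```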
+async def read_file_bytes(
+    file_path: str,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> bytes:
+    """Read file bytes from either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
+        storage_service: Optional storage service instance (will get from deps if not provided)
+        resolve_path: Optional function to resolve relative paths to absolute paths
+            (typically Component.resolve_path). Only used for local storage.
+
+    Returns:
+        bytes: The file content
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        if storage_service is None:
+            storage_service = get_storage_service()
+
+        flow_id, filename = parsed
+        return await storage_service.get_file(flow_id, filename)
+
+    # For local storage, resolve path if resolver provided
+    if resolve_path:
+        file_path = resolve_path(file_path)
+
+    path_obj = Path(file_path)
+    if not path_obj.exists():
+        msg = f"File not found: {file_path}"
+        raise FileNotFoundError(msg)
+
+    return path_obj.read_bytes()
+
+
+async def read_file_text(
+    file_path: str,
+    encoding: str = "utf-8",
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+    newline: str | None = None,
+) -> str:
+    r"""Read file text from either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (storage service path or local path)
+        encoding: Text encoding to use
+        storage_service: Optional storage service instance
+        resolve_path: Optional function to resolve relative paths to absolute paths
+            (typically Component.resolve_path). Only used for local storage.
+        newline: Newline mode (None for default, "" for universal newlines like CSV).
+            When set to "", normalizes all line endings to \\n for consistency.
+
+    Returns:
+        str: The file content as text
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        content = await read_file_bytes(file_path, storage_service, resolve_path)
+        text = content.decode(encoding)
+        # Normalize newlines for S3 when newline="" is specified (universal newline mode)
+        if newline == "":
+            # Convert all line endings to \n (matches Python's universal newline mode)
+            text = text.replace("\r\n", "\n").replace("\r", "\n")
+        return text
+    # For local storage, resolve path if resolver provided
+    if resolve_path:
+        file_path = resolve_path(file_path)
+
+    path_obj = Path(file_path)
+    if newline is not None:
+        with path_obj.open(newline=newline, encoding=encoding) as f:  # noqa: ASYNC230
+            return f.read()
+    return path_obj.read_text(encoding=encoding)
+
+
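Both readers are coroutines, so synchronous component code would go through `run_until_complete`, the same helper this module already imports. A hedged usage sketch (the storage key and a configured storage backend are assumptions):

```python
import csv
from io import StringIO

from lfx.base.data.storage_utils import read_file_text
from lfx.utils.async_helpers import run_until_complete

# Same call shape for both backends: an S3 key like "flow_id/table.csv"
# or a local path, depending on settings.storage_type.
text = run_until_complete(read_file_text("flow_id/table.csv", newline=""))

# With newline="" the line endings are normalized, which is what csv expects.
rows = list(csv.reader(StringIO(text)))
```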
+def get_file_size(file_path: str, storage_service: StorageService | None = None) -> int:
+    """Get file size from either storage service or local filesystem.
+
+    Note: This is a sync wrapper - for async code, use the storage service directly.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
+        storage_service: Optional storage service instance
+
+    Returns:
+        int: File size in bytes
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        if storage_service is None:
+            storage_service = get_storage_service()
+
+        flow_id, filename = parsed
+        return run_until_complete(storage_service.get_file_size(flow_id, filename))
+
+    # Local file system
+    path_obj = Path(file_path)
+    if not path_obj.exists():
+        msg = f"File not found: {file_path}"
+        raise FileNotFoundError(msg)
+
+    return path_obj.stat().st_size
+
+
+def file_exists(file_path: str, storage_service: StorageService | None = None) -> bool:
+    """Check if a file exists in either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
+        storage_service: Optional storage service instance
+
+    Returns:
+        bool: True if the file exists
+    """
+    try:
+        get_file_size(file_path, storage_service)
+    except (FileNotFoundError, ValueError):
+        return False
+    else:
+        return True
+
+
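`file_exists` reuses `get_file_size` as an existence probe, with `try/except/else` keeping the success return out of the exception scope. The same idiom with only the standard library, for comparison (paths are illustrative):

```python
from pathlib import Path

def local_file_exists(file_path: str) -> bool:
    # Probe the file; treat any OS-level failure as "does not exist",
    # and return True only when the probe itself succeeded.
    try:
        Path(file_path).stat()
    except OSError:
        return False
    else:
        return True

print(local_file_exists("/etc/hosts"))     # True on typical Unix systems
print(local_file_exists("/no/such/file"))  # False
```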
+# Magic bytes signatures for common image formats
+MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+    "jpeg": [(b"\xff\xd8\xff", 0)],
+    "jpg": [(b"\xff\xd8\xff", 0)],
+    "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+    "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+    "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+    "bmp": [(b"BM", 0)],
+    "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+}
+
+
+def detect_image_type_from_bytes(content: bytes) -> str | None:
+    """Detect the actual image type from file content using magic bytes.
+
+    Args:
+        content: The file content bytes (at least first 12 bytes needed)
+
+    Returns:
+        str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+    """
+    if len(content) < MIN_IMAGE_HEADER_SIZE:
+        return None
+
+    # Check WebP specifically (needs to check both RIFF and WEBP)
+    if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+        return "webp"
+
+    # Check other image signatures
+    for image_type, signatures in IMAGE_SIGNATURES.items():
+        if image_type == "webp":
+            continue  # Already handled above
+        for signature, offset in signatures:
+            if content[offset : offset + len(signature)] == signature:
+                return image_type
+
+    return None
+
+
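The signatures above are the standard magic numbers (PNG's 8-byte header, JPEG's `FF D8 FF`, and so on), so the detector can be exercised with hand-built byte strings:

```python
from lfx.base.data.storage_utils import detect_image_type_from_bytes

png_header = b"\x89PNG\r\n\x1a\n" + b"\x00\x00\x00\rIHDR"
print(detect_image_type_from_bytes(png_header))  # png

jpeg_header = b"\xff\xd8\xff\xe0" + b"\x00" * 8
print(detect_image_type_from_bytes(jpeg_header))  # jpeg

print(detect_image_type_from_bytes(b"not an image at all"))  # None
print(detect_image_type_from_bytes(b"short"))  # None (below the 12-byte minimum)
```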
+def validate_image_content_type(
+    file_path: str,
+    content: bytes | None = None,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that an image file's content matches its declared extension.
+
+    This prevents errors like "Image does not match the provided media type image/png"
+    when a JPEG file is saved with a .png extension.
+
+    Only rejects files when we can definitively detect a mismatch. Files with
+    unrecognized content are allowed through (they may fail later, but that's
+    better than false positives blocking valid files).
+
+    Args:
+        file_path: Path to the image file
+        content: Optional pre-read file content bytes. If not provided, will read from file.
+        storage_service: Optional storage service instance for S3 files
+        resolve_path: Optional function to resolve relative paths
+
+    Returns:
+        tuple[bool, str | None]: (is_valid, error_message)
+            - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+            - (False, error_message) if there's a definite mismatch
+    """
+    # Get the file extension
+    path_obj = Path(file_path)
+    extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+    # Only validate image files
+    image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+    if extension not in image_extensions:
+        return True, None
+
+    # Read content if not provided
+    if content is None:
+        try:
+            content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+        except (FileNotFoundError, ValueError):
+            # Can't read file - let it pass, will fail later with better error
+            return True, None
+
+    # Detect actual image type
+    detected_type = detect_image_type_from_bytes(content)
+
+    # If we can't detect the type, the file is not a valid image
+    if detected_type is None:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but its content "
+            f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+        )
+
+    # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+    extension_normalized = "jpeg" if extension == "jpg" else extension
+    detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+    if extension_normalized != detected_normalized:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but contains "
+            f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+            f"Please rename the file with the correct extension '.{detected_type}' or "
+            f"re-save it in the correct format."
+        )
+
+    return True, None
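
A hedged sketch of calling the validator directly (file names and bytes are illustrative). When `content` is supplied, no storage lookup happens at all; and per the code above, bytes that don't match any known image signature under an image extension are treated as a definite failure:

```python
from lfx.base.data.storage_utils import validate_image_content_type

# JPEG bytes saved under a .png name: a definite mismatch.
jpeg_bytes = b"\xff\xd8\xff\xe0" + b"\x00" * 8
is_valid, error = validate_image_content_type("photo.png", content=jpeg_bytes)
print(is_valid)  # False
print(error)     # "File 'photo.png' has extension '.png' but contains 'JPEG' image data..."

# Non-image extensions are never validated.
print(validate_image_content_type("notes.txt", content=b"hello"))  # (True, None)
```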