lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +13 -1
  3. lfx/base/agents/altk_base_agent.py +380 -0
  4. lfx/base/agents/altk_tool_wrappers.py +565 -0
  5. lfx/base/agents/events.py +2 -1
  6. lfx/base/composio/composio_base.py +159 -224
  7. lfx/base/data/base_file.py +88 -21
  8. lfx/base/data/storage_utils.py +192 -0
  9. lfx/base/data/utils.py +178 -14
  10. lfx/base/embeddings/embeddings_class.py +113 -0
  11. lfx/base/models/groq_constants.py +74 -58
  12. lfx/base/models/groq_model_discovery.py +265 -0
  13. lfx/base/models/model.py +1 -1
  14. lfx/base/models/model_utils.py +100 -0
  15. lfx/base/models/openai_constants.py +7 -0
  16. lfx/base/models/watsonx_constants.py +32 -8
  17. lfx/base/tools/run_flow.py +601 -129
  18. lfx/cli/commands.py +6 -3
  19. lfx/cli/common.py +2 -2
  20. lfx/cli/run.py +1 -1
  21. lfx/cli/script_loader.py +53 -11
  22. lfx/components/Notion/create_page.py +1 -1
  23. lfx/components/Notion/list_database_properties.py +1 -1
  24. lfx/components/Notion/list_pages.py +1 -1
  25. lfx/components/Notion/list_users.py +1 -1
  26. lfx/components/Notion/page_content_viewer.py +1 -1
  27. lfx/components/Notion/search.py +1 -1
  28. lfx/components/Notion/update_page_property.py +1 -1
  29. lfx/components/__init__.py +19 -5
  30. lfx/components/{agents → altk}/__init__.py +5 -9
  31. lfx/components/altk/altk_agent.py +193 -0
  32. lfx/components/apify/apify_actor.py +1 -1
  33. lfx/components/composio/__init__.py +70 -18
  34. lfx/components/composio/apollo_composio.py +11 -0
  35. lfx/components/composio/bitbucket_composio.py +11 -0
  36. lfx/components/composio/canva_composio.py +11 -0
  37. lfx/components/composio/coda_composio.py +11 -0
  38. lfx/components/composio/composio_api.py +10 -0
  39. lfx/components/composio/discord_composio.py +1 -1
  40. lfx/components/composio/elevenlabs_composio.py +11 -0
  41. lfx/components/composio/exa_composio.py +11 -0
  42. lfx/components/composio/firecrawl_composio.py +11 -0
  43. lfx/components/composio/fireflies_composio.py +11 -0
  44. lfx/components/composio/gmail_composio.py +1 -1
  45. lfx/components/composio/googlebigquery_composio.py +11 -0
  46. lfx/components/composio/googlecalendar_composio.py +1 -1
  47. lfx/components/composio/googledocs_composio.py +1 -1
  48. lfx/components/composio/googlemeet_composio.py +1 -1
  49. lfx/components/composio/googlesheets_composio.py +1 -1
  50. lfx/components/composio/googletasks_composio.py +1 -1
  51. lfx/components/composio/heygen_composio.py +11 -0
  52. lfx/components/composio/mem0_composio.py +11 -0
  53. lfx/components/composio/peopledatalabs_composio.py +11 -0
  54. lfx/components/composio/perplexityai_composio.py +11 -0
  55. lfx/components/composio/serpapi_composio.py +11 -0
  56. lfx/components/composio/slack_composio.py +3 -574
  57. lfx/components/composio/slackbot_composio.py +1 -1
  58. lfx/components/composio/snowflake_composio.py +11 -0
  59. lfx/components/composio/tavily_composio.py +11 -0
  60. lfx/components/composio/youtube_composio.py +2 -2
  61. lfx/components/cuga/__init__.py +34 -0
  62. lfx/components/cuga/cuga_agent.py +730 -0
  63. lfx/components/data/__init__.py +78 -28
  64. lfx/components/data_source/__init__.py +58 -0
  65. lfx/components/{data → data_source}/api_request.py +26 -3
  66. lfx/components/{data → data_source}/csv_to_data.py +15 -10
  67. lfx/components/{data → data_source}/json_to_data.py +15 -8
  68. lfx/components/{data → data_source}/news_search.py +1 -1
  69. lfx/components/{data → data_source}/rss.py +1 -1
  70. lfx/components/{data → data_source}/sql_executor.py +1 -1
  71. lfx/components/{data → data_source}/url.py +1 -1
  72. lfx/components/{data → data_source}/web_search.py +1 -1
  73. lfx/components/datastax/astradb_cql.py +1 -1
  74. lfx/components/datastax/astradb_graph.py +1 -1
  75. lfx/components/datastax/astradb_tool.py +1 -1
  76. lfx/components/datastax/astradb_vectorstore.py +1 -1
  77. lfx/components/datastax/hcd.py +1 -1
  78. lfx/components/deactivated/json_document_builder.py +1 -1
  79. lfx/components/docling/__init__.py +0 -3
  80. lfx/components/elastic/elasticsearch.py +1 -1
  81. lfx/components/elastic/opensearch_multimodal.py +1575 -0
  82. lfx/components/files_and_knowledge/__init__.py +47 -0
  83. lfx/components/{data → files_and_knowledge}/directory.py +1 -1
  84. lfx/components/{data → files_and_knowledge}/file.py +246 -18
  85. lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
  86. lfx/components/{data → files_and_knowledge}/save_file.py +142 -22
  87. lfx/components/flow_controls/__init__.py +58 -0
  88. lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
  89. lfx/components/{logic → flow_controls}/loop.py +43 -9
  90. lfx/components/flow_controls/run_flow.py +108 -0
  91. lfx/components/glean/glean_search_api.py +1 -1
  92. lfx/components/groq/groq.py +35 -28
  93. lfx/components/helpers/__init__.py +102 -0
  94. lfx/components/input_output/__init__.py +3 -1
  95. lfx/components/input_output/chat.py +4 -3
  96. lfx/components/input_output/chat_output.py +4 -4
  97. lfx/components/input_output/text.py +1 -1
  98. lfx/components/input_output/text_output.py +1 -1
  99. lfx/components/{data → input_output}/webhook.py +1 -1
  100. lfx/components/knowledge_bases/__init__.py +59 -4
  101. lfx/components/langchain_utilities/character.py +1 -1
  102. lfx/components/langchain_utilities/csv_agent.py +84 -16
  103. lfx/components/langchain_utilities/json_agent.py +67 -12
  104. lfx/components/langchain_utilities/language_recursive.py +1 -1
  105. lfx/components/llm_operations/__init__.py +46 -0
  106. lfx/components/{processing → llm_operations}/batch_run.py +1 -1
  107. lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
  108. lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
  109. lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
  110. lfx/components/{processing → llm_operations}/structured_output.py +1 -1
  111. lfx/components/logic/__init__.py +126 -0
  112. lfx/components/mem0/mem0_chat_memory.py +11 -0
  113. lfx/components/models/__init__.py +64 -9
  114. lfx/components/models_and_agents/__init__.py +49 -0
  115. lfx/components/{agents → models_and_agents}/agent.py +2 -2
  116. lfx/components/models_and_agents/embedding_model.py +423 -0
  117. lfx/components/models_and_agents/language_model.py +398 -0
  118. lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
  119. lfx/components/{helpers → models_and_agents}/memory.py +1 -1
  120. lfx/components/nvidia/system_assist.py +1 -1
  121. lfx/components/olivya/olivya.py +1 -1
  122. lfx/components/ollama/ollama.py +17 -3
  123. lfx/components/processing/__init__.py +9 -57
  124. lfx/components/processing/converter.py +1 -1
  125. lfx/components/processing/dataframe_operations.py +1 -1
  126. lfx/components/processing/parse_json_data.py +2 -2
  127. lfx/components/processing/parser.py +1 -1
  128. lfx/components/processing/split_text.py +1 -1
  129. lfx/components/qdrant/qdrant.py +1 -1
  130. lfx/components/redis/redis.py +1 -1
  131. lfx/components/twelvelabs/split_video.py +10 -0
  132. lfx/components/twelvelabs/video_file.py +12 -0
  133. lfx/components/utilities/__init__.py +43 -0
  134. lfx/components/{helpers → utilities}/calculator_core.py +1 -1
  135. lfx/components/{helpers → utilities}/current_date.py +1 -1
  136. lfx/components/{processing → utilities}/python_repl_core.py +1 -1
  137. lfx/components/vectorstores/local_db.py +9 -0
  138. lfx/components/youtube/youtube_transcripts.py +118 -30
  139. lfx/custom/custom_component/component.py +57 -1
  140. lfx/custom/custom_component/custom_component.py +68 -6
  141. lfx/graph/edge/base.py +43 -20
  142. lfx/graph/graph/base.py +4 -1
  143. lfx/graph/state/model.py +15 -2
  144. lfx/graph/utils.py +6 -0
  145. lfx/graph/vertex/base.py +4 -1
  146. lfx/graph/vertex/param_handler.py +10 -7
  147. lfx/helpers/__init__.py +12 -0
  148. lfx/helpers/flow.py +117 -0
  149. lfx/inputs/input_mixin.py +24 -1
  150. lfx/inputs/inputs.py +13 -1
  151. lfx/interface/components.py +161 -83
  152. lfx/log/logger.py +5 -3
  153. lfx/services/database/__init__.py +5 -0
  154. lfx/services/database/service.py +25 -0
  155. lfx/services/deps.py +87 -22
  156. lfx/services/manager.py +19 -6
  157. lfx/services/mcp_composer/service.py +998 -157
  158. lfx/services/session.py +5 -0
  159. lfx/services/settings/base.py +51 -7
  160. lfx/services/settings/constants.py +8 -0
  161. lfx/services/storage/local.py +76 -46
  162. lfx/services/storage/service.py +152 -29
  163. lfx/template/field/base.py +3 -0
  164. lfx/utils/ssrf_protection.py +384 -0
  165. lfx/utils/validate_cloud.py +26 -0
  166. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/METADATA +38 -22
  167. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/RECORD +182 -150
  168. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/WHEEL +1 -1
  169. lfx/components/agents/altk_agent.py +0 -366
  170. lfx/components/agents/cuga_agent.py +0 -1013
  171. lfx/components/docling/docling_remote_vlm.py +0 -284
  172. lfx/components/logic/run_flow.py +0 -71
  173. lfx/components/models/embedding_model.py +0 -195
  174. lfx/components/models/language_model.py +0 -144
  175. /lfx/components/{data → data_source}/mock_data.py +0 -0
  176. /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
  177. /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
  178. /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
  179. /lfx/components/{logic → flow_controls}/listen.py +0 -0
  180. /lfx/components/{logic → flow_controls}/notify.py +0 -0
  181. /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
  182. /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
  183. /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
  184. /lfx/components/{helpers → processing}/create_list.py +0 -0
  185. /lfx/components/{helpers → processing}/output_parser.py +0 -0
  186. /lfx/components/{helpers → processing}/store_message.py +0 -0
  187. /lfx/components/{helpers → utilities}/id_generator.py +0 -0
  188. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,7 @@ import ast
2
2
  import shutil
3
3
  import tarfile
4
4
  from abc import ABC, abstractmethod
5
+ from io import BytesIO
5
6
  from pathlib import Path
6
7
  from tempfile import TemporaryDirectory
7
8
  from typing import TYPE_CHECKING, Any
@@ -10,11 +11,14 @@ from zipfile import ZipFile, is_zipfile
10
11
  import orjson
11
12
  import pandas as pd
12
13
 
14
+ from lfx.base.data.storage_utils import get_file_size, read_file_bytes
13
15
  from lfx.custom.custom_component.component import Component
14
16
  from lfx.io import BoolInput, FileInput, HandleInput, Output, StrInput
15
17
  from lfx.schema.data import Data
16
18
  from lfx.schema.dataframe import DataFrame
17
19
  from lfx.schema.message import Message
20
+ from lfx.services.deps import get_settings_service
21
+ from lfx.utils.async_helpers import run_until_complete
18
22
  from lfx.utils.helpers import build_content_type_from_extension
19
23
 
20
24
  if TYPE_CHECKING:
@@ -27,6 +31,8 @@ class BaseFileComponent(Component, ABC):
27
31
  This class provides common functionality for resolving, validating, and
28
32
  processing file paths. Child classes must define valid file extensions
29
33
  and implement the `process_files` method.
34
+
35
+ # TODO: May want to subclass for local and remote files
30
36
  """
31
37
 
32
38
  class BaseFile:
@@ -251,12 +257,27 @@ class BaseFileComponent(Component, ABC):
251
257
 
252
258
  file_path = data_item.file_path
253
259
  file_path_obj = Path(file_path)
254
- file_size_stat = file_path_obj.stat()
255
260
  filename = file_path_obj.name
256
261
 
262
+ settings = get_settings_service().settings
263
+
264
+ # Get file size - use storage service for S3, filesystem for local
265
+ if settings.storage_type == "s3":
266
+ try:
267
+ file_size = get_file_size(file_path)
268
+ except (FileNotFoundError, ValueError):
269
+ # If we can't get file size, set to 0 or omit
270
+ file_size = 0
271
+ else:
272
+ try:
273
+ file_size_stat = file_path_obj.stat()
274
+ file_size = file_size_stat.st_size
275
+ except OSError:
276
+ file_size = 0
277
+
257
278
  # Basic file metadata
258
279
  metadata["filename"] = filename
259
- metadata["file_size"] = file_size_stat.st_size
280
+ metadata["file_size"] = file_size
260
281
 
261
282
  # Add MIME type from extension
262
283
  extension = filename.split(".")[-1]
@@ -321,7 +342,16 @@ class BaseFileComponent(Component, ABC):
321
342
  Message: Message containing file paths
322
343
  """
323
344
  files = self._validate_and_resolve_paths()
324
- paths = [file.path.as_posix() for file in files if file.path.exists()]
345
+ settings = get_settings_service().settings
346
+
347
+ # For S3 storage, paths are virtual storage keys that don't exist on the local filesystem.
348
+ # Skip the exists() check for S3 files to preserve them in the output.
349
+ # Validation of S3 file existence is deferred until file processing (see _validate_and_resolve_paths).
350
+ # If a file was removed from S3, it will fail when attempting to read/process it later.
351
+ if settings.storage_type == "s3":
352
+ paths = [file.path.as_posix() for file in files]
353
+ else:
354
+ paths = [file.path.as_posix() for file in files if file.path.exists()]
325
355
 
326
356
  return Message(text="\n".join(paths) if paths else "")
327
357
 
@@ -329,7 +359,29 @@ class BaseFileComponent(Component, ABC):
329
359
  if not file_path:
330
360
  return None
331
361
 
332
- # Map file extensions to pandas read functions with type annotation
362
+ # Get file extension in lowercase
363
+ ext = Path(file_path).suffix.lower()
364
+
365
+ settings = get_settings_service().settings
366
+
367
+ # For S3 storage, download file bytes first
368
+ if settings.storage_type == "s3":
369
+ # Download file content from S3
370
+ content = run_until_complete(read_file_bytes(file_path))
371
+
372
+ # Map file extensions to pandas read functions that support BytesIO
373
+ if ext == ".csv":
374
+ result = pd.read_csv(BytesIO(content))
375
+ elif ext == ".xlsx":
376
+ result = pd.read_excel(BytesIO(content))
377
+ elif ext == ".parquet":
378
+ result = pd.read_parquet(BytesIO(content))
379
+ else:
380
+ return None
381
+
382
+ return result.to_dict("records")
383
+
384
+ # Local storage - read directly from filesystem
333
385
  file_readers: dict[str, Callable[[str], pd.DataFrame]] = {
334
386
  ".csv": pd.read_csv,
335
387
  ".xlsx": pd.read_excel,
@@ -337,9 +389,6 @@ class BaseFileComponent(Component, ABC):
337
389
  # TODO: sqlite and json support?
338
390
  }
339
391
 
340
- # Get file extension in lowercase
341
- ext = Path(file_path).suffix.lower()
342
-
343
392
  # Get the appropriate reader function or None
344
393
  reader = file_readers.get(ext)
345
394
 
@@ -558,16 +607,26 @@ class BaseFileComponent(Component, ABC):
558
607
  resolved_files = []
559
608
 
560
609
  def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
561
- resolved_path = Path(self.resolve_path(str(path)))
562
-
563
- if not resolved_path.exists():
564
- msg = f"File or directory not found: {path}"
565
- self.log(msg)
566
- if not self.silent_errors:
567
- raise ValueError(msg)
568
- resolved_files.append(
569
- BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
570
- )
610
+ path_str = str(path)
611
+ settings = get_settings_service().settings
612
+
613
+ # When using object storage (S3), file paths are storage keys (e.g., "<flow_id>/<filename>")
614
+ # that don't exist on the local filesystem. We defer validation until file processing.
615
+ # For local storage, validate the file exists immediately to fail fast.
616
+ if settings.storage_type == "s3":
617
+ resolved_files.append(
618
+ BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
619
+ )
620
+ else:
621
+ resolved_path = Path(self.resolve_path(path_str))
622
+ if not resolved_path.exists():
623
+ msg = f"File or directory not found: {path}"
624
+ self.log(msg)
625
+ if not self.silent_errors:
626
+ raise ValueError(msg)
627
+ resolved_files.append(
628
+ BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
629
+ )
571
630
 
572
631
  file_path = self._file_path_as_list()
573
632
 
@@ -707,7 +766,7 @@ class BaseFileComponent(Component, ABC):
707
766
  raise ValueError(msg)
708
767
 
709
768
  def _filter_and_mark_files(self, files: list[BaseFile]) -> list[BaseFile]:
710
- """Validate file types and mark files for removal.
769
+ """Validate file types and filter out invalid files.
711
770
 
712
771
  Args:
713
772
  files (list[BaseFile]): List of BaseFile instances.
@@ -718,18 +777,26 @@ class BaseFileComponent(Component, ABC):
718
777
  Raises:
719
778
  ValueError: If unsupported files are encountered and `ignore_unsupported_extensions` is False.
720
779
  """
780
+ settings = get_settings_service().settings
781
+ is_s3_storage = settings.storage_type == "s3"
721
782
  final_files = []
722
783
  ignored_files = []
723
784
 
724
785
  for file in files:
725
- if not file.path.is_file():
786
+ # For local storage, verify the path is actually a file
787
+ # For S3 storage, paths are virtual keys that don't exist locally
788
+ if not is_s3_storage and not file.path.is_file():
726
789
  self.log(f"Not a file: {file.path.name}")
727
790
  continue
728
791
 
729
- if file.path.suffix[1:].lower() not in self.valid_extensions:
730
- if self.ignore_unsupported_extensions:
792
+ # Validate file extension
793
+ extension = file.path.suffix[1:].lower() if file.path.suffix else ""
794
+ if extension not in self.valid_extensions:
795
+ # For local storage, optionally ignore unsupported extensions
796
+ if not is_s3_storage and self.ignore_unsupported_extensions:
731
797
  ignored_files.append(file.path.name)
732
798
  continue
799
+
733
800
  msg = f"Unsupported file extension: {file.path.suffix}"
734
801
  self.log(msg)
735
802
  if not self.silent_errors:
@@ -0,0 +1,192 @@
1
+ """Storage-aware file utilities for components.
2
+
3
+ This module provides utilities that work with both local files and remote files
4
+ stored in the storage service.
5
+
6
+ TODO: Can abstract these into the storage service interface and update
7
+ implementations.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+
15
+ from lfx.services.deps import get_settings_service, get_storage_service
16
+ from lfx.utils.async_helpers import run_until_complete
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Callable
20
+
21
+ from lfx.services.storage.service import StorageService
22
+
23
# Constants for path parsing
EXPECTED_PATH_PARTS = 2  # Path format: "flow_id/filename"


def parse_storage_path(path: str) -> tuple[str, str] | None:
    """Split a storage-service path into its flow_id and filename parts.

    Storage keys follow the ``flow_id/filename`` convention; this helper is
    only meaningful when ``storage_type == "s3"``.

    Args:
        path: The storage service path in format "flow_id/filename"

    Returns:
        tuple[str, str] | None: (flow_id, filename) or None if invalid format
    """
    if not path or "/" not in path:
        return None

    pieces = path.split("/", 1)
    if len(pieces) != EXPECTED_PATH_PARTS:
        return None

    flow_id, filename = pieces
    if not flow_id or not filename:
        return None
    return flow_id, filename
47
+
48
+
49
async def read_file_bytes(
    file_path: str,
    storage_service: StorageService | None = None,
    resolve_path: Callable[[str], str] | None = None,
) -> bytes:
    """Load a file's raw bytes from the storage service or local filesystem.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
        storage_service: Optional storage service instance (fetched from deps when omitted)
        resolve_path: Optional function to resolve relative paths to absolute paths
            (typically Component.resolve_path). Only used for local storage.

    Returns:
        bytes: The file content

    Raises:
        ValueError: If an S3 key does not match the expected format
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        key_parts = parse_storage_path(file_path)
        if key_parts is None:
            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
            raise ValueError(msg)

        service = storage_service if storage_service is not None else get_storage_service()
        flow_id, filename = key_parts
        return await service.get_file(flow_id, filename)

    # Local storage: optionally resolve relative paths first.
    resolved = resolve_path(file_path) if resolve_path else file_path
    local_path = Path(resolved)
    if not local_path.exists():
        msg = f"File not found: {resolved}"
        raise FileNotFoundError(msg)

    return local_path.read_bytes()
92
+
93
+
94
async def read_file_text(
    file_path: str,
    encoding: str = "utf-8",
    storage_service: StorageService | None = None,
    resolve_path: Callable[[str], str] | None = None,
    newline: str | None = None,
) -> str:
    r"""Load a file's text from the storage service or local filesystem.

    Args:
        file_path: Path to the file (storage service path or local path)
        encoding: Text encoding used to decode the content
        storage_service: Optional storage service instance
        resolve_path: Optional function to resolve relative paths to absolute paths
            (typically Component.resolve_path). Only used for local storage.
        newline: Newline mode (None for default, "" for universal newlines like CSV).
            When set to "", normalizes all line endings to \\n for consistency.

    Returns:
        str: The file content as text

    Raises:
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        raw = await read_file_bytes(file_path, storage_service, resolve_path)
        decoded = raw.decode(encoding)
        if newline == "":
            # Mirror Python's universal-newline mode for S3 content.
            decoded = decoded.replace("\r\n", "\n").replace("\r", "\n")
        return decoded

    # Local storage: optionally resolve relative paths first.
    target = Path(resolve_path(file_path) if resolve_path else file_path)
    if newline is None:
        return target.read_text(encoding=encoding)
    with target.open(newline=newline, encoding=encoding) as handle:  # noqa: ASYNC230
        return handle.read()
137
+
138
+
139
def get_file_size(file_path: str, storage_service: StorageService | None = None) -> int:
    """Return a file's size in bytes from storage service or local filesystem.

    Note: This is a sync wrapper - for async code, use the storage service directly.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
        storage_service: Optional storage service instance

    Returns:
        int: File size in bytes

    Raises:
        ValueError: If an S3 key does not match the expected format
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        key_parts = parse_storage_path(file_path)
        if key_parts is None:
            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
            raise ValueError(msg)

        service = storage_service if storage_service is not None else get_storage_service()
        flow_id, filename = key_parts
        # Bridge the async storage API into this sync helper.
        return run_until_complete(service.get_file_size(flow_id, filename))

    # Local file system.
    local_path = Path(file_path)
    if not local_path.exists():
        msg = f"File not found: {file_path}"
        raise FileNotFoundError(msg)

    return local_path.stat().st_size
175
+
176
+
177
def file_exists(file_path: str, storage_service: StorageService | None = None) -> bool:
    """Report whether a file exists in the storage service or local filesystem.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
        storage_service: Optional storage service instance

    Returns:
        bool: True if the file exists
    """
    try:
        # Probing the size doubles as an existence check for both backends.
        get_file_size(file_path, storage_service)
    except (FileNotFoundError, ValueError):
        return False
    return True
lfx/base/data/utils.py CHANGED
@@ -1,14 +1,21 @@
1
+ import contextlib
2
+ import tempfile
1
3
  import unicodedata
2
4
  from collections.abc import Callable
3
5
  from concurrent import futures
6
+ from io import BytesIO
4
7
  from pathlib import Path
5
8
 
6
9
  import chardet
7
10
  import orjson
8
11
  import yaml
9
12
  from defusedxml import ElementTree
13
+ from pypdf import PdfReader
10
14
 
15
+ from lfx.base.data.storage_utils import read_file_bytes
11
16
  from lfx.schema.data import Data
17
+ from lfx.services.deps import get_settings_service
18
+ from lfx.utils.async_helpers import run_until_complete
12
19
 
13
20
  # Types of files that can be read simply by file.read()
14
21
  # and have 100% to be completely readable
@@ -36,6 +43,34 @@ TEXT_FILE_TYPES = [
36
43
  IMG_FILE_TYPES = ["jpg", "jpeg", "png", "bmp", "image"]
37
44
 
38
45
 
46
def parse_structured_text(text: str, file_path: str) -> str | dict | list:
    """Parse structured text formats (JSON, YAML, XML) selected by file suffix.

    Args:
        text: The text content to parse
        file_path: The file path (used to determine format)

    Returns:
        JSON: a normalized, re-serialized JSON string.
        YAML: the loaded Python object.
        XML: the re-serialized XML string.
        Any other suffix: the input text unchanged.
    """
    if file_path.endswith(".json"):
        parsed = orjson.loads(text)
        # Unicode-normalize only top-level string values/items.
        if isinstance(parsed, dict):
            parsed = {key: normalize_text(value) if isinstance(value, str) else value for key, value in parsed.items()}
        elif isinstance(parsed, list):
            parsed = [normalize_text(entry) if isinstance(entry, str) else entry for entry in parsed]
        return orjson.dumps(parsed).decode("utf-8")

    if file_path.endswith((".yaml", ".yml")):
        return yaml.safe_load(text)

    if file_path.endswith(".xml"):
        root = ElementTree.fromstring(text)
        return ElementTree.tostring(root, encoding="unicode")

    return text
72
+
73
+
39
74
def normalize_text(text):
    """Return *text* normalized to Unicode NFKD (compatibility-decomposed) form."""
    nfkd_form = unicodedata.normalize("NFKD", text)
    return nfkd_form
41
76
 
@@ -109,6 +144,14 @@ def partition_file_to_data(file_path: str, *, silent_errors: bool) -> Data | Non
109
144
 
110
145
 
111
146
  def read_text_file(file_path: str) -> str:
147
+ """Read a text file with automatic encoding detection.
148
+
149
+ Args:
150
+ file_path: Path to the file (local path only, not storage service path)
151
+
152
+ Returns:
153
+ str: The file content as text
154
+ """
112
155
  file_path_ = Path(file_path)
113
156
  raw_data = file_path_.read_bytes()
114
157
  result = chardet.detect(raw_data)
@@ -120,13 +163,90 @@ def read_text_file(file_path: str) -> str:
120
163
  return file_path_.read_text(encoding=encoding)
121
164
 
122
165
 
166
async def read_text_file_async(file_path: str) -> str:
    """Read a text file with automatic encoding detection (async, storage-aware).

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: The file content as text
    """
    from .storage_utils import read_file_bytes

    # Fetch raw bytes through the storage-aware helper (S3 or local).
    raw_data = await read_file_bytes(file_path)

    # Guess the encoding from the byte content.
    detected = chardet.detect(raw_data).get("encoding")

    # Fall back to utf-8 when detection fails (e.g., binary file) or reports
    # one of the encodings this helper deliberately overrides.
    if not detected or detected in {"Windows-1252", "Windows-1254", "MacRoman"}:
        detected = "utf-8"

    return raw_data.decode(detected, errors="replace")
189
+
190
+
123
191
def read_docx_file(file_path: str) -> str:
    """Read a DOCX file and extract text.

    Note: python-docx requires a file path, so this only works with local files.
    For storage service files, use read_docx_file_async which downloads to temp.

    Args:
        file_path: Path to the DOCX file (local path only)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    doc = Document(file_path)
    return "\n\n".join([p.text for p in doc.paragraphs])
128
207
 
129
208
 
209
async def read_docx_file_async(file_path: str) -> str:
    """Read a DOCX file and extract text (async, storage-aware).

    For S3 storage, downloads to temp file (python-docx requires file path).
    For local storage, reads directly.

    Args:
        file_path: Path to the DOCX file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    from .storage_utils import read_file_bytes

    settings = get_settings_service().settings

    if settings.storage_type == "local":
        # Local storage: python-docx can open the path directly.
        document = Document(file_path)
        return "\n\n".join([paragraph.text for paragraph in document.paragraphs])

    # S3 storage: spill the downloaded bytes into a temp file carrying the
    # original suffix, since python-docx needs a real path here.
    content = await read_file_bytes(file_path)
    suffix = Path(file_path.split("/")[-1]).suffix
    with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp_handle:
        tmp_handle.write(content)
        temp_path = tmp_handle.name

    try:
        document = Document(temp_path)
        return "\n\n".join([paragraph.text for paragraph in document.paragraphs])
    finally:
        # Best-effort cleanup of the temporary copy.
        with contextlib.suppress(Exception):
            Path(temp_path).unlink()
248
+
249
+
130
250
  def parse_pdf_to_text(file_path: str) -> str:
131
251
  from pypdf import PdfReader
132
252
 
@@ -134,7 +254,35 @@ def parse_pdf_to_text(file_path: str) -> str:
134
254
  return "\n\n".join([page.extract_text() for page in reader.pages])
135
255
 
136
256
 
257
async def parse_pdf_to_text_async(file_path: str) -> str:
    """Parse a PDF file to extract text (async, storage-aware).

    Reads the file through the storage-aware helper, so both local paths and
    S3 keys are supported.

    Args:
        file_path: Path to the PDF file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from all pages
    """
    raw = await read_file_bytes(file_path)
    with BytesIO(raw) as buffer, PdfReader(buffer) as reader:
        pages = [page.extract_text() for page in reader.pages]
    return "\n\n".join(pages)
271
+
272
+
137
273
  def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | None:
274
+ """Parse a text file to Data (sync version).
275
+
276
+ For S3 storage, this will use async operations to fetch the file.
277
+ For local storage, reads directly from filesystem.
278
+ """
279
+ settings = get_settings_service().settings
280
+
281
+ # If using S3 storage, we need to use async operations
282
+ if settings.storage_type == "s3":
283
+ # Run the async version safely (handles existing event loops)
284
+ return run_until_complete(parse_text_file_to_data_async(file_path, silent_errors=silent_errors))
285
+
138
286
  try:
139
287
  if file_path.endswith(".pdf"):
140
288
  text = parse_pdf_to_text(file_path)
@@ -143,20 +291,7 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
143
291
  else:
144
292
  text = read_text_file(file_path)
145
293
 
146
- # if file is json, yaml, or xml, we can parse it
147
- if file_path.endswith(".json"):
148
- loaded_json = orjson.loads(text)
149
- if isinstance(loaded_json, dict):
150
- loaded_json = {k: normalize_text(v) if isinstance(v, str) else v for k, v in loaded_json.items()}
151
- elif isinstance(loaded_json, list):
152
- loaded_json = [normalize_text(item) if isinstance(item, str) else item for item in loaded_json]
153
- text = orjson.dumps(loaded_json).decode("utf-8")
154
-
155
- elif file_path.endswith((".yaml", ".yml")):
156
- text = yaml.safe_load(text)
157
- elif file_path.endswith(".xml"):
158
- xml_element = ElementTree.fromstring(text)
159
- text = ElementTree.tostring(xml_element, encoding="unicode")
294
+ text = parse_structured_text(text, file_path)
160
295
  except Exception as e:
161
296
  if not silent_errors:
162
297
  msg = f"Error loading file {file_path}: {e}"
@@ -166,6 +301,35 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
166
301
  return Data(data={"file_path": file_path, "text": text})
167
302
 
168
303
 
304
async def parse_text_file_to_data_async(file_path: str, *, silent_errors: bool) -> Data | None:
    """Parse a text file to Data (async version, supports storage service).

    This version properly handles storage service files:
    - For text/JSON/YAML/XML: reads bytes directly (no temp file)
    - For PDF: reads bytes directly via BytesIO (no temp file)
    - For DOCX: downloads to temp file (python-docx requires file path)

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
        silent_errors: When True, return None instead of raising on failure.

    Returns:
        Data with ``file_path`` and ``text`` keys, or None on silent failure.

    Raises:
        ValueError: When parsing fails and ``silent_errors`` is False.
    """
    try:
        if file_path.endswith(".pdf"):
            content = await parse_pdf_to_text_async(file_path)
        elif file_path.endswith(".docx"):
            content = await read_docx_file_async(file_path)
        else:
            # Plain text files: read directly, no temp file needed.
            content = await read_text_file_async(file_path)

        # JSON / YAML / XML are normalized or re-serialized here.
        content = parse_structured_text(content, file_path)
        return Data(data={"file_path": file_path, "text": content})
    except Exception as e:
        if not silent_errors:
            msg = f"Error loading file {file_path}: {e}"
            raise ValueError(msg) from e
        return None
331
+
332
+
169
333
  # ! Removing unstructured dependency until
170
334
  # ! 3.12 is supported
171
335
  # def get_elements(