lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +13 -1
  3. lfx/base/agents/altk_base_agent.py +380 -0
  4. lfx/base/agents/altk_tool_wrappers.py +565 -0
  5. lfx/base/agents/events.py +2 -1
  6. lfx/base/composio/composio_base.py +159 -224
  7. lfx/base/data/base_file.py +88 -21
  8. lfx/base/data/storage_utils.py +192 -0
  9. lfx/base/data/utils.py +178 -14
  10. lfx/base/embeddings/embeddings_class.py +113 -0
  11. lfx/base/models/groq_constants.py +74 -58
  12. lfx/base/models/groq_model_discovery.py +265 -0
  13. lfx/base/models/model.py +1 -1
  14. lfx/base/models/model_utils.py +100 -0
  15. lfx/base/models/openai_constants.py +7 -0
  16. lfx/base/models/watsonx_constants.py +32 -8
  17. lfx/base/tools/run_flow.py +601 -129
  18. lfx/cli/commands.py +6 -3
  19. lfx/cli/common.py +2 -2
  20. lfx/cli/run.py +1 -1
  21. lfx/cli/script_loader.py +53 -11
  22. lfx/components/Notion/create_page.py +1 -1
  23. lfx/components/Notion/list_database_properties.py +1 -1
  24. lfx/components/Notion/list_pages.py +1 -1
  25. lfx/components/Notion/list_users.py +1 -1
  26. lfx/components/Notion/page_content_viewer.py +1 -1
  27. lfx/components/Notion/search.py +1 -1
  28. lfx/components/Notion/update_page_property.py +1 -1
  29. lfx/components/__init__.py +19 -5
  30. lfx/components/{agents → altk}/__init__.py +5 -9
  31. lfx/components/altk/altk_agent.py +193 -0
  32. lfx/components/apify/apify_actor.py +1 -1
  33. lfx/components/composio/__init__.py +70 -18
  34. lfx/components/composio/apollo_composio.py +11 -0
  35. lfx/components/composio/bitbucket_composio.py +11 -0
  36. lfx/components/composio/canva_composio.py +11 -0
  37. lfx/components/composio/coda_composio.py +11 -0
  38. lfx/components/composio/composio_api.py +10 -0
  39. lfx/components/composio/discord_composio.py +1 -1
  40. lfx/components/composio/elevenlabs_composio.py +11 -0
  41. lfx/components/composio/exa_composio.py +11 -0
  42. lfx/components/composio/firecrawl_composio.py +11 -0
  43. lfx/components/composio/fireflies_composio.py +11 -0
  44. lfx/components/composio/gmail_composio.py +1 -1
  45. lfx/components/composio/googlebigquery_composio.py +11 -0
  46. lfx/components/composio/googlecalendar_composio.py +1 -1
  47. lfx/components/composio/googledocs_composio.py +1 -1
  48. lfx/components/composio/googlemeet_composio.py +1 -1
  49. lfx/components/composio/googlesheets_composio.py +1 -1
  50. lfx/components/composio/googletasks_composio.py +1 -1
  51. lfx/components/composio/heygen_composio.py +11 -0
  52. lfx/components/composio/mem0_composio.py +11 -0
  53. lfx/components/composio/peopledatalabs_composio.py +11 -0
  54. lfx/components/composio/perplexityai_composio.py +11 -0
  55. lfx/components/composio/serpapi_composio.py +11 -0
  56. lfx/components/composio/slack_composio.py +3 -574
  57. lfx/components/composio/slackbot_composio.py +1 -1
  58. lfx/components/composio/snowflake_composio.py +11 -0
  59. lfx/components/composio/tavily_composio.py +11 -0
  60. lfx/components/composio/youtube_composio.py +2 -2
  61. lfx/components/cuga/__init__.py +34 -0
  62. lfx/components/cuga/cuga_agent.py +730 -0
  63. lfx/components/data/__init__.py +78 -28
  64. lfx/components/data_source/__init__.py +58 -0
  65. lfx/components/{data → data_source}/api_request.py +26 -3
  66. lfx/components/{data → data_source}/csv_to_data.py +15 -10
  67. lfx/components/{data → data_source}/json_to_data.py +15 -8
  68. lfx/components/{data → data_source}/news_search.py +1 -1
  69. lfx/components/{data → data_source}/rss.py +1 -1
  70. lfx/components/{data → data_source}/sql_executor.py +1 -1
  71. lfx/components/{data → data_source}/url.py +1 -1
  72. lfx/components/{data → data_source}/web_search.py +1 -1
  73. lfx/components/datastax/astradb_cql.py +1 -1
  74. lfx/components/datastax/astradb_graph.py +1 -1
  75. lfx/components/datastax/astradb_tool.py +1 -1
  76. lfx/components/datastax/astradb_vectorstore.py +1 -1
  77. lfx/components/datastax/hcd.py +1 -1
  78. lfx/components/deactivated/json_document_builder.py +1 -1
  79. lfx/components/docling/__init__.py +0 -3
  80. lfx/components/elastic/elasticsearch.py +1 -1
  81. lfx/components/elastic/opensearch_multimodal.py +1575 -0
  82. lfx/components/files_and_knowledge/__init__.py +47 -0
  83. lfx/components/{data → files_and_knowledge}/directory.py +1 -1
  84. lfx/components/{data → files_and_knowledge}/file.py +246 -18
  85. lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
  86. lfx/components/{data → files_and_knowledge}/save_file.py +142 -22
  87. lfx/components/flow_controls/__init__.py +58 -0
  88. lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
  89. lfx/components/{logic → flow_controls}/loop.py +43 -9
  90. lfx/components/flow_controls/run_flow.py +108 -0
  91. lfx/components/glean/glean_search_api.py +1 -1
  92. lfx/components/groq/groq.py +35 -28
  93. lfx/components/helpers/__init__.py +102 -0
  94. lfx/components/input_output/__init__.py +3 -1
  95. lfx/components/input_output/chat.py +4 -3
  96. lfx/components/input_output/chat_output.py +4 -4
  97. lfx/components/input_output/text.py +1 -1
  98. lfx/components/input_output/text_output.py +1 -1
  99. lfx/components/{data → input_output}/webhook.py +1 -1
  100. lfx/components/knowledge_bases/__init__.py +59 -4
  101. lfx/components/langchain_utilities/character.py +1 -1
  102. lfx/components/langchain_utilities/csv_agent.py +84 -16
  103. lfx/components/langchain_utilities/json_agent.py +67 -12
  104. lfx/components/langchain_utilities/language_recursive.py +1 -1
  105. lfx/components/llm_operations/__init__.py +46 -0
  106. lfx/components/{processing → llm_operations}/batch_run.py +1 -1
  107. lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
  108. lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
  109. lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
  110. lfx/components/{processing → llm_operations}/structured_output.py +1 -1
  111. lfx/components/logic/__init__.py +126 -0
  112. lfx/components/mem0/mem0_chat_memory.py +11 -0
  113. lfx/components/models/__init__.py +64 -9
  114. lfx/components/models_and_agents/__init__.py +49 -0
  115. lfx/components/{agents → models_and_agents}/agent.py +2 -2
  116. lfx/components/models_and_agents/embedding_model.py +423 -0
  117. lfx/components/models_and_agents/language_model.py +398 -0
  118. lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
  119. lfx/components/{helpers → models_and_agents}/memory.py +1 -1
  120. lfx/components/nvidia/system_assist.py +1 -1
  121. lfx/components/olivya/olivya.py +1 -1
  122. lfx/components/ollama/ollama.py +17 -3
  123. lfx/components/processing/__init__.py +9 -57
  124. lfx/components/processing/converter.py +1 -1
  125. lfx/components/processing/dataframe_operations.py +1 -1
  126. lfx/components/processing/parse_json_data.py +2 -2
  127. lfx/components/processing/parser.py +1 -1
  128. lfx/components/processing/split_text.py +1 -1
  129. lfx/components/qdrant/qdrant.py +1 -1
  130. lfx/components/redis/redis.py +1 -1
  131. lfx/components/twelvelabs/split_video.py +10 -0
  132. lfx/components/twelvelabs/video_file.py +12 -0
  133. lfx/components/utilities/__init__.py +43 -0
  134. lfx/components/{helpers → utilities}/calculator_core.py +1 -1
  135. lfx/components/{helpers → utilities}/current_date.py +1 -1
  136. lfx/components/{processing → utilities}/python_repl_core.py +1 -1
  137. lfx/components/vectorstores/local_db.py +9 -0
  138. lfx/components/youtube/youtube_transcripts.py +118 -30
  139. lfx/custom/custom_component/component.py +57 -1
  140. lfx/custom/custom_component/custom_component.py +68 -6
  141. lfx/graph/edge/base.py +43 -20
  142. lfx/graph/graph/base.py +4 -1
  143. lfx/graph/state/model.py +15 -2
  144. lfx/graph/utils.py +6 -0
  145. lfx/graph/vertex/base.py +4 -1
  146. lfx/graph/vertex/param_handler.py +10 -7
  147. lfx/helpers/__init__.py +12 -0
  148. lfx/helpers/flow.py +117 -0
  149. lfx/inputs/input_mixin.py +24 -1
  150. lfx/inputs/inputs.py +13 -1
  151. lfx/interface/components.py +161 -83
  152. lfx/log/logger.py +5 -3
  153. lfx/services/database/__init__.py +5 -0
  154. lfx/services/database/service.py +25 -0
  155. lfx/services/deps.py +87 -22
  156. lfx/services/manager.py +19 -6
  157. lfx/services/mcp_composer/service.py +998 -157
  158. lfx/services/session.py +5 -0
  159. lfx/services/settings/base.py +51 -7
  160. lfx/services/settings/constants.py +8 -0
  161. lfx/services/storage/local.py +76 -46
  162. lfx/services/storage/service.py +152 -29
  163. lfx/template/field/base.py +3 -0
  164. lfx/utils/ssrf_protection.py +384 -0
  165. lfx/utils/validate_cloud.py +26 -0
  166. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/METADATA +38 -22
  167. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/RECORD +182 -150
  168. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/WHEEL +1 -1
  169. lfx/components/agents/altk_agent.py +0 -366
  170. lfx/components/agents/cuga_agent.py +0 -1013
  171. lfx/components/docling/docling_remote_vlm.py +0 -284
  172. lfx/components/logic/run_flow.py +0 -71
  173. lfx/components/models/embedding_model.py +0 -195
  174. lfx/components/models/language_model.py +0 -144
  175. /lfx/components/{data → data_source}/mock_data.py +0 -0
  176. /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
  177. /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
  178. /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
  179. /lfx/components/{logic → flow_controls}/listen.py +0 -0
  180. /lfx/components/{logic → flow_controls}/notify.py +0 -0
  181. /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
  182. /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
  183. /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
  184. /lfx/components/{helpers → processing}/create_list.py +0 -0
  185. /lfx/components/{helpers → processing}/output_parser.py +0 -0
  186. /lfx/components/{helpers → processing}/store_message.py +0 -0
  187. /lfx/components/{helpers → utilities}/id_generator.py +0 -0
  188. {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,7 @@ import ast
2
2
  import shutil
3
3
  import tarfile
4
4
  from abc import ABC, abstractmethod
5
+ from io import BytesIO
5
6
  from pathlib import Path
6
7
  from tempfile import TemporaryDirectory
7
8
  from typing import TYPE_CHECKING, Any
@@ -10,11 +11,14 @@ from zipfile import ZipFile, is_zipfile
10
11
  import orjson
11
12
  import pandas as pd
12
13
 
14
+ from lfx.base.data.storage_utils import get_file_size, read_file_bytes
13
15
  from lfx.custom.custom_component.component import Component
14
16
  from lfx.io import BoolInput, FileInput, HandleInput, Output, StrInput
15
17
  from lfx.schema.data import Data
16
18
  from lfx.schema.dataframe import DataFrame
17
19
  from lfx.schema.message import Message
20
+ from lfx.services.deps import get_settings_service
21
+ from lfx.utils.async_helpers import run_until_complete
18
22
  from lfx.utils.helpers import build_content_type_from_extension
19
23
 
20
24
  if TYPE_CHECKING:
@@ -27,6 +31,8 @@ class BaseFileComponent(Component, ABC):
27
31
  This class provides common functionality for resolving, validating, and
28
32
  processing file paths. Child classes must define valid file extensions
29
33
  and implement the `process_files` method.
34
+
35
+ # TODO: May want to subclass for local and remote files
30
36
  """
31
37
 
32
38
  class BaseFile:
@@ -251,12 +257,27 @@ class BaseFileComponent(Component, ABC):
251
257
 
252
258
  file_path = data_item.file_path
253
259
  file_path_obj = Path(file_path)
254
- file_size_stat = file_path_obj.stat()
255
260
  filename = file_path_obj.name
256
261
 
262
+ settings = get_settings_service().settings
263
+
264
+ # Get file size - use storage service for S3, filesystem for local
265
+ if settings.storage_type == "s3":
266
+ try:
267
+ file_size = get_file_size(file_path)
268
+ except (FileNotFoundError, ValueError):
269
+ # If we can't get file size, set to 0 or omit
270
+ file_size = 0
271
+ else:
272
+ try:
273
+ file_size_stat = file_path_obj.stat()
274
+ file_size = file_size_stat.st_size
275
+ except OSError:
276
+ file_size = 0
277
+
257
278
  # Basic file metadata
258
279
  metadata["filename"] = filename
259
- metadata["file_size"] = file_size_stat.st_size
280
+ metadata["file_size"] = file_size
260
281
 
261
282
  # Add MIME type from extension
262
283
  extension = filename.split(".")[-1]
@@ -321,7 +342,16 @@ class BaseFileComponent(Component, ABC):
321
342
  Message: Message containing file paths
322
343
  """
323
344
  files = self._validate_and_resolve_paths()
324
- paths = [file.path.as_posix() for file in files if file.path.exists()]
345
+ settings = get_settings_service().settings
346
+
347
+ # For S3 storage, paths are virtual storage keys that don't exist on the local filesystem.
348
+ # Skip the exists() check for S3 files to preserve them in the output.
349
+ # Validation of S3 file existence is deferred until file processing (see _validate_and_resolve_paths).
350
+ # If a file was removed from S3, it will fail when attempting to read/process it later.
351
+ if settings.storage_type == "s3":
352
+ paths = [file.path.as_posix() for file in files]
353
+ else:
354
+ paths = [file.path.as_posix() for file in files if file.path.exists()]
325
355
 
326
356
  return Message(text="\n".join(paths) if paths else "")
327
357
 
@@ -329,7 +359,29 @@ class BaseFileComponent(Component, ABC):
329
359
  if not file_path:
330
360
  return None
331
361
 
332
- # Map file extensions to pandas read functions with type annotation
362
+ # Get file extension in lowercase
363
+ ext = Path(file_path).suffix.lower()
364
+
365
+ settings = get_settings_service().settings
366
+
367
+ # For S3 storage, download file bytes first
368
+ if settings.storage_type == "s3":
369
+ # Download file content from S3
370
+ content = run_until_complete(read_file_bytes(file_path))
371
+
372
+ # Map file extensions to pandas read functions that support BytesIO
373
+ if ext == ".csv":
374
+ result = pd.read_csv(BytesIO(content))
375
+ elif ext == ".xlsx":
376
+ result = pd.read_excel(BytesIO(content))
377
+ elif ext == ".parquet":
378
+ result = pd.read_parquet(BytesIO(content))
379
+ else:
380
+ return None
381
+
382
+ return result.to_dict("records")
383
+
384
+ # Local storage - read directly from filesystem
333
385
  file_readers: dict[str, Callable[[str], pd.DataFrame]] = {
334
386
  ".csv": pd.read_csv,
335
387
  ".xlsx": pd.read_excel,
@@ -337,9 +389,6 @@ class BaseFileComponent(Component, ABC):
337
389
  # TODO: sqlite and json support?
338
390
  }
339
391
 
340
- # Get file extension in lowercase
341
- ext = Path(file_path).suffix.lower()
342
-
343
392
  # Get the appropriate reader function or None
344
393
  reader = file_readers.get(ext)
345
394
 
@@ -558,16 +607,26 @@ class BaseFileComponent(Component, ABC):
558
607
  resolved_files = []
559
608
 
560
609
  def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
561
- resolved_path = Path(self.resolve_path(str(path)))
562
-
563
- if not resolved_path.exists():
564
- msg = f"File or directory not found: {path}"
565
- self.log(msg)
566
- if not self.silent_errors:
567
- raise ValueError(msg)
568
- resolved_files.append(
569
- BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
570
- )
610
+ path_str = str(path)
611
+ settings = get_settings_service().settings
612
+
613
+ # When using object storage (S3), file paths are storage keys (e.g., "<flow_id>/<filename>")
614
+ # that don't exist on the local filesystem. We defer validation until file processing.
615
+ # For local storage, validate the file exists immediately to fail fast.
616
+ if settings.storage_type == "s3":
617
+ resolved_files.append(
618
+ BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
619
+ )
620
+ else:
621
+ resolved_path = Path(self.resolve_path(path_str))
622
+ if not resolved_path.exists():
623
+ msg = f"File or directory not found: {path}"
624
+ self.log(msg)
625
+ if not self.silent_errors:
626
+ raise ValueError(msg)
627
+ resolved_files.append(
628
+ BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
629
+ )
571
630
 
572
631
  file_path = self._file_path_as_list()
573
632
 
@@ -707,7 +766,7 @@ class BaseFileComponent(Component, ABC):
707
766
  raise ValueError(msg)
708
767
 
709
768
  def _filter_and_mark_files(self, files: list[BaseFile]) -> list[BaseFile]:
710
- """Validate file types and mark files for removal.
769
+ """Validate file types and filter out invalid files.
711
770
 
712
771
  Args:
713
772
  files (list[BaseFile]): List of BaseFile instances.
@@ -718,18 +777,26 @@ class BaseFileComponent(Component, ABC):
718
777
  Raises:
719
778
  ValueError: If unsupported files are encountered and `ignore_unsupported_extensions` is False.
720
779
  """
780
+ settings = get_settings_service().settings
781
+ is_s3_storage = settings.storage_type == "s3"
721
782
  final_files = []
722
783
  ignored_files = []
723
784
 
724
785
  for file in files:
725
- if not file.path.is_file():
786
+ # For local storage, verify the path is actually a file
787
+ # For S3 storage, paths are virtual keys that don't exist locally
788
+ if not is_s3_storage and not file.path.is_file():
726
789
  self.log(f"Not a file: {file.path.name}")
727
790
  continue
728
791
 
729
- if file.path.suffix[1:].lower() not in self.valid_extensions:
730
- if self.ignore_unsupported_extensions:
792
+ # Validate file extension
793
+ extension = file.path.suffix[1:].lower() if file.path.suffix else ""
794
+ if extension not in self.valid_extensions:
795
+ # For local storage, optionally ignore unsupported extensions
796
+ if not is_s3_storage and self.ignore_unsupported_extensions:
731
797
  ignored_files.append(file.path.name)
732
798
  continue
799
+
733
800
  msg = f"Unsupported file extension: {file.path.suffix}"
734
801
  self.log(msg)
735
802
  if not self.silent_errors:
@@ -0,0 +1,192 @@
1
+ """Storage-aware file utilities for components.
2
+
3
+ This module provides utilities that work with both local files and remote files
4
+ stored in the storage service.
5
+
6
+ TODO: Can abstract these into the storage service interface and update
7
+ implementations.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+
15
+ from lfx.services.deps import get_settings_service, get_storage_service
16
+ from lfx.utils.async_helpers import run_until_complete
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Callable
20
+
21
+ from lfx.services.storage.service import StorageService
22
+
23
# Constants for path parsing
EXPECTED_PATH_PARTS = 2  # Path format: "flow_id/filename"


def parse_storage_path(path: str) -> tuple[str, str] | None:
    """Split a storage-service path into its flow_id and filename parts.

    Storage keys follow the ``flow_id/filename`` convention; this helper is
    only meaningful when ``storage_type == "s3"``.

    Args:
        path: The storage service path in format "flow_id/filename"

    Returns:
        tuple[str, str] | None: (flow_id, filename) or None if invalid format
    """
    if not path or "/" not in path:
        return None

    pieces = path.split("/", 1)
    if len(pieces) != EXPECTED_PATH_PARTS:
        return None

    flow_id, filename = pieces
    if not flow_id or not filename:
        return None
    return flow_id, filename
47
+
48
+
49
async def read_file_bytes(
    file_path: str,
    storage_service: StorageService | None = None,
    resolve_path: Callable[[str], str] | None = None,
) -> bytes:
    """Load a file's raw bytes from the storage service or local filesystem.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
        storage_service: Optional storage service instance (fetched from deps when omitted)
        resolve_path: Optional function to resolve relative paths to absolute paths
            (typically Component.resolve_path). Only used for local storage.

    Returns:
        bytes: The file content

    Raises:
        ValueError: If an S3 key does not match the expected format
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        key_parts = parse_storage_path(file_path)
        if key_parts is None:
            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
            raise ValueError(msg)

        service = storage_service if storage_service is not None else get_storage_service()
        flow_id, filename = key_parts
        return await service.get_file(flow_id, filename)

    # Local storage: optionally resolve relative paths first.
    resolved = resolve_path(file_path) if resolve_path else file_path
    local_path = Path(resolved)
    if not local_path.exists():
        msg = f"File not found: {resolved}"
        raise FileNotFoundError(msg)

    return local_path.read_bytes()
92
+
93
+
94
async def read_file_text(
    file_path: str,
    encoding: str = "utf-8",
    storage_service: StorageService | None = None,
    resolve_path: Callable[[str], str] | None = None,
    newline: str | None = None,
) -> str:
    r"""Load a file's text from the storage service or local filesystem.

    Args:
        file_path: Path to the file (storage service path or local path)
        encoding: Text encoding used to decode the content
        storage_service: Optional storage service instance
        resolve_path: Optional function to resolve relative paths to absolute paths
            (typically Component.resolve_path). Only used for local storage.
        newline: Newline mode (None for default, "" for universal newlines like CSV).
            When set to "", normalizes all line endings to \\n for consistency.

    Returns:
        str: The file content as text

    Raises:
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        raw = await read_file_bytes(file_path, storage_service, resolve_path)
        decoded = raw.decode(encoding)
        if newline == "":
            # Mirror Python's universal-newline mode for S3 content.
            decoded = decoded.replace("\r\n", "\n").replace("\r", "\n")
        return decoded

    # Local storage: optionally resolve relative paths first.
    target = Path(resolve_path(file_path) if resolve_path else file_path)
    if newline is None:
        return target.read_text(encoding=encoding)
    with target.open(newline=newline, encoding=encoding) as handle:  # noqa: ASYNC230
        return handle.read()
137
+
138
+
139
def get_file_size(file_path: str, storage_service: StorageService | None = None) -> int:
    """Return a file's size in bytes from storage service or local filesystem.

    Note: This is a sync wrapper - for async code, use the storage service directly.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
        storage_service: Optional storage service instance

    Returns:
        int: File size in bytes

    Raises:
        ValueError: If an S3 key does not match the expected format
        FileNotFoundError: If the file doesn't exist
    """
    settings = get_settings_service().settings

    if settings.storage_type == "s3":
        key_parts = parse_storage_path(file_path)
        if key_parts is None:
            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
            raise ValueError(msg)

        service = storage_service if storage_service is not None else get_storage_service()
        flow_id, filename = key_parts
        # Bridge the async storage API into this sync helper.
        return run_until_complete(service.get_file_size(flow_id, filename))

    # Local file system.
    local_path = Path(file_path)
    if not local_path.exists():
        msg = f"File not found: {file_path}"
        raise FileNotFoundError(msg)

    return local_path.stat().st_size
175
+
176
+
177
def file_exists(file_path: str, storage_service: StorageService | None = None) -> bool:
    """Report whether a file exists in the storage service or local filesystem.

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
        storage_service: Optional storage service instance

    Returns:
        bool: True if the file exists
    """
    try:
        # Probing the size doubles as an existence check for both backends.
        get_file_size(file_path, storage_service)
    except (FileNotFoundError, ValueError):
        return False
    return True
lfx/base/data/utils.py CHANGED
@@ -1,14 +1,21 @@
1
+ import contextlib
2
+ import tempfile
1
3
  import unicodedata
2
4
  from collections.abc import Callable
3
5
  from concurrent import futures
6
+ from io import BytesIO
4
7
  from pathlib import Path
5
8
 
6
9
  import chardet
7
10
  import orjson
8
11
  import yaml
9
12
  from defusedxml import ElementTree
13
+ from pypdf import PdfReader
10
14
 
15
+ from lfx.base.data.storage_utils import read_file_bytes
11
16
  from lfx.schema.data import Data
17
+ from lfx.services.deps import get_settings_service
18
+ from lfx.utils.async_helpers import run_until_complete
12
19
 
13
20
  # Types of files that can be read simply by file.read()
14
21
  # and have 100% to be completely readable
@@ -36,6 +43,34 @@ TEXT_FILE_TYPES = [
36
43
  IMG_FILE_TYPES = ["jpg", "jpeg", "png", "bmp", "image"]
37
44
 
38
45
 
46
def parse_structured_text(text: str, file_path: str) -> str | dict | list:
    """Parse structured text formats (JSON, YAML, XML) selected by file suffix.

    Args:
        text: The text content to parse
        file_path: The file path (used to determine format)

    Returns:
        JSON: a normalized, re-serialized JSON string.
        YAML: the loaded Python object.
        XML: the re-serialized XML string.
        Any other suffix: the input text unchanged.
    """
    if file_path.endswith(".json"):
        parsed = orjson.loads(text)
        # Unicode-normalize only top-level string values/items.
        if isinstance(parsed, dict):
            parsed = {key: normalize_text(value) if isinstance(value, str) else value for key, value in parsed.items()}
        elif isinstance(parsed, list):
            parsed = [normalize_text(entry) if isinstance(entry, str) else entry for entry in parsed]
        return orjson.dumps(parsed).decode("utf-8")

    if file_path.endswith((".yaml", ".yml")):
        return yaml.safe_load(text)

    if file_path.endswith(".xml"):
        root = ElementTree.fromstring(text)
        return ElementTree.tostring(root, encoding="unicode")

    return text
72
+
73
+
39
74
def normalize_text(text):
    """Return *text* normalized to Unicode NFKD (compatibility-decomposed) form."""
    nfkd_form = unicodedata.normalize("NFKD", text)
    return nfkd_form
41
76
 
@@ -109,6 +144,14 @@ def partition_file_to_data(file_path: str, *, silent_errors: bool) -> Data | Non
109
144
 
110
145
 
111
146
  def read_text_file(file_path: str) -> str:
147
+ """Read a text file with automatic encoding detection.
148
+
149
+ Args:
150
+ file_path: Path to the file (local path only, not storage service path)
151
+
152
+ Returns:
153
+ str: The file content as text
154
+ """
112
155
  file_path_ = Path(file_path)
113
156
  raw_data = file_path_.read_bytes()
114
157
  result = chardet.detect(raw_data)
@@ -120,13 +163,90 @@ def read_text_file(file_path: str) -> str:
120
163
  return file_path_.read_text(encoding=encoding)
121
164
 
122
165
 
166
async def read_text_file_async(file_path: str) -> str:
    """Read a text file with automatic encoding detection (async, storage-aware).

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: The file content as text
    """
    from .storage_utils import read_file_bytes

    # Fetch raw bytes through the storage-aware helper (S3 or local).
    raw_data = await read_file_bytes(file_path)

    # Guess the encoding from the byte content.
    detected = chardet.detect(raw_data).get("encoding")

    # Fall back to utf-8 when detection fails (e.g., binary file) or reports
    # one of the encodings this helper deliberately overrides.
    if not detected or detected in {"Windows-1252", "Windows-1254", "MacRoman"}:
        detected = "utf-8"

    return raw_data.decode(detected, errors="replace")
189
+
190
+
123
191
def read_docx_file(file_path: str) -> str:
    """Read a DOCX file and extract text.

    Note: python-docx requires a file path, so this only works with local files.
    For storage service files, use read_docx_file_async which downloads to temp.

    Args:
        file_path: Path to the DOCX file (local path only)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    doc = Document(file_path)
    return "\n\n".join([p.text for p in doc.paragraphs])
128
207
 
129
208
 
209
async def read_docx_file_async(file_path: str) -> str:
    """Read a DOCX file and extract text (async, storage-aware).

    For S3 storage, downloads to temp file (python-docx requires file path).
    For local storage, reads directly.

    Args:
        file_path: Path to the DOCX file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    from .storage_utils import read_file_bytes

    settings = get_settings_service().settings

    if settings.storage_type == "local":
        # Local storage: python-docx can open the path directly.
        document = Document(file_path)
        return "\n\n".join([paragraph.text for paragraph in document.paragraphs])

    # S3 storage: spill the downloaded bytes into a temp file carrying the
    # original suffix, since python-docx needs a real path here.
    content = await read_file_bytes(file_path)
    suffix = Path(file_path.split("/")[-1]).suffix
    with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp_handle:
        tmp_handle.write(content)
        temp_path = tmp_handle.name

    try:
        document = Document(temp_path)
        return "\n\n".join([paragraph.text for paragraph in document.paragraphs])
    finally:
        # Best-effort cleanup of the temporary copy.
        with contextlib.suppress(Exception):
            Path(temp_path).unlink()
248
+
249
+
130
250
  def parse_pdf_to_text(file_path: str) -> str:
131
251
  from pypdf import PdfReader
132
252
 
@@ -134,7 +254,35 @@ def parse_pdf_to_text(file_path: str) -> str:
134
254
  return "\n\n".join([page.extract_text() for page in reader.pages])
135
255
 
136
256
 
257
async def parse_pdf_to_text_async(file_path: str) -> str:
    """Parse a PDF file to extract text (async, storage-aware).

    Reads the file through the storage-aware helper, so both local paths and
    S3 keys are supported.

    Args:
        file_path: Path to the PDF file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from all pages
    """
    raw = await read_file_bytes(file_path)
    with BytesIO(raw) as buffer, PdfReader(buffer) as reader:
        pages = [page.extract_text() for page in reader.pages]
    return "\n\n".join(pages)
271
+
272
+
137
273
  def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | None:
274
+ """Parse a text file to Data (sync version).
275
+
276
+ For S3 storage, this will use async operations to fetch the file.
277
+ For local storage, reads directly from filesystem.
278
+ """
279
+ settings = get_settings_service().settings
280
+
281
+ # If using S3 storage, we need to use async operations
282
+ if settings.storage_type == "s3":
283
+ # Run the async version safely (handles existing event loops)
284
+ return run_until_complete(parse_text_file_to_data_async(file_path, silent_errors=silent_errors))
285
+
138
286
  try:
139
287
  if file_path.endswith(".pdf"):
140
288
  text = parse_pdf_to_text(file_path)
@@ -143,20 +291,7 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
143
291
  else:
144
292
  text = read_text_file(file_path)
145
293
 
146
- # if file is json, yaml, or xml, we can parse it
147
- if file_path.endswith(".json"):
148
- loaded_json = orjson.loads(text)
149
- if isinstance(loaded_json, dict):
150
- loaded_json = {k: normalize_text(v) if isinstance(v, str) else v for k, v in loaded_json.items()}
151
- elif isinstance(loaded_json, list):
152
- loaded_json = [normalize_text(item) if isinstance(item, str) else item for item in loaded_json]
153
- text = orjson.dumps(loaded_json).decode("utf-8")
154
-
155
- elif file_path.endswith((".yaml", ".yml")):
156
- text = yaml.safe_load(text)
157
- elif file_path.endswith(".xml"):
158
- xml_element = ElementTree.fromstring(text)
159
- text = ElementTree.tostring(xml_element, encoding="unicode")
294
+ text = parse_structured_text(text, file_path)
160
295
  except Exception as e:
161
296
  if not silent_errors:
162
297
  msg = f"Error loading file {file_path}: {e}"
@@ -166,6 +301,35 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
166
301
  return Data(data={"file_path": file_path, "text": text})
167
302
 
168
303
 
304
async def parse_text_file_to_data_async(file_path: str, *, silent_errors: bool) -> Data | None:
    """Parse a text file to Data (async version, supports storage service).

    This version properly handles storage service files:
    - For text/JSON/YAML/XML: reads bytes directly (no temp file)
    - For PDF: reads bytes directly via BytesIO (no temp file)
    - For DOCX: downloads to temp file (python-docx requires file path)

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
        silent_errors: When True, return None instead of raising on failure.

    Returns:
        Data with ``file_path`` and ``text`` keys, or None on silent failure.

    Raises:
        ValueError: When parsing fails and ``silent_errors`` is False.
    """
    try:
        if file_path.endswith(".pdf"):
            content = await parse_pdf_to_text_async(file_path)
        elif file_path.endswith(".docx"):
            content = await read_docx_file_async(file_path)
        else:
            # Plain text files: read directly, no temp file needed.
            content = await read_text_file_async(file_path)

        # JSON / YAML / XML are normalized or re-serialized here.
        content = parse_structured_text(content, file_path)
        return Data(data={"file_path": file_path, "text": content})
    except Exception as e:
        if not silent_errors:
            msg = f"Error loading file {file_path}: {e}"
            raise ValueError(msg) from e
        return None
331
+
332
+
169
333
  # ! Removing unstructured dependency until
170
334
  # ! 3.12 is supported
171
335
  # def get_elements(