lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +9 -4
  3. lfx/base/agents/altk_base_agent.py +16 -3
  4. lfx/base/agents/altk_tool_wrappers.py +1 -1
  5. lfx/base/agents/utils.py +4 -0
  6. lfx/base/composio/composio_base.py +78 -41
  7. lfx/base/data/base_file.py +14 -4
  8. lfx/base/data/cloud_storage_utils.py +156 -0
  9. lfx/base/data/docling_utils.py +191 -65
  10. lfx/base/data/storage_utils.py +109 -0
  11. lfx/base/datastax/astradb_base.py +75 -64
  12. lfx/base/mcp/util.py +2 -2
  13. lfx/base/models/__init__.py +11 -1
  14. lfx/base/models/anthropic_constants.py +21 -12
  15. lfx/base/models/google_generative_ai_constants.py +33 -9
  16. lfx/base/models/model_metadata.py +6 -0
  17. lfx/base/models/ollama_constants.py +196 -30
  18. lfx/base/models/openai_constants.py +37 -10
  19. lfx/base/models/unified_models.py +1123 -0
  20. lfx/base/models/watsonx_constants.py +36 -0
  21. lfx/base/tools/component_tool.py +2 -9
  22. lfx/cli/commands.py +6 -1
  23. lfx/cli/run.py +65 -409
  24. lfx/cli/script_loader.py +13 -3
  25. lfx/components/__init__.py +0 -3
  26. lfx/components/composio/github_composio.py +1 -1
  27. lfx/components/cuga/cuga_agent.py +39 -27
  28. lfx/components/data_source/api_request.py +4 -2
  29. lfx/components/docling/__init__.py +45 -11
  30. lfx/components/docling/chunk_docling_document.py +3 -1
  31. lfx/components/docling/docling_inline.py +39 -49
  32. lfx/components/docling/export_docling_document.py +3 -1
  33. lfx/components/elastic/opensearch_multimodal.py +215 -57
  34. lfx/components/files_and_knowledge/file.py +439 -39
  35. lfx/components/files_and_knowledge/ingestion.py +8 -0
  36. lfx/components/files_and_knowledge/retrieval.py +10 -0
  37. lfx/components/files_and_knowledge/save_file.py +123 -53
  38. lfx/components/ibm/watsonx.py +7 -1
  39. lfx/components/input_output/chat_output.py +7 -1
  40. lfx/components/langchain_utilities/tool_calling.py +14 -6
  41. lfx/components/llm_operations/batch_run.py +80 -25
  42. lfx/components/llm_operations/lambda_filter.py +33 -6
  43. lfx/components/llm_operations/llm_conditional_router.py +39 -7
  44. lfx/components/llm_operations/structured_output.py +38 -12
  45. lfx/components/models/__init__.py +16 -74
  46. lfx/components/models_and_agents/agent.py +51 -201
  47. lfx/components/models_and_agents/embedding_model.py +185 -339
  48. lfx/components/models_and_agents/language_model.py +54 -318
  49. lfx/components/models_and_agents/mcp_component.py +58 -9
  50. lfx/components/ollama/ollama.py +9 -4
  51. lfx/components/ollama/ollama_embeddings.py +2 -1
  52. lfx/components/openai/openai_chat_model.py +1 -1
  53. lfx/components/processing/__init__.py +0 -3
  54. lfx/components/vllm/__init__.py +37 -0
  55. lfx/components/vllm/vllm.py +141 -0
  56. lfx/components/vllm/vllm_embeddings.py +110 -0
  57. lfx/custom/custom_component/custom_component.py +8 -6
  58. lfx/custom/directory_reader/directory_reader.py +5 -2
  59. lfx/graph/utils.py +64 -18
  60. lfx/inputs/__init__.py +2 -0
  61. lfx/inputs/input_mixin.py +54 -0
  62. lfx/inputs/inputs.py +115 -0
  63. lfx/interface/initialize/loading.py +42 -12
  64. lfx/io/__init__.py +2 -0
  65. lfx/run/__init__.py +5 -0
  66. lfx/run/base.py +494 -0
  67. lfx/schema/data.py +1 -1
  68. lfx/schema/image.py +28 -19
  69. lfx/schema/message.py +19 -3
  70. lfx/services/interfaces.py +5 -0
  71. lfx/services/manager.py +5 -4
  72. lfx/services/mcp_composer/service.py +45 -13
  73. lfx/services/settings/auth.py +18 -11
  74. lfx/services/settings/base.py +12 -24
  75. lfx/services/settings/constants.py +2 -0
  76. lfx/services/storage/local.py +37 -0
  77. lfx/services/storage/service.py +19 -0
  78. lfx/utils/constants.py +1 -0
  79. lfx/utils/image.py +29 -11
  80. lfx/utils/validate_cloud.py +14 -3
  81. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
  82. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
  83. lfx/components/processing/dataframe_to_toolset.py +0 -259
  84. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
  85. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
lfx/components/files_and_knowledge/file.py
@@ -21,15 +21,25 @@ from tempfile import NamedTemporaryFile
 from typing import Any
 
 from lfx.base.data.base_file import BaseFileComponent
-from lfx.base.data.storage_utils import parse_storage_path
+from lfx.base.data.storage_utils import parse_storage_path, read_file_bytes, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
+from lfx.inputs import SortableListInput
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
-from lfx.io import BoolInput, FileInput, IntInput, Output
+from lfx.io import BoolInput, FileInput, IntInput, Output, SecretStrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
 from lfx.services.deps import get_settings_service, get_storage_service
 from lfx.utils.async_helpers import run_until_complete
+from lfx.utils.validate_cloud import is_astra_cloud_environment
+
+
+def _get_storage_location_options():
+    """Get storage location options, filtering out Local if in Astra cloud environment."""
+    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
+    if is_astra_cloud_environment():
+        return all_options
+    return [{"name": "Local", "icon": "hard-drive"}, *all_options]
 
 
 class FileComponent(BaseFileComponent):
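Note: the options returned by this helper feed the new Storage Location selector added below. A minimal standalone sketch of its two output shapes; the environment check is stubbed here, since the real one lives in lfx/utils/validate_cloud.py and is not part of this diff:

```python
# Stub standing in for lfx.utils.validate_cloud.is_astra_cloud_environment;
# the real detection logic is not shown in this diff.
IN_ASTRA_CLOUD = False  # flip to True to simulate the Astra cloud environment


def is_astra_cloud_environment() -> bool:
    return IN_ASTRA_CLOUD


def _get_storage_location_options():
    """Mirror of the helper added above: Local is omitted in Astra cloud."""
    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
    if is_astra_cloud_environment():
        return all_options
    return [{"name": "Local", "icon": "hard-drive"}, *all_options]


print([opt["name"] for opt in _get_storage_location_options()])
# IN_ASTRA_CLOUD = False -> ['Local', 'AWS', 'Google Drive']
# IN_ASTRA_CLOUD = True  -> ['AWS', 'Google Drive']
```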
@@ -91,6 +101,15 @@ class FileComponent(BaseFileComponent):
                 break
 
     inputs = [
+        SortableListInput(
+            name="storage_location",
+            display_name="Storage Location",
+            placeholder="Select Location",
+            info="Choose where to read the file from.",
+            options=_get_storage_location_options(),
+            real_time_refresh=True,
+            limit=1,
+        ),
         *_base_inputs,
         StrInput(
             name="file_path_str",
@@ -104,6 +123,63 @@
             tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
             required=False,
         ),
+        # AWS S3 specific inputs
+        SecretStrInput(
+            name="aws_access_key_id",
+            display_name="AWS Access Key ID",
+            info="AWS Access key ID.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        SecretStrInput(
+            name="aws_secret_access_key",
+            display_name="AWS Secret Key",
+            info="AWS Secret Key.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="bucket_name",
+            display_name="S3 Bucket Name",
+            info="Enter the name of the S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="aws_region",
+            display_name="AWS Region",
+            info="AWS region (e.g., us-east-1, eu-west-1).",
+            show=False,
+            advanced=False,
+        ),
+        StrInput(
+            name="s3_file_key",
+            display_name="S3 File Key",
+            info="The key (path) of the file in S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        # Google Drive specific inputs
+        SecretStrInput(
+            name="service_account_key",
+            display_name="GCP Credentials Secret Key",
+            info="Your Google Cloud Platform service account JSON key as a secret string (complete JSON content).",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="file_id",
+            display_name="Google Drive File ID",
+            info=("The Google Drive file ID to read. The file must be shared with the service account email."),
+            show=False,
+            advanced=False,
+            required=True,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",
@@ -113,7 +189,8 @@
                 "Enable advanced document processing and export with Docling for PDFs, images, and office documents. "
                 "Note that advanced document processing can consume significant resources."
             ),
-            show=True,
+            # Disabled in cloud
+            show=not is_astra_cloud_environment(),
         ),
         DropdownInput(
             name="pipeline",
@@ -269,6 +346,20 @@
         """Return the list of currently selected file paths from the template."""
         return template.get("path", {}).get("file_path", [])
 
+    def _disable_docling_fields_in_cloud(self, build_config: dict[str, Any]) -> None:
+        """Disable all Docling-related fields in cloud environments."""
+        if "advanced_mode" in build_config:
+            build_config["advanced_mode"]["show"] = False
+            build_config["advanced_mode"]["value"] = False
+        # Hide all Docling-related fields
+        docling_fields = ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder")
+        for field in docling_fields:
+            if field in build_config:
+                build_config[field]["show"] = False
+        # Also disable OCR engine specifically
+        if "ocr_engine" in build_config:
+            build_config["ocr_engine"]["value"] = "None"
+
     def update_build_config(
         self,
         build_config: dict[str, Any],
@@ -276,28 +367,120 @@
         field_name: str | None = None,
     ) -> dict[str, Any]:
         """Show/hide Advanced Parser and related fields based on selection context."""
+        # Update storage location options dynamically based on cloud environment
+        if "storage_location" in build_config:
+            updated_options = _get_storage_location_options()
+            build_config["storage_location"]["options"] = updated_options
+
+        # Handle storage location selection
+        if field_name == "storage_location":
+            # Extract selected storage location
+            selected = [location["name"] for location in field_value] if isinstance(field_value, list) else []
+
+            # Hide all storage-specific fields first
+            storage_fields = [
+                "aws_access_key_id",
+                "aws_secret_access_key",
+                "bucket_name",
+                "aws_region",
+                "s3_file_key",
+                "service_account_key",
+                "file_id",
+            ]
+
+            for f_name in storage_fields:
+                if f_name in build_config:
+                    build_config[f_name]["show"] = False
+
+            # Show fields based on selected storage location
+            if len(selected) == 1:
+                location = selected[0]
+
+                if location == "Local":
+                    # Show file upload input for local storage
+                    if "path" in build_config:
+                        build_config["path"]["show"] = True
+
+                elif location == "AWS":
+                    # Hide file upload input, show AWS fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    aws_fields = [
+                        "aws_access_key_id",
+                        "aws_secret_access_key",
+                        "bucket_name",
+                        "aws_region",
+                        "s3_file_key",
+                    ]
+                    for f_name in aws_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+
+                elif location == "Google Drive":
+                    # Hide file upload input, show Google Drive fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    gdrive_fields = ["service_account_key", "file_id"]
+                    for f_name in gdrive_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+            # No storage location selected - show file upload by default
+            elif "path" in build_config:
+                build_config["path"]["show"] = True
+
+            return build_config
+
         if field_name == "path":
             paths = self._path_value(build_config)
 
-            # If all files can be processed by docling, do so
-            allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
-            build_config["advanced_mode"]["show"] = allow_advanced
-            if not allow_advanced:
-                build_config["advanced_mode"]["value"] = False
-                for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                    if f in build_config:
-                        build_config[f]["show"] = False
+            # Disable in cloud environments
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                # If all files can be processed by docling, do so
+                allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
+                build_config["advanced_mode"]["show"] = allow_advanced
+                if not allow_advanced:
+                    build_config["advanced_mode"]["value"] = False
+                    docling_fields = (
+                        "pipeline",
+                        "ocr_engine",
+                        "doc_key",
+                        "md_image_placeholder",
+                        "md_page_break_placeholder",
+                    )
+                    for field in docling_fields:
+                        if field in build_config:
+                            build_config[field]["show"] = False
 
         # Docling Processing
         elif field_name == "advanced_mode":
-            for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                if f in build_config:
-                    build_config[f]["show"] = bool(field_value)
-                    if f == "pipeline":
-                        build_config[f]["advanced"] = not bool(field_value)
+            # Disable in cloud environments - don't show Docling fields even if advanced_mode is toggled
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                docling_fields = (
+                    "pipeline",
+                    "ocr_engine",
+                    "doc_key",
+                    "md_image_placeholder",
+                    "md_page_break_placeholder",
+                )
+                for field in docling_fields:
+                    if field in build_config:
+                        build_config[field]["show"] = bool(field_value)
+                        if field == "pipeline":
+                            build_config[field]["advanced"] = not bool(field_value)
 
         elif field_name == "pipeline":
-            if field_value == "standard":
+            # Disable in cloud environments - don't show OCR engine even if pipeline is changed
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            elif field_value == "standard":
                 build_config["ocr_engine"]["show"] = True
                 build_config["ocr_engine"]["value"] = "easyocr"
             else:
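The storage_location branch above is plain dict manipulation, so its show/hide behavior can be checked in isolation. A simplified sketch follows; field names come from the diff, and the build_config entries are assumed to be dicts carrying a "show" flag, which is how the code indexes them:

```python
# Simplified reproduction of the storage_location branch of update_build_config.
STORAGE_FIELDS = [
    "aws_access_key_id", "aws_secret_access_key", "bucket_name",
    "aws_region", "s3_file_key", "service_account_key", "file_id",
]
AWS_FIELDS = STORAGE_FIELDS[:5]
GDRIVE_FIELDS = ["service_account_key", "file_id"]


def apply_storage_selection(build_config: dict, field_value: list) -> dict:
    selected = [loc["name"] for loc in field_value] if isinstance(field_value, list) else []
    for name in STORAGE_FIELDS:  # hide every storage-specific field first
        if name in build_config:
            build_config[name]["show"] = False
    if selected == ["AWS"]:
        build_config["path"]["show"] = False  # hide the file upload input
        for name in AWS_FIELDS:
            build_config[name]["show"] = True
    elif selected == ["Google Drive"]:
        build_config["path"]["show"] = False
        for name in GDRIVE_FIELDS:
            build_config[name]["show"] = True
    else:  # "Local" or nothing selected: fall back to the file upload input
        build_config["path"]["show"] = True
    return build_config


config = {name: {"show": False} for name in [*STORAGE_FIELDS, "path"]}
apply_storage_selection(config, [{"name": "AWS"}])
print([k for k, v in config.items() if v["show"]])
# ['aws_access_key_id', 'aws_secret_access_key', 'bucket_name', 'aws_region', 's3_file_key']
```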
@@ -368,15 +551,34 @@
 
     # ------------------------------ Core processing ----------------------------------
 
+    def _get_selected_storage_location(self) -> str:
+        """Get the selected storage location from the SortableListInput."""
+        if hasattr(self, "storage_location") and self.storage_location:
+            if isinstance(self.storage_location, list) and len(self.storage_location) > 0:
+                return self.storage_location[0].get("name", "")
+            if isinstance(self.storage_location, dict):
+                return self.storage_location.get("name", "")
+        return "Local"  # Default to Local if not specified
+
     def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
-        """Override to handle file_path_str input from tool mode.
+        """Override to handle file_path_str input from tool mode and cloud storage.
 
-        When called as a tool, the file_path_str parameter can be set.
-        If not provided, it will fall back to using the path FileInput (uploaded file).
         Priority:
-        1. file_path_str (if provided by the tool call)
-        2. path (uploaded file from UI)
+        1. Cloud storage (AWS/Google Drive) if selected
+        2. file_path_str (if provided by the tool call)
+        3. path (uploaded file from UI)
         """
+        storage_location = self._get_selected_storage_location()
+
+        # Handle AWS S3
+        if storage_location == "AWS":
+            return self._read_from_aws_s3()
+
+        # Handle Google Drive
+        if storage_location == "Google Drive":
+            return self._read_from_google_drive()
+
+        # Handle Local storage
         # Check if file_path_str is provided (from tool mode)
         file_path_str = getattr(self, "file_path_str", None)
         if file_path_str:
@@ -399,6 +601,101 @@
         # Otherwise use the default implementation (uses path FileInput)
         return super()._validate_and_resolve_paths()
 
+    def _read_from_aws_s3(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from AWS S3."""
+        from lfx.base.data.cloud_storage_utils import create_s3_client, validate_aws_credentials
+
+        # Validate AWS credentials
+        validate_aws_credentials(self)
+        if not getattr(self, "s3_file_key", None):
+            msg = "S3 File Key is required"
+            raise ValueError(msg)
+
+        # Create S3 client
+        s3_client = create_s3_client(self)
+
+        # Download file to temp location
+        import tempfile
+
+        # Get file extension from S3 key
+        file_extension = Path(self.s3_file_key).suffix or ""
+
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                s3_client.download_fileobj(self.bucket_name, self.s3_file_key, temp_file)
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from S3: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
+    def _read_from_google_drive(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from Google Drive."""
+        import tempfile
+
+        from googleapiclient.http import MediaIoBaseDownload
+
+        from lfx.base.data.cloud_storage_utils import create_google_drive_service
+
+        # Validate Google Drive credentials
+        if not getattr(self, "service_account_key", None):
+            msg = "GCP Credentials Secret Key is required for Google Drive storage"
+            raise ValueError(msg)
+        if not getattr(self, "file_id", None):
+            msg = "Google Drive File ID is required"
+            raise ValueError(msg)
+
+        # Create Google Drive service with read-only scope
+        drive_service = create_google_drive_service(
+            self.service_account_key, scopes=["https://www.googleapis.com/auth/drive.readonly"]
+        )
+
+        # Get file metadata to determine file name and extension
+        try:
+            file_metadata = drive_service.files().get(fileId=self.file_id, fields="name,mimeType").execute()
+            file_name = file_metadata.get("name", "download")
+        except Exception as e:
+            msg = (
+                f"Unable to access file with ID '{self.file_id}'. "
+                f"Error: {e!s}. "
+                "Please ensure: 1) The file ID is correct, 2) The file exists, "
+                "3) The service account has been granted access to this file."
+            )
+            raise ValueError(msg) from e
+
+        # Download file to temp location
+        file_extension = Path(file_name).suffix or ""
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                request = drive_service.files().get_media(fileId=self.file_id)
+                downloader = MediaIoBaseDownload(temp_file, request)
+                done = False
+                while not done:
+                    _status, done = downloader.next_chunk()
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from Google Drive: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (
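create_s3_client and validate_aws_credentials come from the new lfx/base/data/cloud_storage_utils.py (+156), which this diff does not show. Assuming the client is a standard boto3 S3 client, the download-to-temp-file pattern in _read_from_aws_s3 reduces to the following sketch; the bucket, key, and credentials are placeholders:

```python
import tempfile
from pathlib import Path

import boto3  # assumed dependency behind create_s3_client; not confirmed by this diff

# Placeholder values - substitute real credentials, bucket, and key.
s3_client = boto3.client(
    "s3",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="...",
    region_name="us-east-1",
)

s3_file_key = "reports/2024/summary.pdf"  # hypothetical key
file_extension = Path(s3_file_key).suffix or ""

# Same pattern as _read_from_aws_s3: keep the S3 key's extension on the temp
# file so downstream type detection (e.g., the Docling gating) still works.
with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
    s3_client.download_fileobj("my-bucket", s3_file_key, temp_file)
    temp_file_path = temp_file.name

print(f"Downloaded to {temp_file_path}")
```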
@@ -515,9 +812,6 @@
             ),
         }
 
-        self.log(f"Starting Docling subprocess for file: {local_file_path}")
-        self.log(args)
-
         # Child script for isolating the docling processing
         child_script = textwrap.dedent(
             r"""
@@ -707,7 +1001,7 @@
         )
 
         if not proc.stdout:
-            err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
+            err_msg = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else "no output from child process"
             return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})
 
         try:
@@ -722,9 +1016,16 @@
         )
 
         if not result.get("ok"):
-            return Data(data={"error": result.get("error", "Unknown Docling error"), **result.get("meta", {})})
+            error_msg = result.get("error", "Unknown Docling error")
+            # Override meta file_path with original_file_path to ensure correct path matching
+            meta = result.get("meta", {})
+            meta["file_path"] = original_file_path
+            return Data(data={"error": error_msg, **meta})
 
         meta = result.get("meta", {})
+        # Override meta file_path with original_file_path to ensure correct path matching
+        # The subprocess returns the temp file path, but we need the original S3/local path for rollup_data
+        meta["file_path"] = original_file_path
         if result.get("mode") == "markdown":
             exported_content = str(result.get("text", ""))
             return Data(
@@ -748,15 +1049,50 @@
             msg = "No files to process."
             raise ValueError(msg)
 
+        # Validate image files to detect content/extension mismatches
+        # This prevents API errors like "Image does not match the provided media type"
+        image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        settings = get_settings_service().settings
+        for file in file_list:
+            extension = file.path.suffix[1:].lower()
+            if extension in image_extensions:
+                # Read bytes based on storage type
+                try:
+                    if settings.storage_type == "s3":
+                        # For S3 storage, use storage service to read file bytes
+                        file_path_str = str(file.path)
+                        content = run_until_complete(read_file_bytes(file_path_str))
+                    else:
+                        # For local storage, read bytes directly from filesystem
+                        content = file.path.read_bytes()
+
+                    is_valid, error_msg = validate_image_content_type(
+                        str(file.path),
+                        content=content,
+                    )
+                    if not is_valid:
+                        self.log(error_msg)
+                        if not self.silent_errors:
+                            raise ValueError(error_msg)
+                except (OSError, FileNotFoundError) as e:
+                    self.log(f"Could not read file for validation: {e}")
+                    # Continue - let it fail later with better error
+
         # Validate that files requiring Docling are only processed when advanced mode is enabled
         if not self.advanced_mode:
             for file in file_list:
                 extension = file.path.suffix[1:].lower()
                 if extension in self.DOCLING_ONLY_EXTENSIONS:
-                    msg = (
-                        f"File '{file.path.name}' has extension '.{extension}' which requires "
-                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
-                    )
+                    if is_astra_cloud_environment():
+                        msg = (
+                            f"File '{file.path.name}' has extension '.{extension}' which requires "
+                            f"Advanced Parser mode. Advanced Parser is not available in cloud environments."
+                        )
+                    else:
+                        msg = (
+                            f"File '{file.path.name}' has extension '.{extension}' which requires "
+                            f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                        )
                     self.log(msg)
                     raise ValueError(msg)
 
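validate_image_content_type is defined in the new lfx/base/data/storage_utils.py (+109), which is not part of this diff; only its call shape is visible above (path plus raw bytes in, an (is_valid, error_msg) pair out). A hypothetical magic-byte check with that shape, for illustration only:

```python
# Sketch of a content/extension mismatch check in the spirit of
# validate_image_content_type; the real implementation is not shown in this diff.
_MAGIC_BYTES = {
    "png": b"\x89PNG\r\n\x1a\n",
    "gif": b"GIF8",
    "jpg": b"\xff\xd8\xff",
    "jpeg": b"\xff\xd8\xff",
    "bmp": b"BM",
}


def validate_image_content_type(file_path: str, content: bytes) -> tuple[bool, str | None]:
    extension = file_path.rsplit(".", 1)[-1].lower()
    magic = _MAGIC_BYTES.get(extension)
    if magic is None:
        return True, None  # extension not covered by this sketch
    if content.startswith(magic):
        return True, None
    return False, f"Content of '{file_path}' does not match its '.{extension}' extension"


ok, err = validate_image_content_type("photo.png", b"\xff\xd8\xff\xe0fake-jpeg-bytes")
print(ok, err)  # False plus a mismatch message: the bytes are JPEG, the name says PNG
```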
@@ -783,10 +1119,36 @@
             file_path = str(file.path)
             advanced_data: Data | None = self._process_docling_in_subprocess(file_path)
 
+            # Handle None case - Docling processing failed or returned None
+            if advanced_data is None:
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": "Docling processing returned no result. Check logs for details.",
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
+
+            # Check for errors first
+            if "error" in payload:
+                error_msg = payload.get("error", "Unknown error")
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": error_msg,
+                        **{k: v for k, v in payload.items() if k not in ("error", "file_path")},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             doc_rows = payload.get("doc")
-            if isinstance(doc_rows, list):
+            if isinstance(doc_rows, list) and doc_rows:
+                # Non-empty list of structured rows
                 rows: list[Data | None] = [
                     Data(
                         data={
@@ -796,10 +1158,31 @@
                     )
                     for item in doc_rows
                 ]
-                final_return.extend(self.rollup_data(file_list, rows))
+                final_return.extend(self.rollup_data([file], rows))
+            elif isinstance(doc_rows, list) and not doc_rows:
+                # Empty list - file was processed but no text content found
+                # Create a Data object indicating no content was extracted
+                self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                empty_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "text": "(No text content extracted from image)",
+                        "info": "Image processed successfully but contained no extractable text",
+                        **{k: v for k, v in payload.items() if k != "doc"},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
-                final_return.extend(self.rollup_data(file_list, [advanced_data]))
+                # Ensure file_path is set for proper rollup matching
+                if not payload.get("file_path"):
+                    payload["file_path"] = file_path
+                # Create new Data with file_path
+                advanced_data = Data(
+                    data=payload,
+                    text=getattr(advanced_data, "text", None),
+                )
+                final_return.extend(self.rollup_data([file], [advanced_data]))
         return final_return
 
     # Standard multi-file (or single non-advanced) path
@@ -820,13 +1203,17 @@
     def load_files_helper(self) -> DataFrame:
         result = self.load_files()
 
-        # Error condition - raise error if no text and an error is present
-        if not hasattr(result, "text"):
-            if hasattr(result, "error"):
-                raise ValueError(result.error[0])
+        # Result is a DataFrame - check if it has any rows
+        if result.empty:
             msg = "Could not extract content from the provided file(s)."
             raise ValueError(msg)
 
+        # Check for error column with error messages
+        if "error" in result.columns:
+            errors = result["error"].dropna().tolist()
+            if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                raise ValueError(errors[0])
+
         return result
 
     def load_files_dataframe(self) -> DataFrame:
@@ -838,4 +1225,17 @@
         """Load files using advanced Docling processing and export to Markdown format."""
         self.markdown = True
         result = self.load_files_helper()
-        return Message(text=str(result.text[0]))
+
+        # Result is a DataFrame - check for text or exported_content columns
+        if "text" in result.columns and not result["text"].isna().all():
+            text_values = result["text"].dropna().tolist()
+            if text_values:
+                return Message(text=str(text_values[0]))
+
+        if "exported_content" in result.columns and not result["exported_content"].isna().all():
+            content_values = result["exported_content"].dropna().tolist()
+            if content_values:
+                return Message(text=str(content_values[0]))
+
+        # Return empty message with info that no text was found
+        return Message(text="(No text content extracted from file)")
lfx/components/files_and_knowledge/ingestion.py
@@ -38,6 +38,7 @@ from lfx.services.deps import (
     get_variable_service,
     session_scope,
 )
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component
 
 if TYPE_CHECKING:
     from lfx.schema.dataframe import DataFrame
@@ -50,6 +51,9 @@ COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]
 
 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None
 
+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge ingestion is not supported in Astra cloud environment."
+
 
 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
@@ -540,6 +544,8 @@ class KnowledgeIngestionComponent(Component):
     # ---------------------------------------------------------------------
     async def build_kb_info(self) -> Data:
        """Main ingestion routine → returns a dict with KB metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
        try:
            input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df
            df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)
@@ -626,6 +632,8 @@
         field_name: str | None = None,
     ):
         """Update build configuration based on provider selection."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Create a new knowledge base
         if field_name == "knowledge_base":
             async with session_scope() as db:
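raise_error_if_astra_cloud_disable_component is added in lfx/utils/validate_cloud.py (+14 -3), which this diff does not include. From its two call sites above, a plausible shape of the guard is the following; both the environment-variable name and the detection logic are assumptions:

```python
import os


def is_astra_cloud_environment() -> bool:
    # Stand-in only: the real detection lives in lfx/utils/validate_cloud.py,
    # and the environment variable name here is an assumption.
    return os.getenv("ASTRA_CLOUD", "").lower() == "true"


def raise_error_if_astra_cloud_disable_component(message: str) -> None:
    """Plausible shape of the guard, inferred from its call sites in ingestion.py."""
    if is_astra_cloud_environment():
        raise ValueError(message)


raise_error_if_astra_cloud_disable_component(
    "Knowledge ingestion is not supported in Astra cloud environment."
)  # no-op locally; raises ValueError when the cloud check is true
```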