lfx-nightly 0.2.0.dev41__py3-none-any.whl → 0.3.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/__main__.py +137 -6
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +10 -6
- lfx/base/agents/altk_base_agent.py +5 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/events.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +130 -55
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/embeddings/embeddings_class.py +113 -0
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +43 -4
- lfx/base/prompts/api_utils.py +40 -5
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/__init__.py +10 -2
- lfx/cli/commands.py +3 -0
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +18 -7
- lfx/cli/validation.py +6 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/datastax/astradb_assistant_manager.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/docling_remote.py +1 -0
- lfx/components/elastic/opensearch_multimodal.py +1733 -0
- lfx/components/files_and_knowledge/file.py +384 -36
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +91 -88
- lfx/components/langchain_utilities/ibm_granite_handler.py +211 -0
- lfx/components/langchain_utilities/tool_calling.py +37 -6
- lfx/components/llm_operations/batch_run.py +64 -18
- lfx/components/llm_operations/lambda_filter.py +213 -101
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -203
- lfx/components/models_and_agents/embedding_model.py +171 -255
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +96 -10
- lfx/components/models_and_agents/prompt.py +105 -18
- lfx/components/ollama/ollama_embeddings.py +111 -29
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/text_operations.py +580 -0
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/component.py +65 -10
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/events/observability/__init__.py +0 -0
- lfx/events/observability/lifecycle_events.py +111 -0
- lfx/field_typing/__init__.py +57 -58
- lfx/graph/graph/base.py +40 -1
- lfx/graph/utils.py +109 -30
- lfx/graph/vertex/base.py +75 -23
- lfx/graph/vertex/vertex_types.py +0 -5
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +55 -0
- lfx/inputs/inputs.py +120 -0
- lfx/interface/components.py +24 -7
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +464 -0
- lfx/schema/__init__.py +50 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +26 -7
- lfx/schema/message.py +104 -11
- lfx/schema/workflow.py +171 -0
- lfx/services/deps.py +12 -0
- lfx/services/interfaces.py +43 -1
- lfx/services/mcp_composer/service.py +7 -1
- lfx/services/schema.py +1 -0
- lfx/services/settings/auth.py +95 -4
- lfx/services/settings/base.py +11 -1
- lfx/services/settings/constants.py +2 -0
- lfx/services/settings/utils.py +82 -0
- lfx/services/storage/local.py +13 -8
- lfx/services/transaction/__init__.py +5 -0
- lfx/services/transaction/service.py +35 -0
- lfx/tests/unit/components/__init__.py +0 -0
- lfx/utils/constants.py +2 -0
- lfx/utils/mustache_security.py +79 -0
- lfx/utils/validate_cloud.py +81 -3
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/METADATA +7 -2
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/RECORD +98 -80
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/entry_points.txt +0 -0
lfx/base/agents/agent.py
CHANGED
@@ -71,8 +71,8 @@ class LCAgentComponent(Component):
     ]

     outputs = [
-        Output(display_name="Agent", name="agent", method="build_agent", hidden=True, tool_mode=False),
         Output(display_name="Response", name="response", method="message_response"),
+        Output(display_name="Agent", name="agent", method="build_agent", tool_mode=False),
     ]

     # Get shared callbacks for tracing and save them to self.shared_callbacks
@@ -185,8 +185,10 @@ class LCAgentComponent(Component):
        if "input" not in input_dict:
            input_dict = {"input": self.input_value}

-        if
-
+        # Use enhanced prompt if available (set by IBM Granite handler), otherwise use original
+        system_prompt_to_use = getattr(self, "_effective_system_prompt", None) or self.system_prompt
+        if system_prompt_to_use and system_prompt_to_use.strip():
+            input_dict["system_prompt"] = system_prompt_to_use

        if hasattr(self, "chat_history") and self.chat_history:
            if isinstance(self.chat_history, Data):
@@ -272,9 +274,11 @@ class LCAgentComponent(Component):
                on_token_callback,
            )
        except ExceptionWithMessageError as e:
-            if
-
-
+            # Only delete message from database if it has an ID (was stored)
+            if hasattr(e, "agent_message"):
+                msg_id = e.agent_message.get_id()
+                if msg_id:
+                    await delete_message(id_=msg_id)
            await self._send_message_event(e.agent_message, category="remove_message")
            logger.error(f"ExceptionWithMessageError: {e}")
            raise
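Both agent hunks in this release apply the same guard: `delete_message` is only called when the failed message was actually persisted, i.e. when `get_id()` returns a non-empty ID. A minimal runnable sketch of the pattern; `Message` and `delete_message` here are illustrative stand-ins, not the real lfx types:

```python
# Sketch of the guarded-delete pattern from the hunks above; Message and
# delete_message are hypothetical stand-ins, not the lfx APIs.
import asyncio


class Message:
    def __init__(self, message_id: str | None = None):
        self._id = message_id

    def get_id(self) -> str | None:
        # Returns None when the message was never stored.
        return self._id


async def delete_message(id_: str) -> None:
    print(f"deleted {id_}")


async def cleanup(msg: Message) -> None:
    # Only delete from the database if the message has an ID (was stored).
    msg_id = msg.get_id()
    if msg_id:
        await delete_message(id_=msg_id)


asyncio.run(cleanup(Message()))       # no-op: message was never persisted
asyncio.run(cleanup(Message("abc")))  # prints "deleted abc"
```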
lfx/base/agents/altk_base_agent.py
CHANGED

@@ -378,9 +378,11 @@ class ALTKBaseAgentComponent(AgentComponent):
                cast("SendMessageFunctionType", self.send_message),
            )
        except ExceptionWithMessageError as e:
-            if
-
-
+            # Only delete message from database if it has an ID (was stored)
+            if hasattr(e, "agent_message"):
+                msg_id = e.agent_message.get_id()
+                if msg_id:
+                    await delete_message(id_=msg_id)
            await self._send_message_event(e.agent_message, category="remove_message")
            logger.error(f"ExceptionWithMessageError: {e}")
            raise
lfx/base/agents/altk_tool_wrappers.py
CHANGED

@@ -513,7 +513,7 @@ class PostToolProcessor(ALTKBaseTool):
        output = None
        try:
            output = middleware.process(input_data, AgentPhase.RUNTIME)
-        except
+        except Exception as e:  # noqa: BLE001
            logger.error(f"Exception in executing CodeGenerationComponent: {e}")
        if output is not None and hasattr(output, "result"):
            logger.info(f"Output of CodeGenerationComponent: {output.result}")
lfx/base/agents/events.py
CHANGED
@@ -388,7 +388,7 @@ async def process_agent_events(
    agent_message = await send_message_callback(message=agent_message)
    # Capture the original message id - this must stay consistent throughout if streaming
    # Message may not contain id if the Agent is not connected to a Chat Output (_should_skip_message is True)
-    initial_message_id = agent_message.
+    initial_message_id = agent_message.get_id()
    try:
        # Create a mapping of run_ids to tool contents
        tool_blocks_map: dict[str, ToolContent] = {}
lfx/base/agents/utils.py
CHANGED
@@ -224,6 +224,10 @@ def get_chat_output_sender_name(self) -> str | None:
    if not hasattr(self, "graph") or not self.graph:
        return None

+    # Check if graph has vertices attribute (PlaceholderGraph doesn't)
+    if not hasattr(self.graph, "vertices"):
+        return None
+
    for vertex in self.graph.vertices:
        # Safely check if vertex has data attribute, correct type, and raw_params
        if (
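The `utils.py` change guards against graph stand-ins (such as `PlaceholderGraph`) that expose no `vertices` attribute. A small sketch of the same defensive check, with illustrative classes in place of the real lfx graph types:

```python
# Sketch of the defensive attribute check above; these classes are
# illustrative stand-ins, not the lfx graph types.
class PlaceholderGraph:
    """Lightweight graph stand-in with no vertices attribute."""


class RealGraph:
    def __init__(self):
        self.vertices = ["chat_output_vertex"]


def first_vertex(graph) -> str | None:
    if not graph:
        return None
    # PlaceholderGraph has no `vertices`; bail out instead of raising.
    if not hasattr(graph, "vertices"):
        return None
    return graph.vertices[0] if graph.vertices else None


print(first_vertex(PlaceholderGraph()))  # None
print(first_vertex(RealGraph()))         # chat_output_vertex
```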
lfx/base/composio/composio_base.py
CHANGED

@@ -41,6 +41,58 @@ class ComposioBaseComponent(Component):

    default_tools_limit: int = 5

+    # Reserved attribute names that conflict with Component base class
+    RESERVED_ATTRIBUTES: set[str] = {
+        # Core component attributes
+        "name",
+        "description",
+        "status",
+        "display_name",
+        "icon",
+        "priority",
+        "code",
+        "inputs",
+        "outputs",
+        "selected_output",
+        # Properties and methods
+        "trace_type",
+        "trace_name",
+        "function",
+        "repr_value",
+        "field_config",
+        "field_order",
+        "frozen",
+        "build_parameters",
+        "cache",
+        "tools_metadata",
+        "vertex",
+        # User and session attributes
+        "user_id",  # Already handled separately but included for completeness
+        "session_id",
+        "flow_id",
+        "flow_name",
+        "context",
+        # Common method names
+        "build",
+        "run",
+        "stop",
+        "start",
+        "validate",
+        "get_function",
+        "set_attributes",
+        # Additional common conflicts
+        "id",
+        "type",
+        "value",
+        "metadata",
+        "logs",
+        "results",
+        "artifacts",
+        "parameters",
+        "template",
+        "config",
+    }
+
    _base_inputs = [
        MessageTextInput(
            name="entity_id",
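Consolidating every conflicting name into `RESERVED_ATTRIBUTES` lets all call sites share one rename rule instead of special-casing `user_id`, `status`, and `name` individually, as the removed code in the next hunks did. A compact sketch of the forward rename, using a trimmed-down reserved set for illustration:

```python
# Illustrative subset of the reserved-name prefixing; the real component
# uses the full RESERVED_ATTRIBUTES set shown in the diff above.
RESERVED_ATTRIBUTES: set[str] = {"name", "status", "user_id"}


def rename_if_reserved(clean_field: str, app_name: str) -> str:
    # Prefix with the app name to avoid clashing with Component attributes.
    if clean_field in RESERVED_ATTRIBUTES:
        return f"{app_name}_{clean_field}"
    return clean_field


assert rename_if_reserved("status", "github") == "github_status"
assert rename_if_reserved("title", "github") == "title"
```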
@@ -623,13 +675,9 @@ class ComposioBaseComponent(Component):
                attachment_related_found = True
                continue  # Skip individual attachment fields

-            # Handle
-            if clean_field == "user_id":
-                clean_field = f"{self.app_name}_user_id"
-
-            # Handle reserved attribute name conflicts (e.g., 'status', 'name')
+            # Handle reserved attribute name conflicts
            # Prefix with app name to prevent clashes with component attributes
-            if clean_field in
+            if clean_field in self.RESERVED_ATTRIBUTES:
                clean_field = f"{self.app_name}_{clean_field}"

            action_fields.append(clean_field)
@@ -795,28 +843,16 @@ class ComposioBaseComponent(Component):
                # Don't add individual attachment sub-fields to the schema
                continue

-            # Handle
-            if clean_field_name
-
+            # Handle reserved attribute name conflicts
+            if clean_field_name in self.RESERVED_ATTRIBUTES:
+                original_name = clean_field_name
+                clean_field_name = f"{self.app_name}_{clean_field_name}"
                # Update the field schema description to reflect the name change
                field_schema_copy = field_schema.copy()
+                original_description = field_schema.get("description", "")
                field_schema_copy["description"] = (
-                    f"
-                )
-            elif clean_field_name == "status":
-                clean_field_name = f"{self.app_name}_status"
-                # Update the field schema description to reflect the name change
-                field_schema_copy = field_schema.copy()
-                field_schema_copy["description"] = f"Status for {self.app_name.title()}: " + field_schema.get(
-                    "description", ""
-                )
-            elif clean_field_name == "name":
-                clean_field_name = f"{self.app_name}_name"
-                # Update the field schema description to reflect the name change
-                field_schema_copy = field_schema.copy()
-                field_schema_copy["description"] = f"Name for {self.app_name.title()}: " + field_schema.get(
-                    "description", ""
-                )
+                    f"{original_name.replace('_', ' ').title()} for {self.app_name.title()}: {original_description}"
+                ).strip()
            else:
                # Use the original field schema for all other fields
                field_schema_copy = field_schema
@@ -842,12 +878,8 @@ class ComposioBaseComponent(Component):
            cleaned_required = []
            for field in flat_schema["required"]:
                base = field.replace("[0]", "")
-                if base
-                    cleaned_required.append(f"{self.app_name}
-                elif base == "status":
-                    cleaned_required.append(f"{self.app_name}_status")
-                elif base == "name":
-                    cleaned_required.append(f"{self.app_name}_name")
+                if base in self.RESERVED_ATTRIBUTES:
+                    cleaned_required.append(f"{self.app_name}_{base}")
                else:
                    cleaned_required.append(base)
            flat_schema["required"] = cleaned_required
@@ -943,9 +975,10 @@ class ComposioBaseComponent(Component):
                inp.advanced = True

            # Skip entity_id being mapped to user_id parameter
-
-
-
+            # Check both original name and renamed version
+            if inp.name in {"user_id", f"{self.app_name}_user_id"} and getattr(
+                self, "entity_id", None
+            ) == getattr(inp, "value", None):
                continue

            processed_inputs.append(inp)
@@ -2422,12 +2455,11 @@ class ComposioBaseComponent(Component):

            # Handle renamed fields - map back to original names for API execution
            final_field_name = field
-            if
-
-
-
-
-                final_field_name = "name"
+            # Check if this is a renamed reserved attribute
+            if field.startswith(f"{self.app_name}_"):
+                potential_original = field[len(self.app_name) + 1 :]  # Remove app_name prefix
+                if potential_original in self.RESERVED_ATTRIBUTES:
+                    final_field_name = potential_original

            arguments[final_field_name] = value

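At execution time the rename is reversed, and only when the stripped suffix really is a reserved attribute, so a legitimate field that merely starts with the app name is left untouched. A sketch with the same trimmed-down set as before:

```python
# Sketch of the reverse mapping applied before API execution; the reserved
# set here is a trimmed illustration of the component's full set.
RESERVED_ATTRIBUTES: set[str] = {"name", "status", "user_id"}


def original_field_name(field: str, app_name: str) -> str:
    if field.startswith(f"{app_name}_"):
        potential_original = field[len(app_name) + 1 :]  # strip "<app>_"
        if potential_original in RESERVED_ATTRIBUTES:
            return potential_original
    return field


assert original_field_name("github_status", "github") == "status"
# Not a reserved attribute, so the prefixed name is kept as-is:
assert original_field_name("github_repo", "github") == "github_repo"
```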
@@ -2538,7 +2570,7 @@ class ComposioBaseComponent(Component):
        build_config[fname]["value"] = "" if fname not in self._bool_variables else False
        # Hide any other visible, non-protected fields that look like parameters
        protected = {
-
+            # Component control fields
            "entity_id",
            "api_key",
            "auth_link",
@@ -2570,6 +2602,11 @@ class ComposioBaseComponent(Component):
            "instance_url",
            "tenant_id",
        }
+        # Add all reserved Component attributes to protected set
+        protected.update(self.RESERVED_ATTRIBUTES)
+        # Also add the renamed versions (with app_name prefix) to protected set
+        for attr in self.RESERVED_ATTRIBUTES:
+            protected.add(f"{self.app_name}_{attr}")
        # Add all dynamic auth fields to protected set
        protected.update(self._auth_dynamic_fields)
        # Also protect any auth fields discovered across all instances
lfx/base/data/cloud_storage_utils.py
ADDED

@@ -0,0 +1,156 @@
+"""Shared utilities for cloud storage operations (AWS S3 and Google Drive).
+
+This module provides common functionality used by both read and write file components
+to avoid code duplication.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def validate_aws_credentials(component: Any) -> None:
+    """Validate that required AWS S3 credentials are present.
+
+    Args:
+        component: Component instance with AWS credential attributes
+
+    Raises:
+        ValueError: If any required credential is missing
+    """
+    if not getattr(component, "aws_access_key_id", None):
+        msg = "AWS Access Key ID is required for S3 storage"
+        raise ValueError(msg)
+    if not getattr(component, "aws_secret_access_key", None):
+        msg = "AWS Secret Key is required for S3 storage"
+        raise ValueError(msg)
+    if not getattr(component, "bucket_name", None):
+        msg = "S3 Bucket Name is required for S3 storage"
+        raise ValueError(msg)
+
+
+def create_s3_client(component: Any):
+    """Create and return a configured boto3 S3 client.
+
+    Args:
+        component: Component instance with AWS credential attributes
+
+    Returns:
+        boto3 S3 client instance
+
+    Raises:
+        ImportError: If boto3 is not installed
+    """
+    try:
+        import boto3
+    except ImportError as e:
+        msg = "boto3 is not installed. Please install it using `uv pip install boto3`."
+        raise ImportError(msg) from e
+
+    client_config = {
+        "aws_access_key_id": component.aws_access_key_id,
+        "aws_secret_access_key": component.aws_secret_access_key,
+    }
+
+    if hasattr(component, "aws_region") and component.aws_region:
+        client_config["region_name"] = component.aws_region
+
+    return boto3.client("s3", **client_config)
+
+
+def parse_google_service_account_key(service_account_key: str) -> dict:
+    """Parse Google service account JSON key with multiple fallback strategies.
+
+    This function handles various common formatting issues when users paste
+    service account keys, including:
+    - Control characters
+    - Extra whitespace
+    - Double-encoded JSON strings
+    - Escaped newlines in private_key field
+
+    Args:
+        service_account_key: Service account JSON key as string
+
+    Returns:
+        dict: Parsed service account credentials
+
+    Raises:
+        ValueError: If all parsing strategies fail
+    """
+    credentials_dict = None
+    parse_errors = []
+
+    # Strategy 1: Parse as-is with strict=False to allow control characters
+    try:
+        credentials_dict = json.loads(service_account_key, strict=False)
+    except json.JSONDecodeError as e:
+        parse_errors.append(f"Standard parse: {e!s}")
+
+    # Strategy 2: Strip whitespace and try again
+    if credentials_dict is None:
+        try:
+            cleaned_key = service_account_key.strip()
+            credentials_dict = json.loads(cleaned_key, strict=False)
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Stripped parse: {e!s}")
+
+    # Strategy 3: Check if it's double-encoded (JSON string of a JSON string)
+    if credentials_dict is None:
+        try:
+            decoded_once = json.loads(service_account_key, strict=False)
+            credentials_dict = json.loads(decoded_once, strict=False) if isinstance(decoded_once, str) else decoded_once
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Double-encoded parse: {e!s}")
+
+    # Strategy 4: Try to fix common issues with newlines in the private_key field
+    if credentials_dict is None:
+        try:
+            # Replace literal \n with actual newlines which is common in pasted JSON
+            fixed_key = service_account_key.replace("\\n", "\n")
+            credentials_dict = json.loads(fixed_key, strict=False)
+        except json.JSONDecodeError as e:
+            parse_errors.append(f"Newline-fixed parse: {e!s}")
+
+    if credentials_dict is None:
+        error_details = "; ".join(parse_errors)
+        msg = (
+            f"Unable to parse service account key JSON. Tried multiple strategies: {error_details}. "
+            "Please ensure you've copied the entire JSON content from your service account key file. "
+            "The JSON should start with '{' and contain fields like 'type', 'project_id', 'private_key', etc."
+        )
+        raise ValueError(msg)
+
+    return credentials_dict
+
+
+def create_google_drive_service(service_account_key: str, scopes: list[str], *, return_credentials: bool = False):
+    """Create and return a configured Google Drive API service.
+
+    Args:
+        service_account_key: Service account JSON key as string
+        scopes: List of Google API scopes to request
+        return_credentials: If True, return both service and credentials as tuple
+
+    Returns:
+        Google Drive API service instance, or tuple of (service, credentials) if return_credentials=True
+
+    Raises:
+        ImportError: If Google API client libraries are not installed
+        ValueError: If credentials cannot be parsed
+    """
+    try:
+        from google.oauth2 import service_account
+        from googleapiclient.discovery import build
+    except ImportError as e:
+        msg = "Google API client libraries are not installed. Please install them."
+        raise ImportError(msg) from e
+
+    credentials_dict = parse_google_service_account_key(service_account_key)
+
+    credentials = service_account.Credentials.from_service_account_info(credentials_dict, scopes=scopes)
+    service = build("drive", "v3", credentials=credentials)
+
+    if return_credentials:
+        return service, credentials
+    return service
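A short usage sketch for the new helpers, assuming the module is importable from the path shown in the file list and that `boto3` is installed; the component object and credential values below are placeholders, not working secrets:

```python
# Hypothetical caller for the new cloud_storage_utils helpers; the
# credential values are placeholders.
from types import SimpleNamespace

from lfx.base.data.cloud_storage_utils import (
    create_s3_client,
    validate_aws_credentials,
)

component = SimpleNamespace(
    aws_access_key_id="AKIA...",        # placeholder
    aws_secret_access_key="secret...",  # placeholder
    bucket_name="my-bucket",
    aws_region="us-east-1",
)

validate_aws_credentials(component)  # raises ValueError if anything is missing
s3 = create_s3_client(component)     # boto3 client scoped to the given region
```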
lfx/base/data/docling_utils.py
CHANGED
@@ -3,7 +3,7 @@ import signal
 import sys
 import traceback
 from contextlib import suppress
-from
+from functools import lru_cache

 from docling_core.types.doc import DoclingDocument
 from pydantic import BaseModel, SecretStr, TypeAdapter
@@ -12,9 +12,6 @@ from lfx.log.logger import logger
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame

-if TYPE_CHECKING:
-    from langchain_core.language_models.chat_models import BaseChatModel
-

 class DoclingDependencyError(Exception):
     """Custom exception for missing Docling dependencies."""
@@ -152,6 +149,81 @@ def _deserialize_pydantic_model(data: dict):
    return adapter.validate_python(data["config"])


+# Global cache for DocumentConverter instances
+# This cache persists across multiple runs and thread invocations
+@lru_cache(maxsize=4)
+def _get_cached_converter(
+    pipeline: str,
+    ocr_engine: str,
+    *,
+    do_picture_classification: bool,
+    pic_desc_config_hash: str | None,
+):
+    """Create and cache a DocumentConverter instance based on configuration.
+
+    This function uses LRU caching to maintain DocumentConverter instances in memory,
+    eliminating the 15-20 minute model loading time on subsequent runs.
+
+    Args:
+        pipeline: The pipeline type ("standard" or "vlm")
+        ocr_engine: The OCR engine to use
+        do_picture_classification: Whether to enable picture classification
+        pic_desc_config_hash: Hash of the picture description config (for cache key)
+
+    Returns:
+        A cached or newly created DocumentConverter instance
+    """
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
+    from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+    from docling.models.factories import get_ocr_factory
+    from docling.pipeline.vlm_pipeline import VlmPipeline
+
+    logger.info(f"Creating DocumentConverter for pipeline={pipeline}, ocr_engine={ocr_engine}")
+
+    # Configure the standard PDF pipeline
+    def _get_standard_opts() -> PdfPipelineOptions:
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+        if pipeline_options.do_ocr:
+            ocr_factory = get_ocr_factory(
+                allow_external_plugins=False,
+            )
+            ocr_options: OcrOptions = ocr_factory.create_options(
+                kind=ocr_engine,
+            )
+            pipeline_options.ocr_options = ocr_options
+
+        pipeline_options.do_picture_classification = do_picture_classification
+
+        # Note: pic_desc_config_hash is for cache key only
+        # Actual picture description is handled separately (non-cached path)
+        _ = pic_desc_config_hash  # Mark as intentionally unused
+
+        return pipeline_options
+
+    # Configure the VLM pipeline
+    def _get_vlm_opts() -> VlmPipelineOptions:
+        return VlmPipelineOptions()
+
+    if pipeline == "standard":
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=_get_standard_opts(),
+        )
+    elif pipeline == "vlm":
+        pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
+    else:
+        msg = f"Unknown pipeline: {pipeline!r}"
+        raise ValueError(msg)
+
+    format_options: dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
+    }
+
+    return DocumentConverter(format_options=format_options)
+
+
 def docling_worker(
     *,
     file_paths: list[str],
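The caching works because every argument in the `lru_cache` key is hashable (two strings, a bool, and an optional hash string); the expensive converter object itself never enters the key. A stripped-down illustration of the same pattern, with `ExpensiveConverter` standing in for docling's `DocumentConverter`:

```python
# Stripped-down illustration of the lru_cache pattern used above;
# ExpensiveConverter is a stand-in for docling's DocumentConverter.
from functools import lru_cache


class ExpensiveConverter:
    def __init__(self, pipeline: str, ocr_engine: str):
        print(f"loading models for {pipeline}/{ocr_engine} ...")  # the slow step


@lru_cache(maxsize=4)
def get_cached_converter(pipeline: str, ocr_engine: str) -> ExpensiveConverter:
    # All key arguments are hashable, so identical configs hit the cache.
    return ExpensiveConverter(pipeline, ocr_engine)


a = get_cached_converter("standard", "easyocr")  # builds (slow)
b = get_cached_converter("standard", "easyocr")  # cache hit (instant)
assert a is b
```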
@@ -162,7 +234,12 @@ def docling_worker(
    pic_desc_config: dict | None,
    pic_desc_prompt: str,
):
-    """Worker function for processing files with Docling
+    """Worker function for processing files with Docling using threading.
+
+    This function now uses a globally cached DocumentConverter instance,
+    significantly reducing processing time on subsequent runs from 15-20 minutes
+    to just seconds.
+    """
    # Signal handling for graceful shutdown
    shutdown_requested = False

@@ -205,12 +282,12 @@ def docling_worker(
    check_shutdown()

    try:
-        from docling.datamodel.base_models import ConversionStatus, InputFormat
-        from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
-        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-        from docling.models.factories import get_ocr_factory
-        from docling.pipeline.vlm_pipeline import VlmPipeline
-        from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+        from docling.datamodel.base_models import ConversionStatus, InputFormat  # noqa: F401
+        from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions  # noqa: F401
+        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption  # noqa: F401
+        from docling.models.factories import get_ocr_factory  # noqa: F401
+        from docling.pipeline.vlm_pipeline import VlmPipeline  # noqa: F401
+        from langchain_docling.picture_description import PictureDescriptionLangChainOptions  # noqa: F401

        # Check for shutdown after imports
        check_shutdown()
@@ -233,27 +310,34 @@ def docling_worker(
        queue.put({"error": "Worker interrupted during imports", "shutdown": True})
        return

-    #
-
+    # Use cached converter instead of creating new one each time
+    # This is the key optimization that eliminates 15-20 minute model load times
+    def _get_converter() -> DocumentConverter:
        check_shutdown()  # Check before heavy operations

-
-
-        if pipeline_options.do_ocr:
-            ocr_factory = get_ocr_factory(
-                allow_external_plugins=False,
-            )
-
-            ocr_options: OcrOptions = ocr_factory.create_options(
-                kind=ocr_engine,
-            )
-            pipeline_options.ocr_options = ocr_options
-
-        pipeline_options.do_picture_classification = do_picture_classification
-
+        # For now, we don't support pic_desc_config caching due to serialization complexity
+        # This is a known limitation that can be addressed in a future enhancement
        if pic_desc_config:
-
-
+            logger.warning(
+                "Picture description with LLM is not yet supported with cached converters. "
+                "Using non-cached converter for this request."
+            )
+            # Fall back to creating a new converter (old behavior)
+            from docling.datamodel.base_models import InputFormat
+            from docling.datamodel.pipeline_options import PdfPipelineOptions
+            from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+            from docling.models.factories import get_ocr_factory
+            from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+
+            pipeline_options = PdfPipelineOptions()
+            pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+            if pipeline_options.do_ocr:
+                ocr_factory = get_ocr_factory(allow_external_plugins=False)
+                ocr_options = ocr_factory.create_options(kind=ocr_engine)
+                pipeline_options.ocr_options = ocr_options
+
+            pipeline_options.do_picture_classification = do_picture_classification
+            pic_desc_llm = _deserialize_pydantic_model(pic_desc_config)
            logger.info("Docling enabling the picture description stage.")
            pipeline_options.do_picture_description = True
            pipeline_options.allow_external_plugins = True
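The hunk above falls back to an uncached converter whenever `pic_desc_config` is set, since an arbitrary dict cannot serve as an `lru_cache` key. One possible future fix, hinted at by the `pic_desc_config_hash` parameter, is a stable hash of the config; the sketch below is an assumption about such an enhancement, not code from the diff:

```python
# Hypothetical stable hash for a dict config so it could join an lru_cache
# key; not part of the diff, just one possible approach.
import hashlib
import json


def config_hash(config: dict | None) -> str | None:
    if config is None:
        return None
    # sort_keys makes the serialization order-independent.
    canonical = json.dumps(config, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode()).hexdigest()


assert config_hash({"model": "x", "temp": 0}) == config_hash({"temp": 0, "model": "x"})
```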
@@ -261,33 +345,24 @@ def docling_worker(
                llm=pic_desc_llm,
                prompt=pic_desc_prompt,
            )
-            return pipeline_options

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        raise ValueError(msg)
-
-        format_options: dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-
-        return DocumentConverter(format_options=format_options)
+            pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
+            format_options: dict[InputFormat, FormatOption] = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+            return DocumentConverter(format_options=format_options)
+
+        # Use cached converter - this is where the magic happens!
+        # First run: creates and caches converter (15-20 min)
+        # Subsequent runs: reuses cached converter (seconds)
+        pic_desc_config_hash = None  # Will be None since we checked above
+        return _get_cached_converter(
+            pipeline=pipeline,
+            ocr_engine=ocr_engine,
+            do_picture_classification=do_picture_classification,
+            pic_desc_config_hash=pic_desc_config_hash,
+        )

    try:
        # Check for shutdown before creating converter (can be slow)