nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/extract/pptx/pptx_extractor.py
@@ -0,0 +1,210 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import base64
+ import functools
+ import io
+ import logging
+ from typing import Any, Optional, Dict, Union, Tuple
+
+ import pandas as pd
+ from pydantic import BaseModel
+
+ from nv_ingest_api.internal.extract.pdf.engines.pdfium import pdfium_extractor
+ from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import convert_stream_with_libreoffice
+ from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import python_pptx
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+ logger = logging.getLogger(__name__)
+
+
+ def _prepare_task_properties(
+     base64_row: pd.Series, task_props: Union[Dict[str, Any], BaseModel]
+ ) -> Tuple[Dict[str, Any], Optional[str]]:
+     """
+     Prepare and return the task properties dictionary and source identifier from a DataFrame row.
+
+     This function converts task properties to a dictionary (if provided as a Pydantic model),
+     extracts row data (excluding the "content" field), and stores it under the "row_data" key within
+     the task properties. It also retrieves the "source_id" from the row if present.
+
+     Parameters
+     ----------
+     base64_row : pd.Series
+         A pandas Series representing a row containing base64-encoded content under the key "content"
+         and optionally a "source_id".
+     task_props : Union[Dict[str, Any], BaseModel]
+         A dictionary or Pydantic model containing extraction instructions and parameters.
+
+     Returns
+     -------
+     Tuple[Dict[str, Any], Optional[str]]
+         A tuple where the first element is the prepared task properties dictionary (with "row_data" added)
+         and the second element is the source_id if present; otherwise, None.
+     """
+     # If task_props is a Pydantic model, convert it to a dictionary.
+     if isinstance(task_props, BaseModel):
+         task_props = task_props.model_dump()
+     else:
+         task_props = dict(task_props)
+
+     # Exclude the "content" field from the row data.
+     row_data = base64_row.drop(labels=["content"], errors="ignore")
+     if "params" not in task_props:
+         task_props["params"] = {}
+     # Store the row data in the parameters.
+     task_props["params"]["row_data"] = row_data
+
+     # Retrieve the source identifier if available.
+     source_id = base64_row.get("source_id", None)
+     return task_props, source_id
+
+
+ @unified_exception_handler
+ def _decode_and_extract_from_pptx(
+     base64_row: pd.Series,
+     task_props: Union[Dict[str, Any], BaseModel],
+     extraction_config: Any,
+     trace_info: Dict[str, Any],
+ ) -> Any:
+     """
+     Decode base64-encoded PPTX content from a DataFrame row and extract data using the specified method.
+
+     The function prepares task properties (using `_prepare_task_properties`), decodes the base64 content
+     into a byte stream, determines extraction parameters, and calls the extraction function (e.g., `python_pptx`)
+     with the proper flags. If extraction fails, an exception tag is returned.
+
+     Parameters
+     ----------
+     base64_row : pd.Series
+         A Series containing base64-encoded PPTX content under the key "content" and optionally a "source_id".
+     task_props : Union[Dict[str, Any], BaseModel]
+         A dictionary or Pydantic model containing extraction instructions (may include a "method" key and "params").
+     extraction_config : Any
+         A configuration object containing PPTX extraction settings (e.g., `pptx_extraction_config`).
+     trace_info : Dict[str, Any]
+         A dictionary with trace information for logging or debugging.
+
+     Returns
+     -------
+     Any
+         The extracted data from the PPTX file, or an exception tag indicating failure.
+     """
+     # Prepare task properties and extract source_id.
+     prepared_task_props, source_id = _prepare_task_properties(base64_row, task_props)
+
+     # Decode base64 content into bytes and create a BytesIO stream.
+     base64_content: str = base64_row["content"]
+     pptx_bytes: bytes = base64.b64decode(base64_content)
+     pptx_stream: io.BytesIO = io.BytesIO(pptx_bytes)
+
+     # Retrieve extraction parameters (and remove boolean flags as they are consumed).
+     extract_method = prepared_task_props.get("method", "python_pptx")
+     extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
+     extract_text: bool = extract_params.pop("extract_text", False)
+     extract_images: bool = extract_params.pop("extract_images", False)
+     extract_tables: bool = extract_params.pop("extract_tables", False)
+     extract_charts: bool = extract_params.pop("extract_charts", False)
+     extract_infographics: bool = extract_params.pop("extract_infographics", False)
+
+     # Inject additional configuration and trace information.
+     if getattr(extraction_config, "pptx_extraction_config", None) is not None:
+         extract_params["pptx_extraction_config"] = extraction_config.pptx_extraction_config
+     if trace_info is not None:
+         extract_params["trace_info"] = trace_info
+
+     if extract_method == "render_as_pdf":
+         pdf_stream = convert_stream_with_libreoffice(pptx_stream, "pptx", "pdf")
+
+         pdf_extract_method = extract_params.get("pdf_extract_method", "pdfium")
+         pdf_extractor_config = extract_params.copy()
+         pdf_extractor_config["extract_method"] = pdf_extract_method
+         if getattr(extraction_config, "pdfium_config", None) is not None:
+             pdf_extractor_config["pdfium_config"] = extraction_config.pdfium_config
+
+         extracted_data: Any = pdfium_extractor(
+             pdf_stream=pdf_stream,
+             extract_text=extract_text,
+             extract_images=extract_images,
+             extract_infographics=extract_infographics,
+             extract_tables=extract_tables,
+             extract_charts=extract_charts,
+             extract_page_as_image=False,
+             extractor_config=pdf_extractor_config,
+             execution_trace_log=None,
+         )
+     elif extract_method == "python_pptx":
+         # Call the PPTX extraction function.
+         extracted_data = python_pptx(
+             pptx_stream=pptx_stream,
+             extract_text=extract_text,
+             extract_images=extract_images,
+             extract_infographics=extract_infographics,
+             extract_tables=extract_tables,
+             extract_charts=extract_charts,
+             extraction_config=extract_params,
+             execution_trace_log=None,
+         )
+     else:
+         raise ValueError(f"Unsupported PPTX extraction method: {extract_method}")
+
+     return extracted_data
+
+
+ @unified_exception_handler
+ def extract_primitives_from_pptx_internal(
+     df_extraction_ledger: pd.DataFrame,
+     task_config: Union[Dict[str, Any], BaseModel],
+     extraction_config: Any,  # Assuming PPTXExtractorSchema or similar type
+     execution_trace_log: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+     """
+     Process a DataFrame containing base64-encoded PPTX files and extract primitive data.
+
+     This function applies a decoding and extraction routine to each row of the DataFrame
+     (via `_decode_and_extract_from_pptx`), then explodes any list results into separate rows, drops missing values,
+     and compiles the extracted data into a new DataFrame. The resulting DataFrame includes columns for document type,
+     extracted metadata, and a unique identifier (UUID).
+
+     Parameters
+     ----------
+     df_extraction_ledger : pd.DataFrame
+         Input DataFrame with PPTX files in base64 encoding. Expected to include columns 'source_id' and 'content'.
+     task_config : Union[Dict[str, Any], BaseModel]
+         Configuration for the PPTX extraction task, as a dict or Pydantic model.
+     extraction_config : Any
+         Configuration object for PPTX extraction (e.g., PPTXExtractorSchema).
+     execution_trace_log : Optional[Dict[str, Any]], optional
+         Optional dictionary containing trace information for debugging.
+
+     Returns
+     -------
+     Tuple[pd.DataFrame, Dict[str, Any]]
+         A DataFrame with extracted PPTX content containing the columns
+         "document_type", "metadata", and "uuid", plus a (currently empty) trace dictionary.
+
+     Raises
+     ------
+     Exception
+         Reraises any exception encountered during extraction with additional context.
+     """
+     # Create a partial function to decode and extract content from each DataFrame row.
+     decode_and_extract_partial = functools.partial(
+         _decode_and_extract_from_pptx,
+         task_props=task_config,
+         extraction_config=extraction_config,
+         trace_info=execution_trace_log,
+     )
+     # Apply the decoding and extraction to each row.
+     extraction_series = df_extraction_ledger.apply(decode_and_extract_partial, axis=1)
+     # Explode list results into separate rows and remove missing values.
+     extraction_series = extraction_series.explode().dropna()
+
+     # Convert the series into a DataFrame with defined columns.
+     if not extraction_series.empty:
+         extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
+     else:
+         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+     return extracted_df, {}
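
For orientation, a minimal, hypothetical sketch of driving this entry point directly. The file path "deck.pptx" and the None extraction_config are assumptions for illustration; a None config is tolerated because the extractor probes attributes with getattr(..., None), and a real PPTXExtractorSchema would normally be passed instead.

import base64
import pandas as pd

from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal

# Build a one-row ledger: base64-encoded PPTX bytes plus a source identifier.
with open("deck.pptx", "rb") as f:
    ledger = pd.DataFrame(
        {
            "source_id": ["deck.pptx"],
            "content": [base64.b64encode(f.read()).decode("utf-8")],
        }
    )

task_config = {"method": "python_pptx", "params": {"extract_text": True, "extract_tables": True}}

extracted_df, _trace = extract_primitives_from_pptx_internal(ledger, task_config, extraction_config=None)
print(extracted_df[["document_type", "uuid"]])
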
nv_ingest_api/internal/meta/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/meta/udf.py
@@ -0,0 +1,232 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import hashlib
+ import inspect
+ import logging
+ import time
+ from typing import Any, Callable, Dict, List, Optional
+ from dataclasses import dataclass
+
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
+ from nv_ingest_api.internal.schemas.meta.udf import UDFStageSchema
+ from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class CachedUDF:
+     """Cached UDF function with metadata."""
+
+     function: Callable
+     function_name: str
+     signature_validated: bool
+     created_at: float
+     last_used: float
+     use_count: int
+
+
+ class UDFCache:
+     """LRU cache for compiled and validated UDF functions."""
+
+     def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
+         self.max_size = max_size
+         self.ttl_seconds = ttl_seconds
+         self.cache: Dict[str, CachedUDF] = {}
+         self.access_order: List[str] = []  # For LRU tracking
+
+     def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
+         """Generate a cache key from the UDF source string and function name."""
+         content = f"{udf_function_str.strip()}:{udf_function_name}"
+         return hashlib.sha256(content.encode()).hexdigest()
+
+     def _evict_lru(self):
+         """Remove the least recently used item."""
+         if self.access_order:
+             lru_key = self.access_order.pop(0)
+             self.cache.pop(lru_key, None)
+
+     def _cleanup_expired(self):
+         """Remove expired entries if a TTL is configured."""
+         if not self.ttl_seconds:
+             return
+
+         current_time = time.time()
+         expired_keys = [
+             key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
+         ]
+
+         for key in expired_keys:
+             self.cache.pop(key, None)
+             if key in self.access_order:
+                 self.access_order.remove(key)
+
+     def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
+         """Get a cached UDF function if available."""
+         self._cleanup_expired()
+
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         if cache_key in self.cache:
+             # Update access tracking
+             if cache_key in self.access_order:
+                 self.access_order.remove(cache_key)
+             self.access_order.append(cache_key)
+
+             # Update usage stats
+             cached_udf = self.cache[cache_key]
+             cached_udf.last_used = time.time()
+             cached_udf.use_count += 1
+
+             return cached_udf
+
+         return None
+
+     def put(
+         self, udf_function_str: str, udf_function_name: str, function: Callable, signature_validated: bool = True
+     ) -> str:
+         """Cache a compiled and validated UDF function."""
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         # Evict LRU entries if at capacity
+         while len(self.cache) >= self.max_size:
+             self._evict_lru()
+
+         current_time = time.time()
+         cached_udf = CachedUDF(
+             function=function,
+             function_name=udf_function_name,
+             signature_validated=signature_validated,
+             created_at=current_time,
+             last_used=current_time,
+             use_count=1,
+         )
+
+         self.cache[cache_key] = cached_udf
+         self.access_order.append(cache_key)
+
+         return cache_key
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get cache statistics."""
+         total_uses = sum(udf.use_count for udf in self.cache.values())
+         most_used = max(self.cache.values(), key=lambda x: x.use_count, default=None)
+         return {
+             "size": len(self.cache),
+             "max_size": self.max_size,
+             "total_uses": total_uses,
+             "most_used_function": most_used.function_name if most_used else None,
+             "most_used_count": most_used.use_count if most_used else 0,
+         }
+
+
+ # Global cache instance
+ _udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
+
+
+ def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> Callable:
+     """Compile and validate a UDF function (extracted for caching)."""
+     # Execute the UDF function string in a controlled namespace
+     namespace: Dict[str, Any] = {}
+     try:
+         exec(udf_function_str, namespace)
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}")
+
+     # Extract the specified function from the namespace
+     if udf_function_name in namespace and callable(namespace[udf_function_name]):
+         udf_function = namespace[udf_function_name]
+     else:
+         raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")
+
+     # Validate the UDF function signature
+     try:
+         ingest_callable_signature(inspect.signature(udf_function))
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}")
+
+     return udf_function
+
+
+ def get_udf_cache_stats() -> Dict[str, Any]:
+     """Get UDF cache performance statistics."""
+     return _udf_cache.get_stats()
+
+
+ def udf_stage_callable_fn(control_message: IngestControlMessage, stage_config: UDFStageSchema) -> IngestControlMessage:
+     """
+     UDF stage callable function that processes UDF tasks in a control message.
+
+     This function extracts all UDF tasks from the control message and executes them sequentially.
+
+     Parameters
+     ----------
+     control_message : IngestControlMessage
+         The control message containing UDF tasks to process.
+     stage_config : UDFStageSchema
+         Configuration for the UDF stage.
+
+     Returns
+     -------
+     IngestControlMessage
+         The control message after processing all UDF tasks.
+     """
+     logger.debug("Starting UDF stage processing")
+
+     # Extract all UDF tasks from the control message using the free function
+     try:
+         all_task_configs = remove_all_tasks_by_type(control_message, "udf")
+     except ValueError:
+         # No UDF tasks found
+         if stage_config.ignore_empty_udf:
+             logger.debug("No UDF tasks found, ignoring as configured")
+             return control_message
+         else:
+             raise ValueError("No UDF tasks found in control message")
+
+     # Process each UDF task sequentially
+     for task_num, task_config in enumerate(all_task_configs, 1):
+         logger.debug(f"Processing UDF task {task_num} of {len(all_task_configs)}")
+
+         # Get the UDF function string and function name from the task properties
+         udf_function_str = task_config.get("udf_function", "").strip()
+         udf_function_name = task_config.get("udf_function_name", "").strip()
+
+         # Skip empty UDF functions if configured to ignore them
+         if not udf_function_str:
+             if stage_config.ignore_empty_udf:
+                 logger.debug(f"UDF task {task_num} has empty function, skipping as configured")
+                 continue
+             else:
+                 raise ValueError(f"UDF task {task_num} has empty function string")
+
+         # Validate that a function name is provided
+         if not udf_function_name:
+             raise ValueError(f"UDF task {task_num} missing required 'udf_function_name' property")
+
+         # Check whether the UDF function is cached
+         cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
+         if cached_udf:
+             udf_function = cached_udf.function
+         else:
+             # Compile and validate the UDF function
+             udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_num)
+             # Cache the compiled UDF function
+             _udf_cache.put(udf_function_str, udf_function_name, udf_function)
+
+         # Execute the UDF function with the control message
+         try:
+             control_message = udf_function(control_message)
+         except Exception as e:
+             raise ValueError(f"UDF task {task_num} execution failed: {str(e)}")
+
+         # Validate that the UDF function returned an IngestControlMessage
+         if not isinstance(control_message, IngestControlMessage):
+             raise ValueError(f"UDF task {task_num} must return an IngestControlMessage, got {type(control_message)}")
+
+         logger.debug(f"UDF task {task_num} completed successfully")
+
+     logger.debug(f"UDF stage processing completed. Processed {len(all_task_configs)} UDF tasks")
+     return control_message
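
To make the caching contract concrete, here is a small standalone sketch exercising UDFCache directly; the lambdas stand in for compiled UDFs, and the stage normally performs these calls internally via _udf_cache.

from nv_ingest_api.internal.meta.udf import UDFCache

cache = UDFCache(max_size=2, ttl_seconds=None)  # TTL disabled so nothing expires mid-demo

# Keys are sha256("<stripped source>:<function name>"), so the same source/name pair always hits.
cache.put("def a(msg): return msg", "a", lambda msg: msg)
assert cache.get("def a(msg): return msg", "a").use_count == 2  # put counts once, get bumps it

cache.put("def b(msg): return msg", "b", lambda msg: msg)
cache.put("def c(msg): return msg", "c", lambda msg: msg)  # at capacity 2: evicts "a", the LRU entry

print(cache.get_stats())                       # size 2, max_size 2
print(cache.get("def a(msg): return msg", "a"))  # None: "a" was evicted
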
nv_ingest_api/internal/mutate/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/mutate/deduplicate.py
@@ -0,0 +1,110 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import hashlib
+ from typing import Any, Dict, Optional, List
+
+ import pandas as pd
+
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
+
+ logger = logging.getLogger(__name__)
+
+
+ def _hash_content(x: Any, algorithm: str = "md5") -> bytes:
+     """
+     Compute a hash of the content using the specified algorithm.
+
+     Parameters
+     ----------
+     x : dict
+         A dictionary containing the content under the key "content".
+     algorithm : str, optional
+         Hashing algorithm to use (default "md5").
+
+     Returns
+     -------
+     bytes
+         The computed hash.
+     """
+     try:
+         return hashlib.new(algorithm, x["content"].encode()).digest()
+     except Exception as e:
+         msg = f"hash_content: Error computing hash: {e}"
+         logger.error(msg, exc_info=True)
+         raise type(e)(msg) from e
+
+
+ def deduplicate_images_internal(
+     df_ledger: pd.DataFrame,
+     task_config: Dict[str, Any],
+     mutate_config: ImageDedupSchema = ImageDedupSchema(),
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Deduplicate images in a DataFrame based on content hashes.
+
+     The function processes rows where the 'document_type' is IMAGE, computes a content hash for each,
+     and removes duplicate rows, keeping the first occurrence of each hash. A 'hash_algorithm' key in
+     task_config selects the algorithm used for hashing.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing at least 'document_type' and 'metadata' columns.
+     task_config : dict
+         Configuration parameters, including:
+         - "hash_algorithm": str, the algorithm to use for hashing (default "md5").
+     mutate_config : ImageDedupSchema, optional
+     execution_trace_log : Optional[List[Any]], optional
+
+     Returns
+     -------
+     pd.DataFrame
+         The DataFrame with duplicate images removed.
+
+     Raises
+     ------
+     ValueError
+         If the required columns are missing.
+     Exception
+         For any other errors encountered during deduplication.
+     """
+
+     _ = mutate_config  # Unused variable
+     _ = execution_trace_log  # TODO(Devin): Implement trace logging
+
+     try:
+         # Verify required columns exist.
+         for col in ("document_type", "metadata"):
+             if col not in df_ledger.columns:
+                 raise ValueError(f"Missing required column '{col}'.")
+
+         # Select image rows.
+         image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
+         if not image_mask.any():
+             return df_ledger[~image_mask]
+
+         df_images = df_ledger.loc[image_mask].copy()
+         hash_algorithm = task_config.get("hash_algorithm", "md5")
+
+         # Compute a content hash for each image and keep only the first occurrence of each hash.
+         df_images["_image_content_hash"] = df_images["metadata"].apply(_hash_content, args=(hash_algorithm,))
+         df_images_deduped = df_images.drop_duplicates(subset="_image_content_hash")
+         deduped_indices = df_images_deduped.index
+
+         non_image_rows = df_ledger.loc[~image_mask]
+         deduped_images = df_images.loc[deduped_indices][df_ledger.columns.difference(["_image_content_hash"])]
+
+         # Recombine the deduplicated image rows with the untouched non-image rows.
+         result = pd.concat([deduped_images, non_image_rows], axis=0)
+
+         return result
+     except Exception as e:
+         msg = f"deduplicate_images_internal: Error applying deduplication filter: {e}"
+         logger.error(msg, exc_info=True)
+         raise type(e)(msg) from e
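
A minimal sketch of the dedup behavior on a toy ledger; the base64-looking payloads are placeholders, and the metadata shape follows _hash_content above, which reads metadata["content"].

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal

ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.IMAGE] * 3,
        "metadata": [
            {"content": "iVBORw0KGgoAAA=="},  # same payload twice -> same md5 digest
            {"content": "iVBORw0KGgoAAA=="},
            {"content": "R0lGODlhAQABAA=="},  # distinct payload survives on its own
        ],
    }
)

deduped = deduplicate_images_internal(ledger, task_config={"hash_algorithm": "md5"})
print(len(deduped))  # 2: the duplicate pair collapses to its first occurrence
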
nv_ingest_api/internal/mutate/filter.py
@@ -0,0 +1,133 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from typing import Dict, Optional, List, Any
+
+ import pandas as pd
+
+ from nv_ingest_api.internal.enums.common import TaskTypeEnum
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import (
+     ContentTypeEnum,
+     InfoMessageMetadataSchema,
+     StatusEnum,
+ )
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
+
+ logger = logging.getLogger(__name__)
+
+
+ def _add_info_message(x, info_msg):
+     x["info_message_metadata"] = info_msg
+     return x
+
+
+ def _calculate_average_image_size(x):
+     return (x["image_metadata"]["width"] + x["image_metadata"]["height"]) / 2
+
+
+ def _calculate_aspect_ratio(x):
+     return x["image_metadata"]["width"] / max(x["image_metadata"]["height"], 1e-9)
+
+
+ def filter_images_internal(
+     df_ledger: pd.DataFrame,
+     task_config: Dict[str, Any],
+     mutate_config: ImageFilterSchema = ImageFilterSchema(),
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Apply an image filtering operation to a DataFrame based on average image size and aspect ratio.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame to be filtered. Must contain 'document_type' and 'metadata' columns.
+     task_config : dict
+         Dictionary with the following keys:
+         - "min_size": Minimum average image size threshold.
+         - "max_aspect_ratio": Maximum allowed aspect ratio.
+         - "min_aspect_ratio": Minimum allowed aspect ratio.
+         - "filter": If True, rows failing the criteria are dropped; if False, they are flagged.
+     mutate_config : ImageFilterSchema
+     execution_trace_log : Optional[List[Any]], optional
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after applying the image filter.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or if parameters are invalid.
+     Exception
+         For other errors encountered during filtering.
+     """
+
+     _ = mutate_config  # Unused variable
+     _ = execution_trace_log  # TODO(Devin)
+
+     try:
+         required_columns = {"document_type", "metadata"}
+         if not required_columns.issubset(df_ledger.columns):
+             raise ValueError(f"DataFrame must contain columns: {required_columns}")
+
+         min_size = task_config.get("min_size")
+         max_aspect_ratio = task_config.get("max_aspect_ratio")
+         min_aspect_ratio = task_config.get("min_aspect_ratio")
+         filter_flag = task_config.get("filter", True)
+
+         if not isinstance(min_size, (int, float)) or min_size < 0:
+             raise ValueError("min_size must be a non-negative number")
+         if not isinstance(max_aspect_ratio, (int, float)) or max_aspect_ratio <= 0:
+             raise ValueError("max_aspect_ratio must be a positive number")
+         if not isinstance(min_aspect_ratio, (int, float)) or min_aspect_ratio <= 0:
+             raise ValueError("min_aspect_ratio must be a positive number")
+         if min_aspect_ratio > max_aspect_ratio:
+             raise ValueError("min_aspect_ratio cannot be greater than max_aspect_ratio")
+
+         image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
+         if not image_mask.any():
+             return df_ledger.copy()
+
+         df_image = df_ledger.loc[image_mask].copy()
+         avg_size = df_image["metadata"].apply(_calculate_average_image_size)
+         avg_size_mask = avg_size > min_size
+
+         aspect_ratio = df_image["metadata"].apply(_calculate_aspect_ratio)
+         min_aspect_ratio_mask = aspect_ratio > min_aspect_ratio
+         max_aspect_ratio_mask = aspect_ratio < max_aspect_ratio
+
+         valid_mask = avg_size_mask & min_aspect_ratio_mask & max_aspect_ratio_mask
+         image_filter_mask = ~valid_mask
+
+         if image_filter_mask.any():
+             filtered_df = df_image.loc[image_filter_mask].copy()
+             if filter_flag:
+                 # Drop failing rows from the ledger in place and return it.
+                 df_ledger.drop(labels=filtered_df.index, inplace=True)
+                 return df_ledger
+
+             # Otherwise, flag failing rows as informational messages instead of dropping them.
+             info_msg = {
+                 "task": TaskTypeEnum.FILTER.value,
+                 "status": StatusEnum.SUCCESS.value,
+                 "message": "Filtered due to image size or aspect ratio.",
+                 "filter": True,
+             }
+             validated_info_msg = validate_schema(info_msg, InfoMessageMetadataSchema).model_dump()
+             filtered_df["info_message_metadata"] = [validated_info_msg] * filtered_df.shape[0]
+             # Use the validated message when annotating row metadata as well.
+             filtered_df["metadata"] = filtered_df["metadata"].apply(_add_info_message, args=(validated_info_msg,))
+             df_ledger.loc[filtered_df.index, "metadata"] = filtered_df["metadata"]
+             df_ledger.loc[filtered_df.index, "document_type"] = ContentTypeEnum.INFO_MSG
+
+         return df_ledger
+
+     except Exception as e:
+         err_msg = f"filter_images_internal: Error applying image filter. Original error: {e}"
+         logger.error(err_msg, exc_info=True)
+         raise type(e)(err_msg) from e
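
For reference, a small hypothetical example of the flagging path (filter=False). The metadata shape follows the helper functions above, which read width and height from metadata["image_metadata"]; the threshold values are illustrative only.

import pandas as pd

from nv_ingest_api.internal.mutate.filter import filter_images_internal
from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentTypeEnum

ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.IMAGE, ContentTypeEnum.IMAGE],
        "metadata": [
            {"image_metadata": {"width": 800, "height": 600}},  # passes size and aspect-ratio checks
            {"image_metadata": {"width": 2000, "height": 10}},  # aspect ratio 200 > 5.0 -> flagged
        ],
    }
)

task_config = {"min_size": 128, "max_aspect_ratio": 5.0, "min_aspect_ratio": 0.2, "filter": False}
out = filter_images_internal(ledger, task_config)
print(out["document_type"].tolist())  # second row is rewritten to ContentTypeEnum.INFO_MSG

With filter=True instead, the failing row would simply be dropped from the ledger rather than converted into an informational message.
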