PyPI - nv-ingest-api - Versions diffs - 26.1.0rc4__py3-none-any.whl - Mend

nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show

nv_ingest_api/__init__.py +3 -0
nv_ingest_api/interface/__init__.py +218 -0
nv_ingest_api/interface/extract.py +977 -0
nv_ingest_api/interface/mutate.py +154 -0
nv_ingest_api/interface/store.py +200 -0
nv_ingest_api/interface/transform.py +382 -0
nv_ingest_api/interface/utility.py +186 -0
nv_ingest_api/internal/__init__.py +0 -0
nv_ingest_api/internal/enums/__init__.py +3 -0
nv_ingest_api/internal/enums/common.py +550 -0
nv_ingest_api/internal/extract/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
nv_ingest_api/internal/extract/docx/__init__.py +5 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
nv_ingest_api/internal/extract/html/__init__.py +3 -0
nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
nv_ingest_api/internal/extract/image/__init__.py +3 -0
nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
nv_ingest_api/internal/meta/__init__.py +3 -0
nv_ingest_api/internal/meta/udf.py +232 -0
nv_ingest_api/internal/mutate/__init__.py +3 -0
nv_ingest_api/internal/mutate/deduplicate.py +110 -0
nv_ingest_api/internal/mutate/filter.py +133 -0
nv_ingest_api/internal/primitives/__init__.py +0 -0
nv_ingest_api/internal/primitives/control_message_task.py +16 -0
nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
nv_ingest_api/internal/schemas/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
nv_ingest_api/internal/schemas/meta/udf.py +23 -0
nv_ingest_api/internal/schemas/mixins.py +39 -0
nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
nv_ingest_api/internal/schemas/store/__init__.py +3 -0
nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
nv_ingest_api/internal/store/__init__.py +3 -0
nv_ingest_api/internal/store/embed_text_upload.py +236 -0
nv_ingest_api/internal/store/image_upload.py +251 -0
nv_ingest_api/internal/transform/__init__.py +3 -0
nv_ingest_api/internal/transform/caption_image.py +219 -0
nv_ingest_api/internal/transform/embed_text.py +702 -0
nv_ingest_api/internal/transform/split_text.py +182 -0
nv_ingest_api/util/__init__.py +3 -0
nv_ingest_api/util/control_message/__init__.py +0 -0
nv_ingest_api/util/control_message/validators.py +47 -0
nv_ingest_api/util/converters/__init__.py +0 -0
nv_ingest_api/util/converters/bytetools.py +78 -0
nv_ingest_api/util/converters/containers.py +65 -0
nv_ingest_api/util/converters/datetools.py +90 -0
nv_ingest_api/util/converters/dftools.py +127 -0
nv_ingest_api/util/converters/formats.py +64 -0
nv_ingest_api/util/converters/type_mappings.py +27 -0
nv_ingest_api/util/dataloader/__init__.py +9 -0
nv_ingest_api/util/dataloader/dataloader.py +409 -0
nv_ingest_api/util/detectors/__init__.py +5 -0
nv_ingest_api/util/detectors/language.py +38 -0
nv_ingest_api/util/exception_handlers/__init__.py +0 -0
nv_ingest_api/util/exception_handlers/converters.py +72 -0
nv_ingest_api/util/exception_handlers/decorators.py +429 -0
nv_ingest_api/util/exception_handlers/detectors.py +74 -0
nv_ingest_api/util/exception_handlers/pdf.py +116 -0
nv_ingest_api/util/exception_handlers/schemas.py +68 -0
nv_ingest_api/util/image_processing/__init__.py +5 -0
nv_ingest_api/util/image_processing/clustering.py +260 -0
nv_ingest_api/util/image_processing/processing.py +177 -0
nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
nv_ingest_api/util/image_processing/transforms.py +850 -0
nv_ingest_api/util/imports/__init__.py +3 -0
nv_ingest_api/util/imports/callable_signatures.py +108 -0
nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
nv_ingest_api/util/introspection/__init__.py +3 -0
nv_ingest_api/util/introspection/class_inspect.py +145 -0
nv_ingest_api/util/introspection/function_inspect.py +65 -0
nv_ingest_api/util/logging/__init__.py +0 -0
nv_ingest_api/util/logging/configuration.py +102 -0
nv_ingest_api/util/logging/sanitize.py +84 -0
nv_ingest_api/util/message_brokers/__init__.py +3 -0
nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
nv_ingest_api/util/metadata/__init__.py +5 -0
nv_ingest_api/util/metadata/aggregators.py +516 -0
nv_ingest_api/util/multi_processing/__init__.py +8 -0
nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
nv_ingest_api/util/nim/__init__.py +161 -0
nv_ingest_api/util/pdf/__init__.py +3 -0
nv_ingest_api/util/pdf/pdfium.py +428 -0
nv_ingest_api/util/schema/__init__.py +3 -0
nv_ingest_api/util/schema/schema_validator.py +10 -0
nv_ingest_api/util/service_clients/__init__.py +3 -0
nv_ingest_api/util/service_clients/client_base.py +86 -0
nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
nv_ingest_api/util/string_processing/__init__.py +51 -0
nv_ingest_api/util/string_processing/configuration.py +682 -0
nv_ingest_api/util/string_processing/yaml.py +109 -0
nv_ingest_api/util/system/__init__.py +0 -0
nv_ingest_api/util/system/hardware_info.py +594 -0
nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
udfs/__init__.py +5 -0
udfs/llm_summarizer_udf.py +259 -0

nv_ingest_api/internal/primitives/nim/nim_model_interface.py ADDED Viewed

@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Any
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+logger = logging.getLogger(__name__)
+class ModelInterface:
+    """
+    Base class for defining a model interface that supports preparing input data, formatting it for
+    inference, parsing output, and processing inference results.
+    """
+    def format_input(self, data: dict, protocol: str, max_batch_size: int):
+        """
+        Format the input data for the specified protocol.
+        Parameters
+        ----------
+        data : dict
+            The input data to format.
+        protocol : str
+            The protocol to format the data for.
+        """
+        raise NotImplementedError("Subclasses should implement this method")
+    def parse_output(self, response, protocol: str, data: Optional[dict] = None, **kwargs):
+        """
+        Parse the output data from the model's inference response.
+        Parameters
+        ----------
+        response : Any
+            The response from the model inference.
+        protocol : str
+            The protocol used ("grpc" or "http").
+        data : dict, optional
+            Additional input data passed to the function.
+        """
+        raise NotImplementedError("Subclasses should implement this method")
+    def prepare_data_for_inference(self, data: dict):
+        """
+        Prepare input data for inference by processing or transforming it as required.
+        Parameters
+        ----------
+        data : dict
+            The input data to prepare.
+        """
+        raise NotImplementedError("Subclasses should implement this method")
+    def process_inference_results(self, output_array, protocol: str, **kwargs):
+        """
+        Process the inference results from the model.
+        Parameters
+        ----------
+        output_array : Any
+            The raw output from the model.
+        kwargs : dict
+            Additional parameters for processing.
+        """
+        raise NotImplementedError("Subclasses should implement this method")
+    def name(self) -> str:
+        """
+        Get the name of the model interface.
+        Returns
+        -------
+        str
+            The name of the model interface.
+        """
+        raise NotImplementedError("Subclasses should implement this method")
+    def coalesce_requests_to_batch(self, requests, protocol: str, **kwargs) -> Tuple[Any, Dict[str, Any]]:
+        """
+        Takes a list of InferenceRequest objects and combines them into a single
+        formatted batch ready for inference.
+        THIS METHOD IS REQUIRED FOR DYNAMIC BATCHING SUPPORT.
+        Parameters
+        ----------
+        requests : List[InferenceRequest]
+            A list of InferenceRequest namedtuples collected for the batch.
+            Each tuple contains the data, dimensions, and other context for a single item.
+        protocol : str
+            The inference protocol, either "grpc" or "http".
+        **kwargs : Any
+            Additional keyword arguments passed from the original request.
+        Returns
+        -------
+        Tuple[Any, Dict[str, Any]]
+            A tuple containing the single formatted batch and its scratch-pad data.
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support dynamic batching "
+            "because `coalesce_requests_to_batch` is not implemented."
+        )
+    def does_item_fit_in_batch(self, current_batch, next_request, memory_budget_bytes: int) -> bool:
+        """
+        Checks if adding another request to the current batch would exceed the memory budget.
+        This is a model-specific calculation. The default implementation always
+        returns True, effectively ignoring the memory budget. Interfaces for models
+        that require memory management (like padded image models) must override this.
+        Returns
+        -------
+        bool
+            True if the item fits within the budget, False otherwise.
+        """
+        return True

nv_ingest_api/internal/primitives/tracing/__init__.py ADDED Viewed

File without changes

nv_ingest_api/internal/primitives/tracing/latency.py ADDED Viewed

@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from datetime import datetime
+from functools import wraps
+logger = logging.getLogger(__name__)
+# ANSI SGR escape sequences used to colour terminal output; RESET restores the default style.
+class ColorCodes:
+    RED = "\033[91m"
+    GREEN = "\033[92m"
+    YELLOW = "\033[93m"
+    BLUE = "\033[94m"  # bright blue foreground
+    RESET = "\033[0m"
+# Function to apply color to a message
+def colorize(message, color_code):
+    return f"{color_code}{message}{ColorCodes.RESET}"
+def latency_logger(name=None):
+    """
+    A decorator to log the elapsed time of function execution. If available, it also logs
+    the latency based on 'latency::ts_send' metadata in a IngestControlMessage object.
+    Parameters
+    ----------
+    name : str, optional
+        Custom name to use in the log message. Defaults to the function's name.
+    """
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            # Ensure there's at least one argument and it has timestamp handling capabilities
+            if args and hasattr(args[0], "get_timestamp"):
+                message = args[0]
+                start_time = datetime.now()
+                result = func(*args, **kwargs)
+                end_time = datetime.now()
+                elapsed_time = end_time - start_time
+                func_name = name if name else func.__name__
+                # Log latency from ts_send if available
+                if message.filter_timestamp("latency::ts_send"):
+                    ts_send = message.get_timestamp("latency::ts_send")
+                    latency_ms = (start_time - ts_send).total_seconds() * 1e3
+                    logger.debug(f"{func_name} since ts_send: {latency_ms} msec.")
+                message.set_timestamp("latency::ts_send", datetime.now())
+                message.set_timestamp(f"latency::{func_name}::elapsed_time", elapsed_time)
+                return result
+            else:
+                raise ValueError(
+                    "The first argument must be a IngestControlMessage object with metadata " "capabilities."
+                )
+        return wrapper
+    return decorator

nv_ingest_api/internal/primitives/tracing/logging.py ADDED Viewed

@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import inspect
+import uuid
+from datetime import datetime
+from enum import Enum
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
+class TaskResultStatus(Enum):  # Outcome of a task; annotate_task_result stores the member's string value.
+    SUCCESS = "SUCCESS"  # looked up by name, case-insensitively, when a string result is supplied
+    FAILURE = "FAILURE"
+def annotate_cm(control_message: IngestControlMessage, source_id=None, **kwargs):
+    """
+    Annotate a IngestControlMessage object with arbitrary metadata, a source ID, and a timestamp.
+    Each annotation will be uniquely identified by a UUID.
+    Parameters:
+    - control_message: The IngestControlMessage object to be annotated.
+    - source_id: A unique identifier for the source of the annotation. If None, uses the caller's __name__.
+    - **kwargs: Arbitrary key-value pairs to be included in the annotation.
+    """
+    if source_id is None:
+        # Determine the __name__ of the parent caller's module
+        frame = inspect.currentframe()
+        caller_frame = inspect.getouterframes(frame)[2]
+        module = inspect.getmodule(caller_frame[0])
+        source_id = module.__name__ if module is not None else "UnknownModule"
+    # Ensure 'annotation_timestamp' is not overridden by kwargs
+    if "annotation_timestamp" in kwargs:
+        raise ValueError("'annotation_timestamp' is a reserved key and cannot be specified.")
+    message = kwargs.get("message")
+    annotation_key = f"annotation::{message}" if message else f"annotation::{uuid.uuid4()}"
+    annotation_timestamp = datetime.now()
+    try:
+        control_message.set_timestamp(annotation_key, annotation_timestamp)
+    except Exception as e:
+        print(f"Failed to set annotation timestamp: {e}")
+    # Construct the metadata key uniquely identified by a UUID.
+    metadata_key = f"annotation::{uuid.uuid4()}"
+    # Construct the metadata value with reserved 'annotation_timestamp', source_id, and any provided kwargs.
+    metadata_value = {
+        "source_id": source_id,
+    }
+    metadata_value.update(kwargs)
+    try:
+        # Attempt to set the annotated metadata on the IngestControlMessage object.
+        control_message.set_metadata(metadata_key, metadata_value)
+    except Exception as e:
+        # Handle any exceptions that occur when setting metadata.
+        print(f"Failed to annotate IngestControlMessage: {e}")
+def annotate_task_result(control_message, result, task_id, source_id=None, **kwargs):
+    """
+    Annotate a IngestControlMessage object with the result of a task, identified by a task_id,
+    and an arbitrary number of additional key-value pairs. The result can be a TaskResultStatus
+    enum or a string that will be converted to the corresponding enum.
+    Parameters:
+    - control_message: The IngestControlMessage object to be annotated.
+    - result: The result of the task, either SUCCESS or FAILURE, as an enum or string.
+    - task_id: A unique identifier for the task.
+    - **kwargs: Arbitrary additional key-value pairs to be included in the annotation.
+    """
+    # Convert result to TaskResultStatus enum if it's a string
+    if isinstance(result, str):
+        try:
+            result = TaskResultStatus[result.upper()]
+        except KeyError:
+            raise ValueError(
+                f"Invalid result string: {result}. Must be one of {[status.name for status in TaskResultStatus]}."
+            )
+    elif not isinstance(result, TaskResultStatus):
+        raise ValueError("result must be an instance of TaskResultStatus Enum or a valid result string.")
+    # Annotate the control message with task-related information, including the result and task_id.
+    annotate_cm(
+        control_message,
+        source_id=source_id,
+        task_result=result.value,
+        task_id=task_id,
+        **kwargs,
+    )

nv_ingest_api/internal/primitives/tracing/tagging.py ADDED Viewed

@@ -0,0 +1,288 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import functools
+import inspect
+import logging
+import string
+from datetime import datetime
+from typing import Optional
+logger = logging.getLogger(__name__)
+def traceable(trace_name: Optional[str] = None):
+    """
+    A decorator that adds entry and exit trace timestamps to a IngestControlMessage's metadata
+    based on the presence of a 'config::add_trace_tagging' flag.
+    This decorator checks if the 'config::add_trace_tagging' flag is set to True in the
+    message's metadata. If so, it records the entry and exit timestamps of the function
+    execution, using either a provided custom trace name, auto-detected stage name from
+    self.stage_name, or the function's name as fallback.
+    Parameters
+    ----------
+    trace_name : str, optional
+        A custom name for the trace entries in the message metadata. If not provided,
+        attempts to use self.stage_name from the decorated method's instance,
+        falling back to the function's name if neither is available.
+    Returns
+    -------
+    decorator_trace_tagging : Callable
+        A wrapper function that decorates the target function to implement trace tagging.
+    Notes
+    -----
+    The decorated function must accept a IngestControlMessage object as one of its arguments.
+    For a regular function, this is expected to be the first argument; for a class method,
+    this is expected to be the second argument (after 'self'). The IngestControlMessage object
+    must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator
+    to check for the trace tagging flag and to add trace metadata.
+    The trace metadata added by the decorator includes two entries:
+    - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
+    - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
+    Examples
+    --------
+    Automatic stage name detection (recommended):
+    >>> @traceable()  # Uses self.stage_name automatically
+    ... def process_message(self, message):
+    ...     pass
+    Explicit trace name (override):
+    >>> @traceable("custom_trace")
+    ... def process_message(self, message):
+    ...     pass
+    Function without instance (uses function name):
+    >>> @traceable()
+    ... def process_message(message):
+    ...     pass
+    """
+    def decorator_trace_tagging(func):
+        @functools.wraps(func)
+        def wrapper_trace_tagging(*args, **kwargs):
+            ts_fetched = datetime.now()
+            # Determine the trace name to use
+            resolved_trace_name = trace_name
+            # If no explicit trace_name provided, try to get it from self.stage_name
+            if resolved_trace_name is None and len(args) >= 1:
+                stage_instance = args[0]  # 'self' in method calls
+                if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
+                    resolved_trace_name = stage_instance.stage_name
+                    logger.debug(f"Using auto-detected trace name: '{resolved_trace_name}'")
+                else:
+                    resolved_trace_name = func.__name__
+                    logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
+            elif resolved_trace_name is None:
+                resolved_trace_name = func.__name__
+                logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
+            # Determine which argument is the message.
+            if hasattr(args[0], "has_metadata"):
+                message = args[0]
+            elif len(args) > 1 and hasattr(args[1], "has_metadata"):
+                message = args[1]
+            else:
+                raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")
+            do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
+                message.get_metadata("config::add_trace_tagging") is True
+            )
+            trace_prefix = resolved_trace_name
+            if do_trace_tagging:
+                ts_send = message.get_timestamp("latency::ts_send")
+                ts_entry = datetime.now()
+                message.set_timestamp(f"trace::entry::{trace_prefix}", ts_entry)
+                if ts_send:
+                    message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
+                    message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)
+            # Call the decorated function.
+            result = func(*args, **kwargs)
+            if do_trace_tagging:
+                ts_exit = datetime.now()
+                message.set_timestamp(f"trace::exit::{trace_prefix}", ts_exit)
+                message.set_timestamp("latency::ts_send", ts_exit)
+            return result
+        return wrapper_trace_tagging
+    return decorator_trace_tagging
+def traceable_func(trace_name=None, dedupe=True):
+    """
+    A decorator that injects trace information for tracking the execution of a function.
+    It logs the entry and exit timestamps of the function in a `trace_info` dictionary,
+    which can be used for performance monitoring or debugging purposes.
+    Parameters
+    ----------
+    trace_name : str, optional
+        An optional string used as the prefix for the trace log entries. If not provided,
+        the decorated function's name is used. The string can include placeholders (e.g.,
+        "pdf_extractor::{model_name}") that will be dynamically replaced with matching
+        function argument values.
+    dedupe : bool, optional
+        If True, ensures that the trace entry and exit keys are unique by appending an index
+        (e.g., `_0`, `_1`) to the keys if duplicate entries are detected. Default is True.
+    Returns
+    -------
+    function
+        A wrapped function that injects trace information before and after the function's
+        execution.
+    Notes
+    -----
+    - If `trace_info` is not provided in the keyword arguments, a new dictionary is created
+      and used for storing trace entries.
+    - If `trace_name` contains format placeholders, the decorator attempts to populate them
+      with matching argument values from the decorated function.
+    - The trace information is logged in the format:
+        - `trace::entry::{trace_name}` for the entry timestamp.
+        - `trace::exit::{trace_name}` for the exit timestamp.
+    - If `dedupe` is True, the trace keys will be appended with an index to avoid
+      overwriting existing entries.
+    Example
+    -------
+    >>> @traceable_func(trace_name="pdf_extractor::{model_name}")
+    >>> def extract_pdf(model_name):
+    ...     pass
+    >>> trace_info = {}
+    >>> extract_pdf("my_model", trace_info=trace_info)
+    In this example, `model_name` is dynamically replaced in the trace_name, and the
+    trace information is logged with unique keys if deduplication is enabled.
+    """
+    def decorator_inject_trace_info(func):
+        @functools.wraps(func)
+        def wrapper_inject_trace_info(*args, **kwargs):
+            trace_info = kwargs.pop("trace_info", None)
+            if trace_info is None:
+                trace_info = {}
+            trace_prefix = trace_name if trace_name else func.__name__
+            arg_names = list(inspect.signature(func).parameters)
+            args_name_to_val = dict(zip(arg_names, args))
+            # If `trace_name` is a formattable string, e.g., "pdf_extractor::{model_name}",
+            # search `args` and `kwargs` to replace the placeholder.
+            placeholders = [x[1] for x in string.Formatter().parse(trace_name) if x[1] is not None]
+            if placeholders:
+                format_kwargs = {}
+                for name in placeholders:
+                    if name in args_name_to_val:
+                        arg_val = args_name_to_val[name]
+                    elif name in kwargs:
+                        arg_val = kwargs.get(name)
+                    else:
+                        arg_val = name
+                    format_kwargs[name] = arg_val
+                trace_prefix = trace_prefix.format(**format_kwargs)
+            trace_entry_key = f"trace::entry::{trace_prefix}"
+            trace_exit_key = f"trace::exit::{trace_prefix}"
+            ts_entry = datetime.now()
+            if dedupe:
+                trace_entry_key += "_{}"
+                trace_exit_key += "_{}"
+                i = 0
+                while (trace_entry_key.format(i) in trace_info) or (trace_exit_key.format(i) in trace_info):
+                    i += 1
+                trace_entry_key = trace_entry_key.format(i)
+                trace_exit_key = trace_exit_key.format(i)
+            trace_info[trace_entry_key] = ts_entry
+            # Call the decorated function
+            result = func(*args, **kwargs)
+            ts_exit = datetime.now()
+            trace_info[trace_exit_key] = ts_exit
+            return result
+        return wrapper_inject_trace_info
+    return decorator_inject_trace_info
+def set_trace_timestamps_with_parent_context(control_message, execution_trace_log: dict, parent_name: str, logger=None):
+    """
+    Set trace timestamps on a control message with proper parent-child context.
+    This utility function processes trace timestamps from an execution_trace_log and
+    ensures that child traces are properly namespaced under their parent context.
+    This resolves OpenTelemetry span hierarchy issues where child spans cannot
+    find their expected parent contexts.
+    Parameters
+    ----------
+    control_message : IngestControlMessage
+        The control message to set timestamps on
+    execution_trace_log : dict
+        Dictionary of trace keys to timestamp values from internal operations
+    parent_name : str
+        The parent stage name to use as context for child traces
+    logger : logging.Logger, optional
+        Logger for debug output of key transformations
+    Examples
+    --------
+    Basic usage in a stage:
+    >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2}
+    >>> set_trace_timestamps_with_parent_context(
+    ...     control_message, execution_trace_log, "pdf_extractor", logger
+    ... )
+    This transforms:
+    - trace::entry::yolox_inference -> trace::entry::pdf_extractor::yolox_inference
+    - trace::exit::yolox_inference  -> trace::exit::pdf_extractor::yolox_inference
+    """
+    if not execution_trace_log:
+        return
+    for key, ts in execution_trace_log.items():
+        enhanced_key = key
+        # Check if this is a child trace that needs parent context
+        if key.startswith("trace::") and "::" in key:
+            # Parse the trace key to extract the base trace name
+            parts = key.split("::")
+            if len(parts) >= 3:  # e.g., ["trace", "entry", "yolox_inference"]
+                trace_type = parts[1]  # "entry" or "exit"
+                child_name = "::".join(parts[2:])  # everything after trace::entry:: or trace::exit::
+                # Only rewrite if it doesn't already include the parent context
+                if not child_name.startswith(f"{parent_name}::"):
+                    # Rewrite to include parent context: trace::entry::pdf_extractor::yolox_inference
+                    enhanced_key = f"trace::{trace_type}::{parent_name}::{child_name}"
+                    if logger:
+                        logger.debug(f"Enhanced trace key: {key} -> {enhanced_key}")
+        # Set the timestamp with the (possibly enhanced) key
+        control_message.set_timestamp(enhanced_key, ts)

nv_ingest_api/internal/schemas/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/schemas/extract/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0