PyPI - nv-ingest-api - Versions diffs - 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl - Mend

nv-ingest-api 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (24)

nv_ingest_api/internal/schemas/meta/ingest_job_schema.py CHANGED Viewed

@@ -35,7 +35,7 @@ class IngestTaskSplitSchema(BaseModelNoExt):
     tokenizer: Optional[str] = None
     chunk_size: Annotated[int, Field(gt=0)] = 1024
     chunk_overlap: Annotated[int, Field(ge=0)] = 150
-    params: dict
+    params: dict = Field(default_factory=dict)
     @field_validator("chunk_overlap")
     def check_chunk_overlap(cls, v, values, **kwargs):
@@ -47,7 +47,7 @@ class IngestTaskSplitSchema(BaseModelNoExt):
 class IngestTaskExtractSchema(BaseModelNoExt):
     document_type: DocumentTypeEnum
     method: str
-    params: dict
+    params: dict = Field(default_factory=dict)
     @field_validator("document_type", mode="before")
     @classmethod
@@ -61,14 +61,14 @@ class IngestTaskExtractSchema(BaseModelNoExt):
 class IngestTaskStoreEmbedSchema(BaseModelNoExt):
-    params: dict
+    params: dict = Field(default_factory=dict)
 class IngestTaskStoreSchema(BaseModelNoExt):
     structured: bool = True
     images: bool = False
     method: str
-    params: dict
+    params: dict = Field(default_factory=dict)
 # Captioning: All fields are optional and override default parameters.
@@ -143,6 +143,40 @@ class IngestTaskInfographicExtraction(BaseModelNoExt):
     params: dict = Field(default_factory=dict)
+class IngestTaskUDFSchema(BaseModelNoExt):
+    udf_function: str
+    udf_function_name: str
+    phase: Optional[int] = Field(default=None, ge=1, le=5)
+    run_before: bool = Field(default=False, description="Execute UDF before the target stage")
+    run_after: bool = Field(default=False, description="Execute UDF after the target stage")
+    target_stage: Optional[str] = Field(
+        default=None, description="Name of the stage to target (e.g., 'image_dedup', 'text_extract')"
+    )
+    @model_validator(mode="after")
+    def validate_stage_targeting(self):
+        """Validate that stage targeting configuration is consistent"""
+        # Must specify either phase or target_stage, but not both
+        has_phase = self.phase is not None
+        has_target_stage = self.target_stage is not None
+        if has_phase and has_target_stage:
+            raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
+        elif not has_phase and not has_target_stage:
+            raise ValueError("Must specify either 'phase' or 'target_stage'.")
+        # If using run_before or run_after, must specify target_stage
+        if self.run_before or self.run_after:
+            if not self.target_stage:
+                raise ValueError("target_stage must be specified when using run_before or run_after")
+        # If target_stage is specified, must have at least one timing
+        if self.target_stage and not (self.run_before or self.run_after):
+            raise ValueError("At least one of run_before or run_after must be True when target_stage is specified")
+        return self
 class IngestTaskSchema(BaseModelNoExt):
     type: TaskTypeEnum
     task_properties: Union[
@@ -159,6 +193,7 @@ class IngestTaskSchema(BaseModelNoExt):
         IngestTaskTableExtraction,
         IngestTaskChartExtraction,
         IngestTaskInfographicExtraction,
+        IngestTaskUDFSchema,
     ]
     raise_on_failure: bool = False
@@ -190,6 +225,7 @@ class IngestTaskSchema(BaseModelNoExt):
             TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
             TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
             TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
+            TaskTypeEnum.UDF: IngestTaskUDFSchema,
         }
         expected_schema_cls = task_type_to_schema.get(task_type)

nv_ingest_api/internal/schemas/meta/udf.py ADDED Viewed

@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from pydantic import BaseModel, Field, ConfigDict
+class UDFStageSchema(BaseModel):
+    """
+    Schema for UDF stage configuration.
+    The UDF function string should be provided in the task config. If no UDF function
+    is provided and ignore_empty_udf is True, the message is returned unchanged.
+    If ignore_empty_udf is False, an error is raised when no UDF function is provided.
+    """
+    ignore_empty_udf: bool = Field(
+        False,
+        description="If True, ignore UDF tasks without udf_function and return message unchanged. "
+        "If False, raise error.",
+    )
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/transform/embed_text.py CHANGED Viewed

@@ -15,6 +15,11 @@ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema im
 logger = logging.getLogger(__name__)
+# Reduce SDK HTTP logging verbosity so request/response logs are not emitted
+logging.getLogger("openai").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("httpcore").setLevel(logging.ERROR)
 MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]

nv_ingest_api/util/exception_handlers/decorators.py CHANGED Viewed

@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 def nv_ingest_node_failure_try_except(  # New name to distinguish
-    annotation_id: str,
+    annotation_id: Optional[str] = None,
     payload_can_be_empty: bool = False,
     raise_on_failure: bool = False,
     skip_processing_if_failed: bool = True,
@@ -29,7 +29,19 @@ def nv_ingest_node_failure_try_except(  # New name to distinguish
     failures by annotating an IngestControlMessage. Replaces the context
     manager approach for potentially simpler interaction with frameworks like Ray.
-    Parameters are the same as nv_ingest_node_failure_context_manager.
+    Parameters
+    ----------
+    annotation_id : Optional[str]
+        A unique identifier for annotation. If None, attempts to auto-detect
+        from the stage instance's stage_name property.
+    payload_can_be_empty : bool, optional
+        If False, the message payload must not be null.
+    raise_on_failure : bool, optional
+        If True, exceptions are raised; otherwise, they are annotated.
+    skip_processing_if_failed : bool, optional
+        If True, skip processing if the message is already marked as failed.
+    forward_func : Optional[Callable[[Any], Any]]
+        If provided, a function to forward the message when processing is skipped.
     """
     def extract_message_and_prefix(args: Tuple) -> Tuple[Any, Tuple]:
@@ -47,170 +59,106 @@ def nv_ingest_node_failure_try_except(  # New name to distinguish
     def decorator(func: Callable) -> Callable:
         func_name = func.__name__  # Get function name for logging/errors
-        # --- ASYNC WRAPPER ---
-        if asyncio.iscoroutinefunction(func):
-            @functools.wraps(func)
-            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
-                logger.debug(f"async_wrapper for {func_name}: Entering.")
-                try:
-                    control_message, prefix = extract_message_and_prefix(args)
-                except ValueError as e:
-                    logger.error(f"async_wrapper for {func_name}: Failed to extract control message. Error: {e}")
-                    raise  # Cannot proceed without the message
-                # --- Skip logic ---
-                is_failed = control_message.get_metadata("cm_failed", False)
-                if is_failed and skip_processing_if_failed:
-                    logger.debug(f"async_wrapper for {func_name}: Skipping processing, message already marked failed.")
-                    if forward_func:
-                        logger.debug("async_wrapper: Forwarding skipped message.")
-                        # Await forward_func if it's async
-                        if asyncio.iscoroutinefunction(forward_func):
-                            return await forward_func(control_message)
-                        else:
-                            return forward_func(control_message)
-                    else:
-                        logger.debug("async_wrapper: Returning skipped message as is.")
-                        return control_message
-                # --- Main execution block ---
-                result = None
-                try:
-                    # Payload check
-                    if not payload_can_be_empty:
-                        cm_ensure_payload_not_null(control_message)
-                    # Rebuild args and call original async function
-                    new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
-                    logger.debug(f"async_wrapper for {func_name}: Calling await func...")
-                    result = await func(*new_args, **kwargs)
-                    logger.debug(f"async_wrapper for {func_name}: func call completed.")
-                    # Success annotation
-                    logger.debug(f"async_wrapper for {func_name}: Annotating success.")
-                    annotate_task_result(
-                        control_message=result if result is not None else control_message,
-                        # Annotate result if func returns it, else original message
-                        result=TaskResultStatus.SUCCESS,
-                        task_id=annotation_id,
+        @functools.wraps(func)
+        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
+            logger.debug(f"sync_wrapper for {func_name}: Entering.")
+            # Determine the annotation_id to use
+            resolved_annotation_id = annotation_id
+            # If no explicit annotation_id provided, try to get it from self.stage_name
+            if resolved_annotation_id is None and len(args) >= 1:
+                stage_instance = args[0]  # 'self' in method calls
+                if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
+                    resolved_annotation_id = stage_instance.stage_name
+                    logger.debug("Using auto-detected annotation_id from stage_name: " f"'{resolved_annotation_id}'")
+                else:
+                    # Fallback to function name if no stage_name available
+                    resolved_annotation_id = func_name
+                    logger.debug(
+                        "No stage_name available, using function name as annotation_id: " f"'{resolved_annotation_id}'"
                     )
-                    logger.debug(f"async_wrapper for {func_name}: Success annotation done. Returning result.")
-                    return result
-                except Exception as e:
-                    # --- Failure Handling ---
-                    error_message = f"Error in {func_name}: {e}"
-                    logger.error(f"async_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
-                    # Annotate failure on the original message object
-                    try:
-                        cm_set_failure(control_message, error_message)
-                        annotate_task_result(
-                            control_message=control_message,
-                            result=TaskResultStatus.FAILURE,
-                            task_id=annotation_id,
-                            message=error_message,
-                        )
-                        logger.debug(f"async_wrapper for {func_name}: Failure annotation complete.")
-                    except Exception as anno_err:
-                        # Log error during annotation but proceed based on raise_on_failure
-                        logger.exception(
-                            f"async_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
-                        )
-                    # Decide whether to raise or return annotated message
-                    if raise_on_failure:
-                        logger.debug(f"async_wrapper for {func_name}: Re-raising exception as configured.")
-                        raise e  # Re-raise the original exception
-                    else:
-                        logger.debug(
-                            f"async_wrapper for {func_name}: Suppressing exception and returning annotated message."
-                        )
-                        # Return the original control_message, now annotated with failure
-                        return control_message
-            return async_wrapper
+            elif resolved_annotation_id is None:
+                # Fallback to function name if no annotation_id and no instance
+                resolved_annotation_id = func_name
+                logger.debug(
+                    "No annotation_id provided and no instance available, using function name: "
+                    f"'{resolved_annotation_id}'"
+                )
-        # --- SYNC WRAPPER ---
-        else:
+            try:
+                control_message, prefix = extract_message_and_prefix(args)
+            except ValueError as e:
+                logger.error(f"sync_wrapper for {func_name}: Failed to extract control message. Error: {e}")
+                raise
+            # --- Skip logic ---
+            is_failed = control_message.get_metadata("cm_failed", False)
+            if is_failed and skip_processing_if_failed:
+                logger.warning(f"sync_wrapper for {func_name}: Skipping processing, message already marked failed.")
+                if forward_func:
+                    logger.debug("sync_wrapper: Forwarding skipped message.")
+                    return forward_func(control_message)  # Assume forward_func is sync here
+                else:
+                    logger.debug("sync_wrapper: Returning skipped message as is.")
+                    return control_message
-            @functools.wraps(func)
-            def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
-                logger.debug(f"sync_wrapper for {func_name}: Entering.")
-                try:
-                    control_message, prefix = extract_message_and_prefix(args)
-                except ValueError as e:
-                    logger.error(f"sync_wrapper for {func_name}: Failed to extract control message. Error: {e}")
-                    raise
+            # --- Main execution block ---
+            result = None
+            try:
+                # Payload check
+                if not payload_can_be_empty:
+                    cm_ensure_payload_not_null(control_message)
+                # Rebuild args and call original sync function
+                new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
+                logger.debug(f"sync_wrapper for {func_name}: Calling func...")
+                result = func(*new_args, **kwargs)
+                logger.debug(f"sync_wrapper for {func_name}: func call completed.")
+                # Success annotation
+                logger.debug(f"sync_wrapper for {func_name}: Annotating success.")
+                annotate_task_result(
+                    control_message=result if result is not None else control_message,
+                    # Annotate result or original message
+                    result=TaskResultStatus.SUCCESS,
+                    task_id=resolved_annotation_id,
+                )
+                logger.debug(f"sync_wrapper for {func_name}: Success annotation done. Returning result.")
+                return result
-                # --- Skip logic ---
-                is_failed = control_message.get_metadata("cm_failed", False)
-                if is_failed and skip_processing_if_failed:
-                    logger.warning(f"sync_wrapper for {func_name}: Skipping processing, message already marked failed.")
-                    if forward_func:
-                        logger.debug("sync_wrapper: Forwarding skipped message.")
-                        return forward_func(control_message)  # Assume forward_func is sync here
-                    else:
-                        logger.debug("sync_wrapper: Returning skipped message as is.")
-                        return control_message
+            except Exception as e:
+                # --- Failure Handling ---
+                error_message = f"Error in {func_name}: {e}"
+                logger.error(f"sync_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
-                # --- Main execution block ---
-                result = None
+                # Annotate failure on the original message object
                 try:
-                    # Payload check
-                    if not payload_can_be_empty:
-                        cm_ensure_payload_not_null(control_message)
-                    # Rebuild args and call original sync function
-                    new_args = prefix + (control_message,) + args[len(prefix) + 1 :]
-                    logger.debug(f"sync_wrapper for {func_name}: Calling func...")
-                    result = func(*new_args, **kwargs)
-                    logger.debug(f"sync_wrapper for {func_name}: func call completed.")
-                    # Success annotation
-                    logger.debug(f"sync_wrapper for {func_name}: Annotating success.")
+                    cm_set_failure(control_message, error_message)
                     annotate_task_result(
-                        control_message=result if result is not None else control_message,
-                        # Annotate result or original message
-                        result=TaskResultStatus.SUCCESS,
-                        task_id=annotation_id,
+                        control_message=control_message,
+                        result=TaskResultStatus.FAILURE,
+                        task_id=resolved_annotation_id,
+                        message=error_message,
+                    )
+                    logger.debug(f"sync_wrapper for {func_name}: Failure annotation complete.")
+                except Exception as anno_err:
+                    logger.exception(
+                        f"sync_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
                     )
-                    logger.debug(f"sync_wrapper for {func_name}: Success annotation done. Returning result.")
-                    return result
-                except Exception as e:
-                    # --- Failure Handling ---
-                    error_message = f"Error in {func_name}: {e}"
-                    logger.error(f"sync_wrapper for {func_name}: Caught exception: {error_message}", exc_info=True)
-                    # Annotate failure on the original message object
-                    try:
-                        cm_set_failure(control_message, error_message)
-                        annotate_task_result(
-                            control_message=control_message,
-                            result=TaskResultStatus.FAILURE,
-                            task_id=annotation_id,
-                            message=error_message,
-                        )
-                        logger.debug(f"sync_wrapper for {func_name}: Failure annotation complete.")
-                    except Exception as anno_err:
-                        logger.exception(
-                            f"sync_wrapper for {func_name}: CRITICAL - Error during failure annotation: {anno_err}"
-                        )
-                    # Decide whether to raise or return annotated message
-                    if raise_on_failure:
-                        logger.debug(f"sync_wrapper for {func_name}: Re-raising exception as configured.")
-                        raise e  # Re-raise the original exception
-                    else:
-                        logger.debug(
-                            f"sync_wrapper for {func_name}: Suppressing exception and returning annotated message."
-                        )
-                        # Return the original control_message, now annotated with failure
-                        return control_message
+                # Decide whether to raise or return annotated message
+                if raise_on_failure:
+                    logger.debug(f"sync_wrapper for {func_name}: Re-raising exception as configured.")
+                    raise e  # Re-raise the original exception
+                else:
+                    logger.debug(
+                        f"sync_wrapper for {func_name}: Suppressing exception and returning annotated message."
+                    )
+                    # Return the original control_message, now annotated with failure
+                    return control_message
-            return sync_wrapper
+        return sync_wrapper
     return decorator

nv_ingest_api/util/imports/callable_signatures.py CHANGED Viewed

@@ -14,6 +14,8 @@ def ingest_stage_callable_signature(sig: inspect.Signature):
     Validates that a callable has the signature:
         (IngestControlMessage, BaseModel) -> IngestControlMessage
+    Also allows for generic (*args, **kwargs) signatures for flexibility with class constructors.
     Raises
     ------
     TypeError
@@ -21,11 +23,15 @@ def ingest_stage_callable_signature(sig: inspect.Signature):
     """
     params = list(sig.parameters.values())
+    # If the signature accepts arbitrary keyword arguments, it's flexible enough.
+    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
+        return
     if len(params) != 2:
         raise TypeError(f"Expected exactly 2 parameters, got {len(params)}")
     if params[0].name != "control_message" or params[1].name != "stage_config":
-        raise TypeError("Expected parameter names: 'control_message', 'config'")
+        raise TypeError("Expected parameter names: 'control_message', 'stage_config'")
     first_param = params[0].annotation
     second_param = params[1].annotation
@@ -48,3 +54,55 @@ def ingest_stage_callable_signature(sig: inspect.Signature):
     if not issubclass(return_type, IngestControlMessage):
         raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")
+def ingest_callable_signature(sig: inspect.Signature):
+    """
+    Validates that a callable has the signature:
+        (IngestControlMessage) -> IngestControlMessage
+    Also allows for generic (*args, **kwargs) signatures for flexibility with class constructors.
+    Raises
+    ------
+    TypeError
+        If the signature does not match the expected pattern.
+    """
+    params = list(sig.parameters.values())
+    # If the signature accepts arbitrary keyword arguments, it's flexible enough.
+    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
+        return
+    if len(params) != 1:
+        raise TypeError(f"Expected exactly 1 parameter, got {len(params)}")
+    if params[0].name != "control_message":
+        raise TypeError("Expected parameter name: 'control_message'")
+    first_param = params[0].annotation
+    return_type = sig.return_annotation
+    if first_param is inspect.Parameter.empty:
+        raise TypeError("Parameter must be annotated with IngestControlMessage")
+    if return_type is inspect.Signature.empty:
+        raise TypeError("Return type must be annotated with IngestControlMessage")
+    # Handle string annotations (forward references)
+    if isinstance(first_param, str):
+        if first_param != "IngestControlMessage":
+            raise TypeError(f"Parameter must be IngestControlMessage, got {first_param}")
+    else:
+        # Handle actual class annotations
+        if not issubclass(first_param, IngestControlMessage):
+            raise TypeError(f"Parameter must be IngestControlMessage, got {first_param}")
+    # Handle string annotations for return type
+    if isinstance(return_type, str):
+        if return_type != "IngestControlMessage":
+            raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")
+    else:
+        # Handle actual class annotations
+        if not issubclass(return_type, IngestControlMessage):
+            raise TypeError(f"Return type must be IngestControlMessage, got {return_type}")

nv_ingest_api/util/imports/dynamic_resolvers.py CHANGED Viewed

@@ -6,6 +6,8 @@ import importlib
 import inspect
 from typing import Callable, Union, List, Optional
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 def resolve_obj_from_path(path: str, allowed_base_paths: Optional[List[str]] = None) -> object:
     """
@@ -99,12 +101,58 @@ def resolve_callable_from_path(
         try:
             schema_checker(sig)
         except Exception as e:
-            raise TypeError(
-                f"Callable at '{callable_path}' failed custom signature validation:\n"
-                f"  Signature: {sig}\n"
-                f"  Error: {e}"
-            ) from e
+            raise TypeError(f"Signature validation for '{callable_path}' failed: {e}") from e
     else:
         raise TypeError(f"Invalid signature_schema: expected list, callable, or str, got {type(signature_schema)}")
     return obj
+def resolve_actor_class_from_path(
+    path: str, expected_base_class: type, allowed_base_paths: Optional[List[str]] = None
+) -> type:
+    """
+    Resolves an actor class from a path and validates that it is a class
+    that inherits from the expected base class. This function correctly handles
+    decorated Ray actors by inspecting their original class.
+    Parameters
+    ----------
+    path : str
+        The full import path to the actor class.
+    expected_base_class : type
+        The base class that the resolved class must inherit from.
+    allowed_base_paths : Optional[List[str]]
+        An optional list of base module paths from which imports are allowed.
+    Returns
+    -------
+    type
+        The resolved actor class (or Ray actor factory).
+    """
+    obj = resolve_obj_from_path(path, allowed_base_paths=allowed_base_paths)
+    # Determine the class to validate. If it's a Ray actor factory, we need to
+    # inspect its MRO to find the original user-defined class.
+    cls_to_validate = None
+    if inspect.isclass(obj):
+        cls_to_validate = obj
+    else:
+        # For actor factories, find the base class in the MRO that inherits from RayActorStage
+        for base in obj.__class__.__mro__:
+            if inspect.isclass(base) and issubclass(base, RayActorStage) and base is not RayActorStage:
+                cls_to_validate = base
+                break
+    if cls_to_validate is None:
+        raise TypeError(
+            f"Could not resolve a valid actor class from path '{path}'. "
+            f"The object is not a class and not a recognized actor factory."
+        )
+    if not issubclass(cls_to_validate, expected_base_class):
+        raise TypeError(
+            f"Actor class '{cls_to_validate.__name__}' at '{path}' must inherit from '{expected_base_class.__name__}'."
+        )
+    return obj

nv_ingest_api/util/introspection/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv-ingest-api 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl