PyPI - nv-ingest-client - Versions diffs - 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl - Mend

nv-ingest-client 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (28) hide show

nv_ingest_client/cli/util/click.py CHANGED Viewed

@@ -12,23 +12,30 @@ from pprint import pprint
 from typing import Union, List, Any, Dict
 import click
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.primitives.tasks import CaptionTask
 from nv_ingest_client.primitives.tasks import DedupTask
 from nv_ingest_client.primitives.tasks import EmbedTask
 from nv_ingest_client.primitives.tasks import ExtractTask
 from nv_ingest_client.primitives.tasks import FilterTask
+from nv_ingest_client.primitives.tasks import InfographicExtractionTask
 from nv_ingest_client.primitives.tasks import SplitTask
 from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import StoreTask
-from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
-from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
-from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
-from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
-from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
-from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
+from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
 from nv_ingest_client.util.util import generate_matching_files
 logger = logging.getLogger(__name__)
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
     KAFKA = "KAFKA"
-# Example TaskId validation set
-VALID_TASK_IDS = {"task1", "task2", "task3"}
-_MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
 def debug_print_click_options(ctx: click.Context) -> None:
     """
     Retrieves all options from the Click context and pretty prints them.
@@ -149,9 +150,11 @@ TaskType = Union[
     EmbedTask,
     ExtractTask,
     FilterTask,
+    InfographicExtractionTask,
     SplitTask,
     StoreEmbedTask,
     StoreTask,
+    UDFTask,
 ]
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
         the error details (e.g., expected property format), and show the input that was provided.
     """
     try:
-        return json.loads(options_str)
+        options = json.loads(options_str)
+        # Convert string boolean values to actual booleans for extract tasks
+        if task_id == "extract":
+            boolean_fields = [
+                "extract_text",
+                "extract_images",
+                "extract_tables",
+                "extract_charts",
+                "extract_infographics",
+                "extract_page_as_image",
+            ]
+            for field in boolean_fields:
+                if field in options:
+                    value = options[field]
+                    if isinstance(value, str):
+                        if value.lower() in ("true", "1", "yes", "on"):
+                            options[field] = True
+                        elif value.lower() in ("false", "0", "no", "off"):
+                            options[field] = False
+                        else:
+                            raise ValueError(
+                                f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
+                            )
+        return options
     except json.JSONDecodeError as e:
         error_message = (
             f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
             options: Dict[str, Any] = parse_task_options(task_id, json_options)
             if task_id == "split":
-                task_options = check_schema(SplitTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
             elif task_id == "extract":
-                task_options = check_schema(ExtractTaskSchema, options, task_id, json_options)
-                new_task_id = f"{task_id}_{task_options.document_type}"
-                new_task = [(new_task_id, ExtractTask(**task_options.model_dump()))]
+                # Map CLI parameters to API schema structure
+                method = options.pop("extract_method", None)
+                if method is None:
+                    method = "pdfium"  # Default fallback
+                # Build params dict for API schema
+                params = {k: v for k, v in options.items() if k != "document_type"}
+                # Validate with API schema
+                api_options = {
+                    "document_type": options.get("document_type"),
+                    "method": method,
+                    "params": params,
+                }
+                task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
+                new_task_id = f"{task_id}_{task_options.document_type.value}"
+                # Create ExtractTask with original CLI parameters
+                extract_task_params = {
+                    "document_type": task_options.document_type,
+                    "extract_method": task_options.method,
+                    **task_options.params,
+                }
+                # Start with the main extract task
+                new_task = [(new_task_id, ExtractTask(**extract_task_params))]
+                # Add ChartExtractionTask if extract_charts is True
+                if task_options.params.get("extract_charts", False):
+                    from nv_ingest_client.primitives.tasks import ChartExtractionTask
+                    chart_task_id = "chart_data_extract"
+                    chart_params = {"params": {}}  # ChartExtractionTask takes params dict
+                    new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
+                # Add TableExtractionTask if extract_tables is True
+                if task_options.params.get("extract_tables", False):
+                    from nv_ingest_client.primitives.tasks import TableExtractionTask
+                    table_task_id = "table_data_extract"
+                    new_task.append((table_task_id, TableExtractionTask()))
             elif task_id == "store":
-                task_options = check_schema(StoreTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
             elif task_id == "store_embedding":
-                task_options = check_schema(StoreEmbedTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
             elif task_id == "caption":
-                task_options = check_schema(CaptionTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, CaptionTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for CaptionTask constructor
+                caption_params = {
+                    "api_key": task_options.api_key,
+                    "endpoint_url": task_options.endpoint_url,
+                    "prompt": task_options.prompt,
+                    "model_name": task_options.model_name,
+                }
+                new_task = [(new_task_id, CaptionTask(**caption_params))]
             elif task_id == "dedup":
-                task_options = check_schema(DedupTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, DedupTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for DedupTask constructor
+                dedup_params = {
+                    "content_type": task_options.content_type,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, DedupTask(**dedup_params))]
             elif task_id == "filter":
-                task_options = check_schema(FilterTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, FilterTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for FilterTask constructor
+                filter_params = {
+                    "content_type": task_options.content_type,
+                    "min_size": task_options.params.min_size,
+                    "max_aspect_ratio": task_options.params.max_aspect_ratio,
+                    "min_aspect_ratio": task_options.params.min_aspect_ratio,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, FilterTask(**filter_params))]
             elif task_id == "embed":
-                task_options = check_schema(EmbedTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
+            elif task_id == "infographic":
+                task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
+            elif task_id == "udf":
+                # Validate mutual exclusivity of target_stage and phase
+                has_target_stage = "target_stage" in options and options["target_stage"] is not None
+                has_phase = "phase" in options and options["phase"] is not None
+                if has_target_stage and has_phase:
+                    raise ValueError(
+                        "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
+                    )
+                elif not has_target_stage and not has_phase:
+                    raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
+                # Pre-process UDF task options to convert phase names to integers
+                if "phase" in options and isinstance(options["phase"], str):
+                    # Convert phase string to integer using the same logic as UDFTask
+                    phase_str = options["phase"].upper()
+                    phase_aliases = {
+                        "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PRE": PipelinePhase.PRE_PROCESSING,
+                        "EXTRACTION": PipelinePhase.EXTRACTION,
+                        "EXTRACT": PipelinePhase.EXTRACTION,
+                        "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POST": PipelinePhase.POST_PROCESSING,
+                        "MUTATION": PipelinePhase.MUTATION,
+                        "MUTATE": PipelinePhase.MUTATION,
+                        "TRANSFORM": PipelinePhase.TRANSFORM,
+                        "RESPONSE": PipelinePhase.RESPONSE,
+                        "RESP": PipelinePhase.RESPONSE,
+                    }
+                    if phase_str in phase_aliases:
+                        options["phase"] = phase_aliases[phase_str].value
+                    else:
+                        raise ValueError(f"Invalid phase name: {options['phase']}")
+                # Try to infer udf_function_name if not provided
+                if "udf_function_name" not in options or not options["udf_function_name"]:
+                    udf_function = options.get("udf_function", "")
+                    if udf_function:
+                        inferred_name = infer_udf_function_name(udf_function)
+                        if inferred_name:
+                            options["udf_function_name"] = inferred_name
+                            logger.info(f"Inferred UDF function name: {inferred_name}")
+                        else:
+                            raise ValueError(
+                                f"Could not infer UDF function name from '{udf_function}'. "
+                                "Please specify 'udf_function_name' explicitly."
+                            )
+                task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
             else:
                 raise ValueError(f"Unsupported task type: {task_id}")
+            # Check for duplicate tasks - now allowing multiple tasks of the same type
             if new_task_id in validated_tasks:
-                raise ValueError(f"Duplicate task detected: {new_task_id}")
+                logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
             logger.debug("Adding task: %s", new_task_id)
             for task_tuple in new_task:
-                validated_tasks[task_tuple[0]] = task_tuple[1]
+                if task_tuple[0] in validated_tasks:
+                    # Convert single task to list if needed, then append
+                    existing_task = validated_tasks[task_tuple[0]]
+                    if not isinstance(existing_task, list):
+                        validated_tasks[task_tuple[0]] = [existing_task]
+                    validated_tasks[task_tuple[0]].append(task_tuple[1])
+                else:
+                    validated_tasks[task_tuple[0]] = task_tuple[1]
         except ValueError as e:
             validation_errors.append(str(e))

nv_ingest_client/client/interface.py CHANGED Viewed

@@ -27,6 +27,16 @@ from typing import Union
 from urllib.parse import urlparse
 import fsspec
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
 from nv_ingest_client.client.client import NvIngestClient
 from nv_ingest_client.client.util.processing import get_valid_filename
 from nv_ingest_client.client.util.processing import save_document_results_to_jsonl
@@ -38,16 +48,9 @@ from nv_ingest_client.primitives.tasks import EmbedTask
 from nv_ingest_client.primitives.tasks import ExtractTask
 from nv_ingest_client.primitives.tasks import FilterTask
 from nv_ingest_client.primitives.tasks import SplitTask
-from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import StoreTask
-from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
-from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
-from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
-from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
-from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
-from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
+from nv_ingest_client.primitives.tasks import StoreEmbedTask
+from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
 from nv_ingest_client.util.util import filter_function_kwargs
@@ -436,7 +439,7 @@ class Ingestor:
         final_results_payload_list: Union[List[List[Dict[str, Any]]], List[LazyLoadedList]] = []
-        # Lock for thread-safe appends to final_results_payload_list by I/O tasks
+        # Lock for thread-safe appending to final_results_payload_list by I/O tasks
         results_lock = threading.Lock() if self._output_config else None
         io_executor: Optional[ThreadPoolExecutor] = None
@@ -698,8 +701,23 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(DedupTaskSchema, kwargs, "dedup", json.dumps(kwargs))
-        dedup_task = DedupTask(**task_options.model_dump())
+        # Extract content_type and build params dict for API schema
+        content_type = kwargs.pop("content_type", "text")  # Default to "text" if not specified
+        params = kwargs  # Remaining parameters go into params dict
+        # Validate with API schema
+        api_options = {
+            "content_type": content_type,
+            "params": params,
+        }
+        task_options = check_schema(IngestTaskDedupSchema, api_options, "dedup", json.dumps(api_options))
+        # Extract individual parameters from API schema for DedupTask constructor
+        dedup_params = {
+            "content_type": task_options.content_type,
+            "filter": task_options.params.filter,
+        }
+        dedup_task = DedupTask(**dedup_params)
         self._job_specs.add_task(dedup_task)
         return self
@@ -719,8 +737,14 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(EmbedTaskSchema, kwargs, "embed", json.dumps(kwargs))
-        embed_task = EmbedTask(**task_options.model_dump())
+        # Filter out deprecated parameters before API schema validation
+        # The EmbedTask constructor handles these deprecated parameters with warnings
+        filtered_kwargs = {k: v for k, v in kwargs.items() if k not in ["text", "tables"]}
+        _ = check_schema(IngestTaskEmbedSchema, filtered_kwargs, "embed", json.dumps(filtered_kwargs))
+        # Pass original kwargs to EmbedTask constructor so it can handle deprecated parameters
+        embed_task = EmbedTask(**kwargs)
         self._job_specs.add_task(embed_task)
         return self
@@ -767,9 +791,52 @@ class Ingestor:
                 extract_page_as_image=extract_page_as_image,
                 **kwargs,
             )
-            task_options = check_schema(ExtractTaskSchema, task_options, "extract", json.dumps(task_options))
-            extract_task = ExtractTask(**task_options.model_dump())
+            # Extract method from task_options for API schema
+            method = task_options.pop("extract_method", None)
+            if method is None:
+                # Let ExtractTask constructor handle default method selection
+                method = "pdfium"  # Default fallback
+            # Build params dict for API schema
+            params = {k: v for k, v in task_options.items() if k != "document_type"}
+            # Map document type to API schema expected values
+            # Handle common file extension to DocumentTypeEnum mapping
+            document_type_mapping = {
+                "txt": "text",
+                "md": "text",
+                "sh": "text",
+                "json": "text",
+                "jpg": "jpeg",
+                "jpeg": "jpeg",
+                "png": "png",
+                "pdf": "pdf",
+                "docx": "docx",
+                "pptx": "pptx",
+                "html": "html",
+                "bmp": "bmp",
+                "tiff": "tiff",
+                "svg": "svg",
+                "mp3": "mp3",
+                "wav": "wav",
+            }
+            # Use mapped document type for API schema validation
+            api_document_type = document_type_mapping.get(document_type.lower(), document_type)
+            # Validate with API schema
+            api_task_options = {
+                "document_type": api_document_type,
+                "method": method,
+                "params": params,
+            }
+            check_schema(IngestTaskExtractSchema, api_task_options, "extract", json.dumps(api_task_options))
+            # Create ExtractTask with mapped document type for API schema compatibility
+            extract_task_params = {"document_type": api_document_type, "extract_method": method, **params}
+            extract_task = ExtractTask(**extract_task_params)
             self._job_specs.add_task(extract_task, document_type=document_type)
         return self
@@ -789,8 +856,27 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(FilterTaskSchema, kwargs, "filter", json.dumps(kwargs))
-        filter_task = FilterTask(**task_options.model_dump())
+        # Restructure parameters to match API schema structure
+        params_fields = {"min_size", "max_aspect_ratio", "min_aspect_ratio", "filter"}
+        params = {k: v for k, v in kwargs.items() if k in params_fields}
+        top_level = {k: v for k, v in kwargs.items() if k not in params_fields}
+        # Build API schema structure
+        api_kwargs = top_level.copy()
+        if params:
+            api_kwargs["params"] = params
+        task_options = check_schema(IngestTaskFilterSchema, api_kwargs, "filter", json.dumps(api_kwargs))
+        # Extract individual parameters from API schema for FilterTask constructor
+        filter_params = {
+            "content_type": task_options.content_type,
+            "min_size": task_options.params.min_size,
+            "max_aspect_ratio": task_options.params.max_aspect_ratio,
+            "min_aspect_ratio": task_options.params.min_aspect_ratio,
+            "filter": task_options.params.filter,
+        }
+        filter_task = FilterTask(**filter_params)
         self._job_specs.add_task(filter_task)
         return self
@@ -810,7 +896,7 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(SplitTaskSchema, kwargs, "split", json.dumps(kwargs))
+        task_options = check_schema(IngestTaskSplitSchema, kwargs, "split", json.dumps(kwargs))
         extract_task = SplitTask(**task_options.model_dump())
         self._job_specs.add_task(extract_task)
@@ -831,8 +917,24 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(StoreTaskSchema, kwargs, "store", json.dumps(kwargs))
-        store_task = StoreTask(**task_options.model_dump())
+        # Handle parameter name mapping: store_method -> method for API schema
+        if "store_method" in kwargs:
+            kwargs["method"] = kwargs.pop("store_method")
+        # Provide default method if not specified (matching client StoreTask behavior)
+        if "method" not in kwargs:
+            kwargs["method"] = "minio"
+        task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
+        # Map API schema fields back to StoreTask constructor parameters
+        store_params = {
+            "structured": task_options.structured,
+            "images": task_options.images,
+            "store_method": task_options.method,  # Map method back to store_method
+            "params": task_options.params,
+        }
+        store_task = StoreTask(**store_params)
         self._job_specs.add_task(store_task)
         return self
@@ -840,24 +942,97 @@ class Ingestor:
     @ensure_job_specs
     def store_embed(self, **kwargs: Any) -> "Ingestor":
         """
-        Adds a StoreTask to the batch job specification.
+        Adds a StoreEmbedTask to the batch job specification.
         Parameters
         ----------
         kwargs : dict
-            Parameters specific to the StoreTask.
+            Parameters specific to the StoreEmbedTask.
         Returns
         -------
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(StoreEmbedTaskSchema, kwargs, "store_embedding", json.dumps(kwargs))
+        task_options = check_schema(IngestTaskStoreEmbedSchema, kwargs, "store_embedding", json.dumps(kwargs))
         store_task = StoreEmbedTask(**task_options.model_dump())
         self._job_specs.add_task(store_task)
         return self
+    def udf(
+        self,
+        udf_function: str,
+        udf_function_name: Optional[str] = None,
+        phase: Optional[Union[PipelinePhase, int, str]] = None,
+        target_stage: Optional[str] = None,
+        run_before: bool = False,
+        run_after: bool = False,
+    ) -> "Ingestor":
+        """
+        Adds a UDFTask to the batch job specification.
+        Parameters
+        ----------
+        udf_function : str
+            UDF specification. Supports three formats:
+            1. Inline function: 'def my_func(control_message): ...'
+            2. Import path: 'my_module.my_function'
+            3. File path: '/path/to/file.py:function_name'
+        udf_function_name : str, optional
+            Name of the function to execute from the UDF specification.
+            If not provided, attempts to infer from udf_function.
+        phase : Union[PipelinePhase, int, str], optional
+            Pipeline phase to execute UDF. Accepts phase names ('extract', 'split', 'embed', 'response')
+            or numbers (1-4). Cannot be used with target_stage.
+        target_stage : str, optional
+            Specific stage name to target for UDF execution. Cannot be used with phase.
+        run_before : bool, optional
+            If True and target_stage is specified, run UDF before the target stage. Default: False.
+        run_after : bool, optional
+            If True and target_stage is specified, run UDF after the target stage. Default: False.
+        Returns
+        -------
+        Ingestor
+            Returns self for chaining.
+        Raises
+        ------
+        ValueError
+            If udf_function_name cannot be inferred and is not provided explicitly,
+            or if both phase and target_stage are specified, or if neither is specified.
+        """
+        # Validate mutual exclusivity of phase and target_stage
+        if phase is not None and target_stage is not None:
+            raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
+        elif phase is None and target_stage is None:
+            # Default to response phase for backward compatibility
+            phase = PipelinePhase.RESPONSE
+        # Try to infer udf_function_name if not provided
+        if udf_function_name is None:
+            udf_function_name = infer_udf_function_name(udf_function)
+            if udf_function_name is None:
+                raise ValueError(
+                    f"Could not infer UDF function name from '{udf_function}'. "
+                    "Please specify 'udf_function_name' explicitly."
+                )
+            logger.info(f"Inferred UDF function name: {udf_function_name}")
+        # Use UDFTask constructor with explicit parameters
+        udf_task = UDFTask(
+            udf_function=udf_function,
+            udf_function_name=udf_function_name,
+            phase=phase,
+            target_stage=target_stage,
+            run_before=run_before,
+            run_after=run_after,
+        )
+        self._job_specs.add_task(udf_task)
+        return self
     def vdb_upload(self, purge_results_after_upload: bool = True, **kwargs: Any) -> "Ingestor":
         """
         Adds a VdbUploadTask to the batch job specification.
@@ -986,8 +1161,16 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-        task_options = check_schema(CaptionTaskSchema, kwargs, "caption", json.dumps(kwargs))
-        caption_task = CaptionTask(**task_options.model_dump())
+        task_options = check_schema(IngestTaskCaptionSchema, kwargs, "caption", json.dumps(kwargs))
+        # Extract individual parameters from API schema for CaptionTask constructor
+        caption_params = {
+            "api_key": task_options.api_key,
+            "endpoint_url": task_options.endpoint_url,
+            "prompt": task_options.prompt,
+            "model_name": task_options.model_name,
+        }
+        caption_task = CaptionTask(**caption_params)
         self._job_specs.add_task(caption_task)
         return self

nv_ingest_client/nv_ingest_cli.py CHANGED Viewed

@@ -169,6 +169,22 @@ Tasks and Options:
     - split_length (int): Segment length. No default.
     - split_overlap (int): Segment overlap. No default.
 \b
+- udf: Executes user-defined functions (UDFs) for custom processing logic.
+    Options:
+    - udf_function (str): UDF specification. Supports three formats:
+        1. Inline function: 'def my_func(control_message): ...'
+        2. Import path: 'my_module.my_function'
+        3. File path: '/path/to/file.py:function_name' or '/path/to/file.py' (assumes 'process' function)
+    - udf_function_name (str): Name of the function to execute from the UDF specification. Required.
+    - target_stage (str): Specific pipeline stage name to target for UDF execution (e.g.,
+        'text_extractor', 'text_embedder', 'image_extractor'). Cannot be used with phase.
+    - run_before (bool): If True and target_stage is specified, run UDF before the target stage. Default: False.
+    - run_after (bool): If True and target_stage is specified, run UDF after the target stage. Default: False.
+    Examples:
+        --task 'udf:{"udf_function": "my_file.py:my_func", "target_stage": "text_embedder", "run_before": true}'
+        --task 'udf:{"udf_function": "def process(cm): return cm",
+            "target_stage": "image_extractor", "run_after": true}'
+\b
 Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
 """,
 )

nv-ingest-client 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

Potentially problematic release.

nv-ingest-client 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl