nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Further details are available on the package registry's advisory page.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import gzip
1
2
  import io
2
3
  import json
3
4
  import logging
@@ -6,6 +7,7 @@ import re
6
7
  from typing import Any
7
8
  from typing import Dict
8
9
  from typing import List
10
+ from typing import Optional
9
11
  from typing import Tuple
10
12
 
11
13
  try:
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
33
35
  jsonl_output_filepath: str,
34
36
  original_source_name_for_log: str,
35
37
  ensure_parent_dir_exists: bool = True,
38
+ compression: Optional[str] = None,
36
39
  ) -> Tuple[int, Dict[str, str]]:
37
40
  """
38
41
  Saves a list of extraction items (for a single source document) to a JSON Lines file.
@@ -50,6 +53,13 @@ def save_document_results_to_jsonl(
50
53
  if parent_dir:
51
54
  os.makedirs(parent_dir, exist_ok=True)
52
55
 
56
+ if compression == "gzip":
57
+ open_func = gzip.open
58
+ elif compression is None:
59
+ open_func = open
60
+ else:
61
+ raise ValueError(f"Unsupported compression type: {compression}")
62
+
53
63
  with io.BytesIO() as buffer:
54
64
  for extraction_item in doc_response_data:
55
65
  if USING_ORJSON:
@@ -60,7 +70,7 @@ def save_document_results_to_jsonl(
60
70
 
61
71
  count_items_written = len(doc_response_data)
62
72
 
63
- with open(jsonl_output_filepath, "wb") as f_jsonl:
73
+ with open_func(jsonl_output_filepath, "wb") as f_jsonl:
64
74
  f_jsonl.write(full_byte_content)
65
75
 
66
76
  logger.info(
@@ -25,13 +25,14 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
25
25
  from nv_ingest_client.cli.util.click import click_validate_batch_size
26
26
  from nv_ingest_client.cli.util.click import click_validate_file_exists
27
27
  from nv_ingest_client.cli.util.click import click_validate_task
28
- from nv_ingest_client.cli.util.processing import create_and_process_jobs
29
28
  from nv_ingest_client.cli.util.processing import report_statistics
30
29
  from nv_ingest_client.cli.util.system import configure_logging
31
30
  from nv_ingest_client.client import NvIngestClient
31
+ from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
32
32
  from nv_ingest_client.util.dataset import get_dataset_files
33
33
  from nv_ingest_client.util.dataset import get_dataset_statistics
34
34
  from nv_ingest_client.util.system import ensure_directory_with_permissions
35
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
35
36
 
36
37
  try:
37
38
  NV_INGEST_VERSION = version("nv_ingest")
@@ -73,6 +74,12 @@ logger = logging.getLogger(__name__)
73
74
  @click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
74
75
  @click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
75
76
  @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
77
+ @click.option(
78
+ "--api_version",
79
+ default="v1",
80
+ type=click.Choice(["v1", "v2"], case_sensitive=False),
81
+ help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
82
+ )
76
83
  @click.option(
77
84
  "--client_type",
78
85
  default="rest",
@@ -118,6 +125,8 @@ Example:
118
125
  --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
119
126
  --task 'embed'
120
127
  --task 'caption:{}'
128
+ --pdf_split_page_count 64 # Configure PDF splitting (requires --api_version v2)
129
+ --api_version v2 # Use V2 API for PDF splitting support
121
130
 
122
131
  \b
123
132
  Tasks and Options:
@@ -169,6 +178,22 @@ Tasks and Options:
169
178
  - split_length (int): Segment length. No default.
170
179
  - split_overlap (int): Segment overlap. No default.
171
180
  \b
181
+ - udf: Executes user-defined functions (UDFs) for custom processing logic.
182
+ Options:
183
+ - udf_function (str): UDF specification. Supports three formats:
184
+ 1. Inline function: 'def my_func(control_message): ...'
185
+ 2. Import path: 'my_module.my_function'
186
+ 3. File path: '/path/to/file.py:function_name' or '/path/to/file.py' (assumes 'process' function)
187
+ - udf_function_name (str): Name of the function to execute from the UDF specification. Required.
188
+ - target_stage (str): Specific pipeline stage name to target for UDF execution (e.g.,
189
+ 'text_extractor', 'text_embedder', 'image_extractor'). Cannot be used with phase.
190
+ - run_before (bool): If True and target_stage is specified, run UDF before the target stage. Default: False.
191
+ - run_after (bool): If True and target_stage is specified, run UDF after the target stage. Default: False.
192
+ Examples:
193
+ --task 'udf:{"udf_function": "my_file.py:my_func", "target_stage": "text_embedder", "run_before": true}'
194
+ --task 'udf:{"udf_function": "def process(cm): return cm",
195
+ "target_stage": "image_extractor", "run_after": true}'
196
+ \b
172
197
  Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
173
198
  """,
174
199
  )
@@ -190,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall ru
190
215
  )
191
216
  @click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
192
217
  @click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
218
+ @click.option(
219
+ "--pdf_split_page_count",
220
+ default=None,
221
+ type=int,
222
+ help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
223
+ )
193
224
  @click.option("--version", is_flag=True, help="Show version.")
194
225
  @click.pass_context
195
226
  def main(
@@ -198,6 +229,7 @@ def main(
198
229
  client_host: str,
199
230
  client_kwargs: str,
200
231
  client_port: int,
232
+ api_version: str,
201
233
  client_type: str,
202
234
  concurrency_n: int,
203
235
  dataset: str,
@@ -211,6 +243,7 @@ def main(
211
243
  collect_profiling_traces: bool,
212
244
  zipkin_host: str,
213
245
  zipkin_port: int,
246
+ pdf_split_page_count: int,
214
247
  task: [str],
215
248
  version: [bool],
216
249
  ):
@@ -221,7 +254,9 @@ def main(
221
254
 
222
255
  try:
223
256
  configure_logging(logger, log_level)
224
- logging.debug(f"nv-ingest-cli:params:\n{json.dumps(ctx.params, indent=2, default=repr)}")
257
+ # Sanitize CLI params before logging to avoid leaking secrets
258
+ _sanitized_params = sanitize_for_logging(dict(ctx.params))
259
+ logging.debug(f"nv-ingest-cli:params:\n{json.dumps(_sanitized_params, indent=2, default=repr)}")
225
260
 
226
261
  docs = list(doc)
227
262
  if dataset:
@@ -244,7 +279,20 @@ def main(
244
279
  logger.info(_msg)
245
280
 
246
281
  if not dry_run:
247
- logging.debug(f"Creating message client: {client_host} and port: {client_port} -> {client_kwargs}")
282
+ # Sanitize client kwargs (JSON string) before logging
283
+ try:
284
+ _client_kwargs_obj = json.loads(client_kwargs)
285
+ except Exception:
286
+ _client_kwargs_obj = {"raw": client_kwargs}
287
+
288
+ # Merge api_version into client_kwargs
289
+ _client_kwargs_obj["api_version"] = api_version
290
+
291
+ _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
292
+ logging.debug(
293
+ f"Creating message client: {client_host} and port: {client_port} -> "
294
+ f"{json.dumps(_sanitized_client_kwargs, indent=2, default=repr)}"
295
+ )
248
296
 
249
297
  if client_type == "rest":
250
298
  client_allocator = RestClient
@@ -257,20 +305,24 @@ def main(
257
305
  message_client_allocator=client_allocator,
258
306
  message_client_hostname=client_host,
259
307
  message_client_port=client_port,
260
- message_client_kwargs=json.loads(client_kwargs),
308
+ message_client_kwargs=_client_kwargs_obj,
261
309
  worker_pool_size=concurrency_n,
262
310
  )
263
311
 
264
312
  start_time_ns = time.time_ns()
265
- (total_files, trace_times, pages_processed, trace_ids) = create_and_process_jobs(
266
- files=docs,
313
+ handler = IngestJobHandler(
267
314
  client=ingest_client,
315
+ files=docs,
268
316
  tasks=task,
269
317
  output_directory=output_directory,
270
318
  batch_size=batch_size,
271
319
  fail_on_error=fail_on_error,
272
320
  save_images_separately=save_images_separately,
321
+ show_progress=True,
322
+ show_telemetry=True,
323
+ pdf_split_page_count=pdf_split_page_count,
273
324
  )
325
+ (total_files, trace_times, pages_processed, trace_ids) = handler.run()
274
326
 
275
327
  report_statistics(start_time_ns, trace_times, pages_processed, total_files)
276
328
 
@@ -110,6 +110,7 @@ class JobSpec:
110
110
  "job_id": str(self._job_id),
111
111
  "tasks": [task.to_dict() for task in self._tasks],
112
112
  "tracing_options": self._extended_options.get("tracing_options", {}),
113
+ "pdf_config": self._extended_options.get("pdf_config", {}),
113
114
  }
114
115
 
115
116
  @property
@@ -150,23 +151,48 @@ class JobSpec:
150
151
 
151
152
  def add_task(self, task) -> None:
152
153
  """
153
- Adds a task to the job specification.
154
+ Adds a task or list of tasks to the job specification.
155
+
156
+ Parameters
157
+ ----------
158
+ task : Task or list of Task
159
+ The task(s) to add to the job specification. Can be a single task or a list of tasks.
160
+ Each task must derive from the Task class and have a to_dict method.
161
+
162
+ Raises
163
+ ------
164
+ ValueError
165
+ If any task does not derive from the Task class.
166
+ """
167
+ # Handle both single tasks and lists of tasks
168
+ if isinstance(task, list):
169
+ # Process each task in the list
170
+ for single_task in task:
171
+ self._add_single_task(single_task)
172
+ else:
173
+ # Process single task
174
+ self._add_single_task(task)
175
+
176
+ def _add_single_task(self, task) -> None:
177
+ """
178
+ Adds a single task to the job specification with automatic task expansion.
154
179
 
155
180
  Parameters
156
181
  ----------
157
- task
158
- The task to add to the job specification. Assumes the task has a to_dict method.
182
+ task : Task
183
+ The task to add to the job specification.
159
184
 
160
185
  Raises
161
186
  ------
162
187
  ValueError
163
- If the task does not have a to_dict method.
188
+ If the task does not derive from the Task class.
164
189
  """
165
190
  if not isinstance(task, Task):
166
191
  raise ValueError("Task must derive from nv_ingest_client.primitives.Task class")
167
192
 
168
193
  self._tasks.append(task)
169
194
 
195
+ # Automatic task expansion for ExtractTask
170
196
  if isinstance(task, ExtractTask) and (task._extract_tables is True):
171
197
  self._tasks.append(TableExtractionTask())
172
198
  if isinstance(task, ExtractTask) and (task._extract_charts is True):
@@ -239,15 +265,16 @@ class BatchJobSpec:
239
265
  """
240
266
  from nv_ingest_client.util.util import create_job_specs_for_batch
241
267
  from nv_ingest_client.util.util import generate_matching_files
268
+ from nv_ingest_client.util.util import balanced_groups_flat_order
242
269
 
243
270
  if isinstance(files, str):
244
271
  files = [files]
245
272
 
246
273
  matching_files = list(generate_matching_files(files))
274
+ matching_files = balanced_groups_flat_order(matching_files)
247
275
  if not matching_files:
248
276
  logger.warning(f"No files found matching {files}.")
249
277
  return
250
-
251
278
  job_specs = create_job_specs_for_batch(matching_files)
252
279
  for job_spec in job_specs:
253
280
  self.add_job_spec(job_spec)
@@ -321,11 +348,6 @@ class BatchJobSpec:
321
348
  document_type : str, optional
322
349
  The document type used to filter job specifications. If not provided, the
323
350
  `document_type` is inferred from the task, or the task is applied to all job specifications.
324
-
325
- Raises
326
- ------
327
- ValueError
328
- If the task does not derive from the `Task` class.
329
351
  """
330
352
  if not isinstance(task, Task):
331
353
  raise ValueError("Task must derive from nv_ingest_client.primitives.Task class")
@@ -18,14 +18,18 @@ from .task_base import Task
18
18
  from .task_base import TaskType
19
19
  from .task_base import is_valid_task_type
20
20
  from .task_factory import task_factory
21
+ from .udf import UDFTask
21
22
 
22
23
  __all__ = [
23
24
  "AudioExtractionTask",
24
25
  "CaptionTask",
25
26
  "ChartExtractionTask",
27
+ "DedupTask",
28
+ "EmbedTask",
26
29
  "ExtractTask",
27
- "is_valid_task_type",
30
+ "FilterTask",
28
31
  "InfographicExtractionTask",
32
+ "is_valid_task_type",
29
33
  "SplitTask",
30
34
  "StoreEmbedTask",
31
35
  "StoreTask",
@@ -33,7 +37,5 @@ __all__ = [
33
37
  "Task",
34
38
  "task_factory",
35
39
  "TaskType",
36
- "DedupTask",
37
- "FilterTask",
38
- "EmbedTask",
40
+ "UDFTask",
39
41
  ]
@@ -10,33 +10,19 @@ import logging
10
10
  from typing import Dict
11
11
  from typing import Optional
12
12
 
13
- from pydantic import BaseModel
14
- from pydantic import ConfigDict
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskAudioExtraction
15
14
 
16
15
  from .task_base import Task
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
 
20
19
 
21
- class AudioExtractionSchema(BaseModel):
22
- auth_token: Optional[str] = None
23
- grpc_endpoint: Optional[str] = None
24
- http_endpoint: Optional[str] = None
25
- infer_protocol: Optional[str] = None
26
- function_id: Optional[str] = None
27
- use_ssl: Optional[bool] = None
28
- ssl_cert: Optional[str] = None
29
- segment_audio: Optional[bool] = None
30
-
31
- model_config = ConfigDict(extra="forbid")
32
- model_config["protected_namespaces"] = ()
33
-
34
-
35
20
  class AudioExtractionTask(Task):
36
21
  def __init__(
37
22
  self,
38
23
  auth_token: str = None,
39
24
  grpc_endpoint: str = None,
25
+ http_endpoint: str = None,
40
26
  infer_protocol: str = None,
41
27
  function_id: Optional[str] = None,
42
28
  use_ssl: bool = None,
@@ -45,13 +31,26 @@ class AudioExtractionTask(Task):
45
31
  ) -> None:
46
32
  super().__init__()
47
33
 
48
- self._auth_token = auth_token
49
- self._grpc_endpoint = grpc_endpoint
50
- self._infer_protocol = infer_protocol
51
- self._function_id = function_id
52
- self._use_ssl = use_ssl
53
- self._ssl_cert = ssl_cert
54
- self._segment_audio = segment_audio
34
+ # Use the API schema for validation
35
+ validated_data = IngestTaskAudioExtraction(
36
+ auth_token=auth_token,
37
+ grpc_endpoint=grpc_endpoint,
38
+ http_endpoint=http_endpoint,
39
+ infer_protocol=infer_protocol,
40
+ function_id=function_id,
41
+ use_ssl=use_ssl,
42
+ ssl_cert=ssl_cert,
43
+ segment_audio=segment_audio,
44
+ )
45
+
46
+ self._auth_token = validated_data.auth_token
47
+ self._grpc_endpoint = validated_data.grpc_endpoint
48
+ self._http_endpoint = validated_data.http_endpoint
49
+ self._infer_protocol = validated_data.infer_protocol
50
+ self._function_id = validated_data.function_id
51
+ self._use_ssl = validated_data.use_ssl
52
+ self._ssl_cert = validated_data.ssl_cert
53
+ self._segment_audio = validated_data.segment_audio
55
54
 
56
55
  def __str__(self) -> str:
57
56
  """
@@ -64,6 +63,8 @@ class AudioExtractionTask(Task):
64
63
  info += " auth_token: [redacted]\n"
65
64
  if self._grpc_endpoint:
66
65
  info += f" grpc_endpoint: {self._grpc_endpoint}\n"
66
+ if self._http_endpoint:
67
+ info += f" http_endpoint: {self._http_endpoint}\n"
67
68
  if self._infer_protocol:
68
69
  info += f" infer_protocol: {self._infer_protocol}\n"
69
70
  if self._function_id:
@@ -89,6 +90,9 @@ class AudioExtractionTask(Task):
89
90
  if self._grpc_endpoint:
90
91
  task_properties["grpc_endpoint"] = self._grpc_endpoint
91
92
 
93
+ if self._http_endpoint:
94
+ task_properties["http_endpoint"] = self._http_endpoint
95
+
92
96
  if self._infer_protocol:
93
97
  task_properties["infer_protocol"] = self._infer_protocol
94
98
 
@@ -8,25 +8,14 @@
8
8
 
9
9
  import logging
10
10
  from typing import Dict
11
- from typing import Optional
12
11
 
13
- from pydantic import ConfigDict, BaseModel
14
12
 
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
15
14
  from .task_base import Task
16
15
 
17
16
  logger = logging.getLogger(__name__)
18
17
 
19
18
 
20
- class CaptionTaskSchema(BaseModel):
21
- api_key: Optional[str] = None
22
- endpoint_url: Optional[str] = None
23
- prompt: Optional[str] = None
24
- model_name: Optional[str] = None
25
-
26
- model_config = ConfigDict(extra="forbid")
27
- model_config["protected_namespaces"] = ()
28
-
29
-
30
19
  class CaptionTask(Task):
31
20
  def __init__(
32
21
  self,
@@ -37,10 +26,15 @@ class CaptionTask(Task):
37
26
  ) -> None:
38
27
  super().__init__()
39
28
 
40
- self._api_key = api_key
41
- self._endpoint_url = endpoint_url
42
- self._prompt = prompt
43
- self._model_name = model_name
29
+ # Use the API schema for validation
30
+ validated_data = IngestTaskCaptionSchema(
31
+ api_key=api_key, endpoint_url=endpoint_url, prompt=prompt, model_name=model_name
32
+ )
33
+
34
+ self._api_key = validated_data.api_key
35
+ self._endpoint_url = validated_data.endpoint_url
36
+ self._prompt = validated_data.prompt
37
+ self._model_name = validated_data.model_name
44
38
 
45
39
  def __str__(self) -> str:
46
40
  """
@@ -9,35 +9,41 @@
9
9
  import logging
10
10
  from typing import Dict
11
11
 
12
- from pydantic import BaseModel
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
13
13
 
14
14
  from .task_base import Task
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
- class ChartExtractionSchema(BaseModel):
20
- class Config:
21
- extra = "forbid"
22
-
23
-
24
19
  class ChartExtractionTask(Task):
25
20
  """
26
21
  Object for chart extraction task
27
22
  """
28
23
 
29
- def __init__(self) -> None:
24
+ def __init__(self, params: dict = None) -> None:
30
25
  """
31
- Setup Dedup Task Config
26
+ Setup Chart Extraction Task Config
32
27
  """
33
28
  super().__init__()
34
29
 
30
+ # Handle None params by converting to empty dict for backward compatibility
31
+ if params is None:
32
+ params = {}
33
+
34
+ # Use the API schema for validation
35
+ validated_data = IngestTaskChartExtraction(params=params)
36
+
37
+ self._params = validated_data.params
38
+
35
39
  def __str__(self) -> str:
36
40
  """
37
41
  Returns a string with the object's config and run time state
38
42
  """
39
43
  info = ""
40
- info += "chart extraction task\n"
44
+ info += "Chart Extraction Task:\n"
45
+ if self._params:
46
+ info += f" params: {self._params}\n"
41
47
  return info
42
48
 
43
49
  def to_dict(self) -> Dict:
@@ -46,7 +52,7 @@ class ChartExtractionTask(Task):
46
52
  """
47
53
 
48
54
  task_properties = {
49
- "params": {},
55
+ "params": self._params,
50
56
  }
51
57
 
52
58
  return {"type": "chart_data_extract", "task_properties": task_properties}
@@ -10,29 +10,13 @@ import logging
10
10
  from typing import Dict
11
11
  from typing import Literal
12
12
 
13
- from pydantic import BaseModel, field_validator
14
-
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
15
14
 
16
15
  from .task_base import Task
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
 
20
19
 
21
- class DedupTaskSchema(BaseModel):
22
- content_type: str = "image"
23
- filter: bool = False
24
-
25
- @field_validator("content_type")
26
- def content_type_must_be_valid(cls, v):
27
- valid_criteria = ["image"]
28
- if v not in valid_criteria:
29
- raise ValueError(f"content_type must be one of {valid_criteria}")
30
- return v
31
-
32
- class Config:
33
- extra = "forbid"
34
-
35
-
36
20
  class DedupTask(Task):
37
21
  """
38
22
  Object for document dedup task
@@ -49,8 +33,15 @@ class DedupTask(Task):
49
33
  Setup Dedup Task Config
50
34
  """
51
35
  super().__init__()
52
- self._content_type = content_type
53
- self._filter = filter
36
+
37
+ # Use the API schema for validation
38
+ validated_data = IngestTaskDedupSchema(
39
+ content_type=content_type,
40
+ params={"filter": filter},
41
+ )
42
+
43
+ self._content_type = validated_data.content_type
44
+ self._filter = validated_data.params.filter
54
45
 
55
46
  def __str__(self) -> str:
56
47
  """
@@ -58,7 +49,7 @@ class DedupTask(Task):
58
49
  """
59
50
  info = ""
60
51
  info += "Dedup Task:\n"
61
- info += f" content_type: {self._content_type}\n"
52
+ info += f" content_type: {self._content_type.value}\n"
62
53
  info += f" filter: {self._filter}\n"
63
54
  return info
64
55
 
@@ -69,7 +60,7 @@ class DedupTask(Task):
69
60
  dedup_params = {"filter": self._filter}
70
61
 
71
62
  task_properties = {
72
- "content_type": self._content_type,
63
+ "content_type": self._content_type.value,
73
64
  "params": dedup_params,
74
65
  }
75
66