nv-ingest-client 2025.10.18.dev20251018.tar.gz → 2025.11.14.dev20251114.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client.egg-info → nv_ingest_client-2025.11.14.dev20251114}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py +83 -9
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/ingest_job_handler.py +28 -6
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py +128 -29
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py +3 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py +4 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py +44 -20
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/LICENSE +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/README.md +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/util.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/version.py +0 -0
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py CHANGED

@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
 logger = logging.getLogger(__name__)


+def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Compute resident_time entries from entry/exit pairs if not already present.
+
+    This ensures consistency between split jobs (where server computes resident_time)
+    and non-split jobs (where we compute it client-side).
+
+    Parameters
+    ----------
+    trace_dict : Dict[str, Any]
+        Trace dictionary with entry/exit pairs
+
+    Returns
+    -------
+    Dict[str, Any]
+        Trace dictionary with resident_time entries added
+    """
+    if not trace_dict or not isinstance(trace_dict, dict):
+        return trace_dict
+
+    # Check if resident_time already exists (server-computed for split jobs)
+    has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
+    if has_resident:
+        return trace_dict  # Already computed by server
+
+    # Compute resident_time from entry/exit pairs
+    result = dict(trace_dict)
+    stages = set()
+
+    # Find all unique stages
+    for key in trace_dict:
+        if key.startswith("trace::entry::"):
+            stages.add(key.replace("trace::entry::", ""))
+
+    # Compute resident_time for each stage
+    for stage in stages:
+        entry_key = f"trace::entry::{stage}"
+        exit_key = f"trace::exit::{stage}"
+        if entry_key in trace_dict and exit_key in trace_dict:
+            result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
+
+    return result
+
+
 class DataDecodeException(Exception):
     """
     Exception raised for errors in decoding data.
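Illustrative only, not part of the diff: a quick check of how the new helper behaves, assuming the private import path below and numeric (e.g., epoch-based) timestamps.

    from nv_ingest_client.client.client import _compute_resident_times  # private helper; path assumed

    trace = {
        "trace::entry::pdf_extractor": 1_000,
        "trace::exit::pdf_extractor": 1_750,
        "trace::entry::embedder": 2_000,
        "trace::exit::embedder": 2_250,
    }
    enriched = _compute_resident_times(trace)
    assert enriched["trace::resident_time::pdf_extractor"] == 750
    assert enriched["trace::resident_time::embedder"] == 250
    # A second pass is a no-op: existing resident_time keys short-circuit the computation.
    assert _compute_resident_times(enriched) == enriched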
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
         stream_to_callback_only: bool,
         return_full_response: bool,
         verbose: bool = False,
+        return_traces: bool = False,
     ):
         """
         Initializes the concurrent processor.
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
             initiating job submission or fetching fails for a batch.
         verbose : bool, optional
             If True, enables detailed debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level trace data for each completed job is stored.

         Raises
         ------
@@ -142,12 +189,14 @@ class _ConcurrentProcessor:
         self.stream_to_callback_only = stream_to_callback_only
         self.return_full_response = return_full_response
         self.verbose = verbose
+        self.return_traces = return_traces

         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)
+        self.traces: List[Optional[Dict[str, Any]]] = []

         # --- Initial Checks ---
         if not self.job_queue_id:
@@ -247,6 +296,14 @@ class _ConcurrentProcessor:
             # When requested, return the full response envelope (includes 'trace' and 'annotations')
             self.results.append(result_data if self.return_full_response else result_data.get("data"))

+            # Extract trace data for all successful (non-failed) jobs
+            if self.return_traces and not is_failed:
+                trace_payload = result_data.get("trace") if result_data else None
+                # Compute resident_time if not already present (for consistency)
+                if trace_payload:
+                    trace_payload = _compute_resident_times(trace_payload)
+                self.traces.append(trace_payload if trace_payload else None)
+
             # Cleanup retry count if it exists
             if job_index in self.retry_counts:
                 del self.retry_counts[job_index]
@@ -438,7 +495,7 @@ class _ConcurrentProcessor:

         return batch_futures_dict, normalized_job_indices

-    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.

@@ -581,7 +638,7 @@ class _ConcurrentProcessor:
         # --- Final Logging ---
         self._log_final_status(total_jobs)

-        return self.results, self.failures
+        return self.results, self.failures, self.traces if self.return_traces else []


 class NvIngestClient:
@@ -1212,7 +1269,12 @@ class NvIngestClient:
         stream_to_callback_only: bool = False,
         return_full_response: bool = False,
         verbose: bool = False,
-    ):
+        return_traces: bool = False,
+    ) -> Union[
+        List[Any],
+        Tuple[List[Any], List[Tuple[str, str]]],
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
+    ]:
         """
         Submit and fetch multiple jobs concurrently.

@@ -1247,6 +1309,8 @@ class NvIngestClient:
             Ignored when stream_to_callback_only=True. Default is False.
         verbose : bool, optional
             If True, enable debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level aggregated trace metrics are extracted and returned. Default is False.

         Returns
         -------
@@ -1254,6 +1318,9 @@ class NvIngestClient:
             List of successful job results when `return_failures` is False.
         results, failures : tuple
             Tuple of (successful results, failure tuples) when `return_failures` is True.
+        results, failures, traces : tuple
+            Tuple of (successful results, failure tuples, trace dicts) when both
+            `return_failures` and `return_traces` are True.

         Raises
         ------
@@ -1266,7 +1333,12 @@ class NvIngestClient:

         # Handle empty input
         if not job_indices:
-
+            if return_failures and return_traces:
+                return [], [], []
+            elif return_failures:
+                return [], []
+            else:
+                return []

         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)
@@ -1289,12 +1361,17 @@ class NvIngestClient:
             stream_to_callback_only=stream_to_callback_only,
             return_full_response=return_full_response,
             verbose=verbose,
+            return_traces=return_traces,
         )

-        results, failures = processor.run()
+        results, failures, traces = processor.run()

-        if return_failures:
+        if return_failures and return_traces:
+            return results, failures, traces
+        elif return_failures:
             return results, failures
+        elif return_traces:
+            return results, traces

         if failures:
             logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
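For orientation, not part of the diff: a sketch of unpacking the new return shapes; `client`, `job_indices`, and the queue name are placeholders.

    # Both flags set: fixed three-tuple (results, failures, traces).
    results, failures, traces = client.process_jobs_concurrently(
        job_indices, job_queue_id="ingest_task_queue", return_failures=True, return_traces=True
    )

    # Traces only: two-tuple (results, traces); failures are logged, not returned.
    results, traces = client.process_jobs_concurrently(
        job_indices, job_queue_id="ingest_task_queue", return_traces=True
    )

    for trace in traces:
        if trace:  # entries are None when a job produced no trace payload
            resident = {k: v for k, v in trace.items() if k.startswith("trace::resident_time::")}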
@@ -1628,9 +1705,6 @@ class NvIngestClient:
             )
             logger.error(error_msg)
             failures.append((self._job_index_to_job_spec[job_id].source_id, str(e)))
-        finally:
-            # Clean up the job spec mapping
-            del self._job_index_to_job_spec[job_id]

         if return_failures:
             return results, failures
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/ingest_job_handler.py CHANGED

@@ -323,18 +323,40 @@ class IngestJobHandler:

         futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
         for future in as_completed(futures_dict.keys()):
+            pages_per_sec = None
             try:
                 # Block as each future completes; this mirrors CLI behavior
                 future_response, trace_id = self._handle_future_result(future)
                 job_id: str = futures_dict[future]
                 trace_ids[job_id_map[job_id]] = trace_id

-
-
-
-
-
-
+                # Extract page count: prefer V2 metadata location, fall back to V1
+                page_count = None
+                source_name = None
+
+                # Try V2 metadata location first (top-level metadata.total_pages)
+                if "metadata" in future_response and future_response["metadata"]:
+                    response_metadata = future_response["metadata"]
+                    page_count = response_metadata.get("total_pages")
+                    source_name = response_metadata.get("original_source_name")
+
+                # Fall back to V1 location (first data element's hierarchy.page_count)
+                if page_count is None and future_response.get("data"):
+                    try:
+                        first_page_metadata = future_response["data"][0]["metadata"]
+                        page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
+                        source_name = first_page_metadata["source_metadata"]["source_name"]
+                    except (KeyError, IndexError, TypeError):
+                        # If we can't extract from V1 location, use defaults
+                        pass
+
+                # Use extracted values or defaults
+                if page_count is None:
+                    page_count = 0  # Default if not found
+                if source_name is None:
+                    source_name = "unknown_source"
+
+                file_page_counts: Dict[str, int] = {source_name: page_count}

                 if self.output_directory:
                     self._save_response_data(
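Response shapes assumed by the V2-first/V1-fallback lookup above (field names taken from the diff; values hypothetical):

    # V2 envelope: page count and source name live in the top-level metadata block.
    v2_response = {
        "metadata": {"total_pages": 12, "original_source_name": "report.pdf"},
        "data": [],
    }

    # V1 envelope: the same facts hang off the first data element's metadata.
    v1_response = {
        "metadata": None,
        "data": [
            {
                "metadata": {
                    "content_metadata": {"hierarchy": {"page_count": 12}},
                    "source_metadata": {"source_name": "report.pdf"},
                }
            }
        ],
    }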
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py CHANGED

@@ -402,16 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of response['data'] for each doc
-        List[Dict[str, Any]],  # In-memory: Full response envelopes when return_full_response=True
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.

@@ -421,24 +414,30 @@ class Ingestor:
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods.
-            'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-            'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
-            through to process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
             Optional flags include `include_parent_trace_ids=True` to also return
-            parent job trace identifiers
+            parent job trace identifiers (V2 API only).

         Returns
         -------
-
-
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg

-
-
-
-
-        list of parent trace IDs is appended to the return value.
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
@@ -574,7 +573,8 @@ class Ingestor:
         if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
             self._client.enable_telemetry(bool(enable_telemetry))

-        results, failures = self._client.process_jobs_concurrently(
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
@@ -583,9 +583,17 @@ class Ingestor:
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )

+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()

@@ -648,13 +656,18 @@ class Ingestor:

         parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []

-
-
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
         if return_failures:
-
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
         if include_parent_trace_ids:
-
-
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results

     def ingest_async(self, **kwargs: Any) -> Future:
         """
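Sketch of call sites implied by the fixed ordering above (document paths are placeholders; the Ingestor import and documents= kwarg match the get_status docstring later in this file):

    from nv_ingest_client.client import Ingestor

    ingestor = Ingestor(documents=["a.pdf", "b.pdf"]).extract().embed()

    results = ingestor.ingest()                                # results only
    results, failures = ingestor.ingest(return_failures=True)  # (results, failures)
    results, traces = ingestor.ingest(return_traces=True)      # (results, traces)
    results, failures, traces = ingestor.ingest(               # full tuple, fixed order
        return_failures=True, return_traces=True
    )
    # include_parent_trace_ids=True appends parent trace IDs as the final element.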
@@ -681,6 +694,7 @@ class Ingestor:
         submitted_futures = set(future_to_job_id.keys())
         completed_futures = set()
         future_results = []
+        vdb_future = None

         def _done_callback(future):
             job_id = future_to_job_id[future]
@@ -702,9 +716,10 @@ class Ingestor:
             future.add_done_callback(_done_callback)

         if self._vdb_bulk_upload:
-
+            executor = ThreadPoolExecutor(max_workers=1)
+            vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)

-        return combined_future
+        return combined_future if not vdb_future else vdb_future

     @ensure_job_specs
     def _prepare_ingest_run(self):
@@ -821,6 +836,7 @@ class Ingestor:
         extract_tables = kwargs.pop("extract_tables", True)
         extract_charts = kwargs.pop("extract_charts", True)
         extract_page_as_image = kwargs.pop("extract_page_as_image", False)
+        table_output_format = kwargs.pop("table_output_format", "markdown")

         # Defaulting to False since enabling infographic extraction reduces throughput.
         # Users have to set to True if infographic extraction is required.
@@ -843,6 +859,7 @@ class Ingestor:
             extract_charts=extract_charts,
             extract_infographics=extract_infographics,
             extract_page_as_image=extract_page_as_image,
+            table_output_format=table_output_format,
             **kwargs,
         )

@@ -1346,3 +1363,85 @@ class Ingestor:
         terminal_jobs = self.completed_jobs() + self.failed_jobs() + self.cancelled_jobs()

         return len(self._job_states) - terminal_jobs
+
+    def get_status(self) -> Dict[str, str]:
+        """
+        Returns a dictionary mapping document identifiers to their current status in the pipeline.
+
+        This method is designed for use with async ingestion to poll the status of submitted jobs.
+        For each document submitted to the ingestor, the method returns its current processing state.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary where:
+            - Keys are document identifiers (source names or source IDs)
+            - Values are status strings representing the current state:
+                * "pending": Job created but not yet submitted
+                * "submitted": Job submitted and waiting for processing
+                * "processing": Job is currently being processed
+                * "completed": Job finished successfully
+                * "failed": Job encountered an error
+                * "cancelled": Job was cancelled
+                * "unknown": Job state could not be determined (initial state)
+
+        Examples
+        --------
+        >>> ingestor = Ingestor(documents=["doc1.pdf", "doc2.pdf"], client=client)
+        >>> ingestor.extract().embed()
+        >>> future = ingestor.ingest_async()
+        >>>
+        >>> # Poll status while processing
+        >>> status = ingestor.get_status()
+        >>> print(status)
+        {'doc1.pdf': 'processing', 'doc2.pdf': 'submitted'}
+        >>>
+        >>> # Check again after some time
+        >>> status = ingestor.get_status()
+        >>> print(status)
+        {'doc1.pdf': 'completed', 'doc2.pdf': 'processing'}
+
+        Notes
+        -----
+        - This method is most useful when called after `ingest_async()` to track progress
+        - If called before any jobs are submitted, returns an empty dictionary or
+          documents with "unknown" status
+        - The method accesses internal job state from the client, so it reflects
+          the most current known state
+        """
+        status_dict = {}
+
+        if not self._job_states:
+            # If job states haven't been initialized yet (before ingest_async is called)
+            # Return unknown status for all documents
+            for doc in self._documents:
+                doc_name = os.path.basename(doc) if isinstance(doc, str) else str(doc)
+                status_dict[doc_name] = "unknown"
+            return status_dict
+
+        # Map job IDs to their states and source identifiers
+        for job_id, job_state in self._job_states.items():
+            # Get the job spec to find the source identifier
+            job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+            if job_spec:
+                # Use source_name as the key (the document name)
+                source_identifier = job_spec.source_name
+            else:
+                # Fallback to job_id if we can't find the spec
+                source_identifier = f"job_{job_id}"
+
+            # Map the JobStateEnum to a user-friendly string
+            state_mapping = {
+                JobStateEnum.PENDING: "pending",
+                JobStateEnum.SUBMITTED_ASYNC: "submitted",
+                JobStateEnum.SUBMITTED: "submitted",
+                JobStateEnum.PROCESSING: "processing",
+                JobStateEnum.COMPLETED: "completed",
+                JobStateEnum.FAILED: "failed",
+                JobStateEnum.CANCELLED: "cancelled",
+            }
+
+            status_dict[source_identifier] = state_mapping.get(job_state.state, "unknown")
+
+        return status_dict
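Beyond the doctest-style examples above, a hedged polling sketch (interval arbitrary; `ingestor` built as in the docstring example):

    import time

    future = ingestor.ingest_async()
    while not future.done():
        in_flight = {doc: s for doc, s in ingestor.get_status().items()
                     if s not in ("completed", "failed", "cancelled")}
        print(f"{len(in_flight)} document(s) still in flight: {in_flight}")
        time.sleep(2)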
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py CHANGED

@@ -18,6 +18,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTask
 from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
 from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
 from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
+from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics

@@ -199,6 +200,8 @@ class JobSpec:
             self._tasks.append(ChartExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_infographics is True):
             self._tasks.append(InfographicExtractionTask())
+        if isinstance(task, ExtractTask) and (task._extract_method in {"ocr"}):
+            self._tasks.append(OCRExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
             extract_audio_params = task._extract_audio_params or {}
             self._tasks.append(AudioExtractionTask(**extract_audio_params))
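Illustrative, not from the diff: with this branch, an "ocr" extract method auto-enqueues the companion task, mirroring the existing chart/infographic pattern. The ExtractTask keyword arguments below are assumptions.

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    task = ExtractTask(document_type="pdf", extract_method="ocr")  # kwargs assumed
    job_spec.add_task(task)  # job_spec: an existing JobSpec instance
    # The job spec's task list now also contains an auto-appended OCRExtractionTask().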
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py CHANGED

@@ -36,6 +36,9 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
+        dimensions: Optional[int] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -76,6 +79,9 @@ class EmbedTask(Task):
             image_elements_modality=image_elements_modality,
             structured_elements_modality=structured_elements_modality,
             audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
+            dimensions=dimensions,
         )

         self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +92,9 @@ class EmbedTask(Task):
         self._image_elements_modality = validated_data.image_elements_modality
         self._structured_elements_modality = validated_data.structured_elements_modality
         self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
+        self._dimensions = validated_data.dimensions

     def __str__(self) -> str:
         """
@@ -114,6 +123,12 @@ class EmbedTask(Task):
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
+        if self._dimensions:
+            info += f"  dimensions: {self._dimensions}\n"
         return info

     def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +164,13 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality

+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self._result_target_field
+
+        if self._dimensions:
+            task_properties["dimensions"] = self._dimensions
+
         return {"type": "embed", "task_properties": task_properties}
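A sketch of the new embed knobs end to end (values hypothetical; field semantics inferred from the names): each optional field is serialized only when set, so payloads that omit them are unchanged.

    from nv_ingest_client.primitives.tasks.embed import EmbedTask

    task = EmbedTask(
        custom_content_field="summary_text",   # read the text to embed from this field (assumed semantics)
        result_target_field="summary_vector",  # write the embedding to this field (assumed semantics)
        dimensions=2048,                       # requested embedding dimensionality
    )
    payload = task.to_dict()
    assert payload["type"] == "embed"
    assert payload["task_properties"]["dimensions"] == 2048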
nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py ADDED

@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# pylint: disable=too-few-public-methods
+# pylint: disable=too-many-arguments
+
+import logging
+from typing import Dict
+
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
+
+logger = logging.getLogger(__name__)
+
+
+class OCRExtractionTask(Task):
+    """
+    Object for ocr extraction task
+    """
+
+    def __init__(self, params: dict = None) -> None:
+        """
+        Setup OCR Extraction Task Config
+        """
+        super().__init__()
+
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+
+        # Use the API schema for validation
+        validated_data = IngestTaskOCRExtraction(params=params)
+
+        self._params = validated_data.params
+
+    def __str__(self) -> str:
+        """
+        Returns a string with the object's config and run time state
+        """
+        info = ""
+        info += "OCR Extraction Task:\n"
+        info += f"  params: {self._params}\n"
+        return info
+
+    def to_dict(self) -> Dict:
+        """
+        Convert to a dict for submission to redis
+        """
+        task_properties = {
+            "params": self._params,
+        }
+
+        return {"type": "ocr_data_extract", "task_properties": task_properties}
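Not part of the diff: constructing the new task directly shows the wire format it emits (params contents hypothetical).

    from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask

    task = OCRExtractionTask(params={"lang": "en"})
    print(task.to_dict())
    # Expected shape: {'type': 'ocr_data_extract', 'task_properties': {'params': {'lang': 'en'}}}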
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py CHANGED

@@ -11,6 +11,7 @@ import logging
 import importlib
 import inspect
 import ast
+import re
 from typing import Dict, Optional, Union

 from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
     3. File path: '/path/to/file.py:my_function'
     4. Legacy import path: 'my_module.my_function' (function name only, no imports)
     """
-
-
-
+    # Default to treating as inline unless it clearly matches a
+    # module/file specification. This avoids misclassifying inline code that
+    # contains colons, imports, or annotations before the def line.

-
-
-
+    spec = udf_function_spec.strip()
+
+    # 1) File path with function: /path/to/file.py:function_name
+    if ".py:" in spec:
+        file_path, function_name = spec.split(":", 1)
         return _extract_function_with_context(file_path, function_name)

-
-
+    # 2) File path without function name is an explicit error
+    if spec.endswith(".py"):
         raise ValueError(
-            f"File path '{udf_function_spec}' is missing function name. "
-            f"Use format 'file.py:function_name' to specify which function to use."
+            f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
         )

-
-
-
-
-
+    # 3) Module path with colon: my.module:function
+    # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
+    # no whitespace/newlines.
+    module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
+    if module_colon_pattern.match(spec):
+        module_path, function_name = spec.split(":", 1)
         try:
-            # Import the module to get its file path
             module = importlib.import_module(module_path)
             module_file = inspect.getfile(module)
-
-            # Extract the function with full module context
             return _extract_function_with_context(module_file, function_name)
-
         except ImportError as e:
             raise ValueError(f"Failed to import module '{module_path}': {e}")
         except Exception as e:
             raise ValueError(f"Failed to resolve module path '{module_path}': {e}")

-
-
-
-        func = _load_function_from_import_path(
-
-        # Get the source code of the function only
+    # 4) Legacy import path: my.module.function (no colon)
+    legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
+    if legacy_import_pattern.match(spec):
+        func = _load_function_from_import_path(spec)
         try:
             source = inspect.getsource(func)
             return source
         except (OSError, TypeError) as e:
             raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")

-
-
+    # 5) Default: treat as inline UDF source (entire string)
+    return udf_function_spec


 class UDFTask(Task):
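To see why the stricter patterns matter, a standalone sketch of how the four spec formats classify; the regexes are copied from the diff and the branch order mirrors _resolve_udf_function.

    import re

    module_colon = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
    legacy_import = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")

    def classify(spec: str) -> str:
        if ".py:" in spec:
            return "file path + function"
        if spec.endswith(".py"):
            return "error: missing function name"
        if module_colon.match(spec):
            return "module path with colon"
        if legacy_import.match(spec):
            return "legacy import path"
        return "inline source (default)"

    assert classify("/path/to/file.py:my_function") == "file path + function"
    assert classify("my.module:my_function") == "module path with colon"
    assert classify("my_module.my_function") == "legacy import path"
    assert classify("import re\ndef udf(ctx): ...") == "inline source (default)"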
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py CHANGED

@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
     "txt": DocumentTypeEnum.TXT,
     "mp3": DocumentTypeEnum.MP3,
     "wav": DocumentTypeEnum.WAV,
+    "mp4": DocumentTypeEnum.MP4,
+    "mov": DocumentTypeEnum.MOV,
+    "avi": DocumentTypeEnum.AVI,
+    "mkv": DocumentTypeEnum.MKV,
     # Add more as needed
 }

{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py CHANGED

@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
 logger = logging.getLogger(__name__)

 CONSISTENCY = CONSISTENCY_BOUNDED
+DENSE_INDEX_NAME = "dense_index"

 pandas_reader_map = {
     ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
     index_params = MilvusClient.prepare_index_params()
     index_params.add_index(
         field_name="vector",
-        index_name="dense_index",
+        index_name=DENSE_INDEX_NAME,
         index_type="FLAT",
         metric_type="L2",
     )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
     if local_index:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="FLAT",
             metric_type="L2",
         )
@@ -321,7 +322,7 @@
     if gpu_index:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="GPU_CAGRA",
             metric_type="L2",
             params={
@@ -335,7 +336,7 @@
     else:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="HNSW",
             metric_type="L2",
             params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
     if isinstance(indexes, dict):
         # Old Milvus behavior (< 2.5.6)
         for k, v in indexes.items():
-            if k[1] == "dense_index" and hasattr(v, "_index_type"):
+            if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
                 d_idx = v._index_type
             if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
                 s_idx = v._index_type
@@ -504,7 +505,7 @@
         index_name = getattr(idx, "index_name", None)
         index_type = getattr(idx, "index_type", None)

-        if index_name == "dense_index":
+        if index_name == DENSE_INDEX_NAME:
             d_idx = index_type
         if sparse and index_name == "sparse_index":
             s_idx = index_type
@@ -776,13 +777,13 @@ def bulk_insert_milvus(
     t_bulk_start = time.time()
     task_ids = []

-
-    utility.do_bulk_insert(
+    for files in writer.batch_files:
+        task_id = utility.do_bulk_insert(
             collection_name=collection_name,
-        files=
+            files=files,
             consistency_level=CONSISTENCY,
         )
-
+        task_ids.append(task_id)

     while len(task_ids) > 0:
         time.sleep(1)
@@ -900,30 +901,32 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
     (refer to MilvusClient.refresh_load for bulk inserts).
     """
     client.flush(collection_name)
-    index_names = utility.list_indexes(collection_name)
+    # index_names = utility.list_indexes(collection_name)
     indexed_rows = 0
-
+    # observe dense_index, all indexes get populated simultaneously
+    for index_name in [DENSE_INDEX_NAME]:
         indexed_rows = 0
-
+        expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
+        while indexed_rows < expected_rows:
             pos_movement = 10  # number of iteration allowed without noticing an increase in indexed_rows
             for i in range(20):
-
+                current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                 time.sleep(1)
                 logger.info(
-                    f"
+                    f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
                 )
-                if
-                    indexed_rows =
+                if current_indexed_rows == expected_rows:
+                    indexed_rows = current_indexed_rows
                     break
                 # check if indexed_rows is staying the same, too many times means something is wrong
-                if
+                if current_indexed_rows == indexed_rows:
                     pos_movement -= 1
                 else:
                     pos_movement = 10
                 # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
                 if pos_movement == 0:
-                    raise ValueError("Rows are not getting indexed as expected")
-                indexed_rows =
+                    raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
+                indexed_rows = current_indexed_rows
     return indexed_rows
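The stall detection above generalizes to this standalone sketch (not library code): poll a monotone counter, reset the patience budget on progress, and fail after too many stagnant reads.

    import time
    from typing import Callable

    def poll_until(read_count: Callable[[], int], target: int,
                   patience: int = 10, interval: float = 1.0) -> int:
        last, stagnant = -1, 0
        while True:
            current = read_count()
            if current >= target:
                return current
            stagnant = stagnant + 1 if current == last else 0
            if stagnant >= patience:
                raise ValueError(f"progress stalled at {current}/{target}")
            last = current
            time.sleep(interval)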
@@ -2057,3 +2060,24 @@ class Milvus(VDB):
             self.write_to_index(records, collection_name=coll_name, **sub_write_params)
         else:
             raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
+
+    def run_async(self, records):
+        collection_name, create_params = self.get_connection_params()
+        _, write_params = self.get_write_params()
+        if isinstance(collection_name, str):
+            logger.info(f"creating index - {collection_name}")
+            self.create_index(collection_name=collection_name, **create_params)
+            records = records.result()
+            logger.info(f"writing to index, for collection - {collection_name}")
+            self.write_to_index(records, **write_params)
+        elif isinstance(collection_name, dict):
+            split_params_list = _dict_to_params(collection_name, write_params)
+            for sub_params in split_params_list:
+                coll_name, sub_write_params = sub_params
+                sub_write_params.pop("collection_name", None)
+                self.create_index(collection_name=coll_name, **create_params)
+                self.write_to_index(records, collection_name=coll_name, **sub_write_params)
+        else:
+            raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
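Sketch of the intended call pattern for run_async (construction arguments assumed): it blocks on records.result() only after the collection and index exist, which is why ingest_async submits it to a single-worker executor rather than calling it inline.

    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=2)
    # Stand-in for the pipeline's combined future (payload hypothetical).
    records_future = executor.submit(lambda: [{"text": "chunk", "vector": [0.0] * 8}])

    # milvus = Milvus(collection_name="nv_ingest_collection", ...)  # args assumed
    # vdb_future = executor.submit(milvus.run_async, records_future)
    # records = vdb_future.result()  # create_index, then write_to_index, then records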
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt CHANGED

@@ -34,6 +34,7 @@ src/nv_ingest_client/primitives/tasks/embed.py
 src/nv_ingest_client/primitives/tasks/extract.py
 src/nv_ingest_client/primitives/tasks/filter.py
 src/nv_ingest_client/primitives/tasks/infographic_extraction.py
+src/nv_ingest_client/primitives/tasks/ocr_extraction.py
 src/nv_ingest_client/primitives/tasks/split.py
 src/nv_ingest_client/primitives/tasks/store.py
 src/nv_ingest_client/primitives/tasks/table_extraction.py