PyPI - nv-ingest-client - Versions diffs - 2025.11.5.dev20251105__py3-none-any.whl → 2025.11.17.dev20251117__py3-none-any.whl - Mend

nv-ingest-client 2025.11.5.dev20251105__py3-none-any.whl → 2025.11.17.dev20251117__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

nv_ingest_client/client/interface.py CHANGED Viewed

@@ -694,6 +694,7 @@ class Ingestor:
         submitted_futures = set(future_to_job_id.keys())
         completed_futures = set()
         future_results = []
+        vdb_future = None
         def _done_callback(future):
             job_id = future_to_job_id[future]
@@ -715,9 +716,10 @@ class Ingestor:
             future.add_done_callback(_done_callback)
         if self._vdb_bulk_upload:
-            self._vdb_bulk_upload.run(combined_future.result())
+            executor = ThreadPoolExecutor(max_workers=1)
+            vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
-        return combined_future
+        return combined_future if not vdb_future else vdb_future
     @ensure_job_specs
     def _prepare_ingest_run(self):
@@ -834,6 +836,7 @@ class Ingestor:
         extract_tables = kwargs.pop("extract_tables", True)
         extract_charts = kwargs.pop("extract_charts", True)
         extract_page_as_image = kwargs.pop("extract_page_as_image", False)
+        table_output_format = kwargs.pop("table_output_format", "markdown")
         # Defaulting to False since enabling infographic extraction reduces throughput.
         # Users have to set to True if infographic extraction is required.
@@ -856,6 +859,7 @@ class Ingestor:
                 extract_charts=extract_charts,
                 extract_infographics=extract_infographics,
                 extract_page_as_image=extract_page_as_image,
+                table_output_format=table_output_format,
                 **kwargs,
             )

nv_ingest_client/primitives/jobs/job_spec.py CHANGED Viewed

@@ -18,6 +18,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTa
 from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
 from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
 from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
+from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
@@ -199,6 +200,8 @@ class JobSpec:
             self._tasks.append(ChartExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_infographics is True):
             self._tasks.append(InfographicExtractionTask())
+        if isinstance(task, ExtractTask) and (task._extract_method in {"pdfium_hybrid", "ocr"}):
+            self._tasks.append(OCRExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
             extract_audio_params = task._extract_audio_params or {}
             self._tasks.append(AudioExtractionTask(**extract_audio_params))

nv_ingest_client/primitives/tasks/embed.py CHANGED Viewed

@@ -38,6 +38,7 @@ class EmbedTask(Task):
         audio_elements_modality: Optional[str] = None,
         custom_content_field: Optional[str] = None,
         result_target_field: Optional[str] = None,
+        dimensions: Optional[int] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -80,6 +81,7 @@ class EmbedTask(Task):
             audio_elements_modality=audio_elements_modality,
             custom_content_field=custom_content_field,
             result_target_field=result_target_field,
+            dimensions=dimensions,
         )
         self._endpoint_url = validated_data.endpoint_url
@@ -92,6 +94,7 @@ class EmbedTask(Task):
         self._audio_elements_modality = validated_data.audio_elements_modality
         self._custom_content_field = validated_data.custom_content_field
         self._result_target_field = validated_data.result_target_field
+        self._dimensions = validated_data.dimensions
     def __str__(self) -> str:
         """
@@ -124,6 +127,8 @@ class EmbedTask(Task):
             info += f"  custom_content_field: {self._custom_content_field}\n"
         if self._result_target_field:
             info += f"  result_target_field: {self.result_target_field}\n"
+        if self._dimensions:
+            info += f"  dimensions: {self._dimensions}\n"
         return info
     def to_dict(self) -> Dict[str, Any]:
@@ -163,6 +168,9 @@ class EmbedTask(Task):
             task_properties["custom_content_field"] = self._custom_content_field
         if self._result_target_field:
-            task_properties["result_target_field"] = self.result_target_field
+            task_properties["result_target_field"] = self._result_target_field
+        if self._dimensions:
+            task_properties["dimensions"] = self._dimensions
         return {"type": "embed", "task_properties": task_properties}

nv_ingest_client/primitives/tasks/extract.py CHANGED Viewed

@@ -58,6 +58,7 @@ _Type_Extract_Method_PDF = Literal[
     "pdfium",
     "tika",
     "unstructured_io",
+    "ocr",
 ]
 _Type_Extract_Images_Method = Literal["group", "yolox"]

nv_ingest_client/primitives/tasks/ocr_extraction.py ADDED Viewed

@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# pylint: disable=too-few-public-methods
+# pylint: disable=too-many-arguments
+import logging
+from typing import Dict
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
+logger = logging.getLogger(__name__)
+class OCRExtractionTask(Task):
+    """
+    Object for ocr extraction task
+    """
+    def __init__(self, params: dict = None) -> None:
+        """
+        Setup OCR Extraction Task Config
+        """
+        super().__init__()
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+        # Use the API schema for validation
+        validated_data = IngestTaskOCRExtraction(params=params)
+        self._params = validated_data.params
+    def __str__(self) -> str:
+        """
+        Returns a string with the object's config and run time state
+        """
+        info = ""
+        info += "OCR Extraction Task:\n"
+        info += f"  params: {self._params}\n"
+        return info
+    def to_dict(self) -> Dict:
+        """
+        Convert to a dict for submission to redis
+        """
+        task_properties = {
+            "params": self._params,
+        }
+        return {"type": "ocr_data_extract", "task_properties": task_properties}

nv_ingest_client/util/file_processing/extract.py CHANGED Viewed

@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
     "txt": DocumentTypeEnum.TXT,
     "mp3": DocumentTypeEnum.MP3,
     "wav": DocumentTypeEnum.WAV,
+    "mp4": DocumentTypeEnum.MP4,
+    "mov": DocumentTypeEnum.MOV,
+    "avi": DocumentTypeEnum.AVI,
+    "mkv": DocumentTypeEnum.MKV,
     # Add more as needed
 }

nv_ingest_client/util/vdb/milvus.py CHANGED Viewed

@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
 logger = logging.getLogger(__name__)
 CONSISTENCY = CONSISTENCY_BOUNDED
+DENSE_INDEX_NAME = "dense_index"
 pandas_reader_map = {
     ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
     index_params = MilvusClient.prepare_index_params()
     index_params.add_index(
         field_name="vector",
-        index_name="dense_index",
+        index_name=DENSE_INDEX_NAME,
         index_type="FLAT",
         metric_type="L2",
     )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
     if local_index:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="FLAT",
             metric_type="L2",
         )
@@ -321,7 +322,7 @@ def create_nvingest_index_params(
         if gpu_index:
             index_params.add_index(
                 field_name="vector",
-                index_name="dense_index",
+                index_name=DENSE_INDEX_NAME,
                 index_type="GPU_CAGRA",
                 metric_type="L2",
                 params={
@@ -335,7 +336,7 @@ def create_nvingest_index_params(
         else:
             index_params.add_index(
                 field_name="vector",
-                index_name="dense_index",
+                index_name=DENSE_INDEX_NAME,
                 index_type="HNSW",
                 metric_type="L2",
                 params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
     if isinstance(indexes, dict):
         # Old Milvus behavior (< 2.5.6)
         for k, v in indexes.items():
-            if k[1] == "dense_index" and hasattr(v, "_index_type"):
+            if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
                 d_idx = v._index_type
             if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
                 s_idx = v._index_type
@@ -504,7 +505,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
             index_name = getattr(idx, "index_name", None)
             index_type = getattr(idx, "index_type", None)
-            if index_name == "dense_index":
+            if index_name == DENSE_INDEX_NAME:
                 d_idx = index_type
             if sparse and index_name == "sparse_index":
                 s_idx = index_type
@@ -900,30 +901,32 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
     (refer to MilvusClient.refresh_load for bulk inserts).
     """
     client.flush(collection_name)
-    index_names = utility.list_indexes(collection_name)
+    # index_names = utility.list_indexes(collection_name)
     indexed_rows = 0
-    for index_name in index_names:
+    # observe dense_index, all indexes get populated simultaneously
+    for index_name in [DENSE_INDEX_NAME]:
         indexed_rows = 0
-        while indexed_rows < num_elements:
+        expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
+        while indexed_rows < expected_rows:
             pos_movement = 10  # number of iteration allowed without noticing an increase in indexed_rows
             for i in range(20):
-                new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+                current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                 time.sleep(1)
                 logger.info(
-                    f"polling for indexed rows, {collection_name}, {index_name} -  {new_indexed_rows} / {num_elements}"
+                    f"Indexed rows, {collection_name}, {index_name} -  {current_indexed_rows} / {expected_rows}"
                 )
-                if new_indexed_rows == num_elements:
-                    indexed_rows = new_indexed_rows
+                if current_indexed_rows == expected_rows:
+                    indexed_rows = current_indexed_rows
                     break
                 # check if indexed_rows is staying the same, too many times means something is wrong
-                if new_indexed_rows == indexed_rows:
+                if current_indexed_rows == indexed_rows:
                     pos_movement -= 1
                 else:
                     pos_movement = 10
                 # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
                 if pos_movement == 0:
-                    raise ValueError("Rows are not getting indexed as expected")
-                indexed_rows = new_indexed_rows
+                    raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
+                indexed_rows = current_indexed_rows
     return indexed_rows
@@ -2057,3 +2060,24 @@ class Milvus(VDB):
                 self.write_to_index(records, collection_name=coll_name, **sub_write_params)
         else:
             raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
+    def run_async(self, records):
+        collection_name, create_params = self.get_connection_params()
+        _, write_params = self.get_write_params()
+        if isinstance(collection_name, str):
+            logger.info(f"creating index - {collection_name}")
+            self.create_index(collection_name=collection_name, **create_params)
+            records = records.result()
+            logger.info(f"writing to index, for collection - {collection_name}")
+            self.write_to_index(records, **write_params)
+        elif isinstance(collection_name, dict):
+            split_params_list = _dict_to_params(collection_name, write_params)
+            for sub_params in split_params_list:
+                coll_name, sub_write_params = sub_params
+                sub_write_params.pop("collection_name", None)
+                self.create_index(collection_name=coll_name, **create_params)
+                self.write_to_index(records, collection_name=coll_name, **sub_write_params)
+        else:
+            raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.11.5.dev20251105
+Version: 2025.11.17.dev20251117
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/RECORD RENAMED Viewed

@@ -8,21 +8,22 @@ nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
 nv_ingest_client/client/client.py,sha256=3uA54D4Y6lSS-Nvz8R8uzkHkoV8vJu8GPQQRPoc-Uxk,77368
 nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
-nv_ingest_client/client/interface.py,sha256=OCbH_5Q-cv1V4HpLBxLdaPCeaNKNkdEYi1JS4Tu6DGY,54745
+nv_ingest_client/client/interface.py,sha256=Y6JnjaRytlBrhgbU6MJYm2dblLvoYxWEB35TETZDSwk,55022
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
-nv_ingest_client/primitives/jobs/job_spec.py,sha256=teAZbpvxn25jIEUP5YJsAX_E_z9iWhejS-uy5opshFM,15681
+nv_ingest_client/primitives/jobs/job_spec.py,sha256=TBz5u7KRdQjQvqD0mMzwjTK9Jl3p7yTIknQQs0lfnV8,15909
 nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
 nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
 nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
 nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcYqtesS-HaZzeh4rI,2130
 nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
 nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
-nv_ingest_client/primitives/tasks/embed.py,sha256=YFnymU1UWID2gSrz1anlaL_SRMmDr3dNTeZv2UDu9kQ,6739
-nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
+nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
+nv_ingest_client/primitives/tasks/extract.py,sha256=ec2aKPU9OMOOw-oalQKAPaNRqgkREQ0ByLkFVqutD6E,9339
 nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
 nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
+nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
 nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
 nv_ingest_client/primitives/tasks/store.py,sha256=nIOnCH8vw4FLCLVBJYnsS5Unc0QmuO_jEtUp7-E9FU4,4199
 nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
@@ -42,14 +43,14 @@ nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-B
 nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
 nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
 nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
+nv_ingest_client/util/file_processing/extract.py,sha256=Hjtem4bJWum1bbUPw7_TG-0Z2-7PsH4bBuqTF7bLn88,4794
 nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
 nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
-nv_ingest_client/util/vdb/milvus.py,sha256=6XWRh2SDJlgVZOKZVXG3cZTB4L-ZHIiiTenuIzkxp2Y,78704
+nv_ingest_client/util/vdb/milvus.py,sha256=LHZ4Z6fHk8vQUGQFJ3FZ5iay0Ike6Zur-K9yMiPxe44,80141
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.11.5.dev20251105.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest_client-2025.11.5.dev20251105.dist-info/METADATA,sha256=rB92S3YltqT5qi70cDN7VK1wtRDiOKFMe0vU7Av8tQ4,30626
-nv_ingest_client-2025.11.5.dev20251105.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_client-2025.11.5.dev20251105.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
-nv_ingest_client-2025.11.5.dev20251105.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
-nv_ingest_client-2025.11.5.dev20251105.dist-info/RECORD,,
+nv_ingest_client-2025.11.17.dev20251117.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.11.17.dev20251117.dist-info/METADATA,sha256=bgCG3WP30zjURzJ_SZEm3fDbby-NoICZDYfbiA3sSjg,30627
+nv_ingest_client-2025.11.17.dev20251117.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.11.17.dev20251117.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.11.17.dev20251117.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.11.17.dev20251117.dist-info/RECORD,,

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/WHEEL RENAMED Viewed

File without changes

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nv_ingest_client-2025.11.5.dev20251105.dist-info → nv_ingest_client-2025.11.17.dev20251117.dist-info}/top_level.txt RENAMED Viewed

File without changes

nv-ingest-client 2025.11.5.dev20251105__py3-none-any.whl → 2025.11.17.dev20251117__py3-none-any.whl

nv-ingest-client 2025.11.5.dev20251105__py3-none-any.whl → 2025.11.17.dev20251117__py3-none-any.whl