nv-ingest-client 2025.11.5.dev20251105__py3-none-any.whl → 2025.11.17.dev20251117__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -694,6 +694,7 @@ class Ingestor:
694
694
  submitted_futures = set(future_to_job_id.keys())
695
695
  completed_futures = set()
696
696
  future_results = []
697
+ vdb_future = None
697
698
 
698
699
  def _done_callback(future):
699
700
  job_id = future_to_job_id[future]
@@ -715,9 +716,10 @@ class Ingestor:
715
716
  future.add_done_callback(_done_callback)
716
717
 
717
718
  if self._vdb_bulk_upload:
718
- self._vdb_bulk_upload.run(combined_future.result())
719
+ executor = ThreadPoolExecutor(max_workers=1)
720
+ vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
719
721
 
720
- return combined_future
722
+ return combined_future if not vdb_future else vdb_future
721
723
 
722
724
  @ensure_job_specs
723
725
  def _prepare_ingest_run(self):
@@ -834,6 +836,7 @@ class Ingestor:
834
836
  extract_tables = kwargs.pop("extract_tables", True)
835
837
  extract_charts = kwargs.pop("extract_charts", True)
836
838
  extract_page_as_image = kwargs.pop("extract_page_as_image", False)
839
+ table_output_format = kwargs.pop("table_output_format", "markdown")
837
840
 
838
841
  # Defaulting to False since enabling infographic extraction reduces throughput.
839
842
  # Users have to set to True if infographic extraction is required.
@@ -856,6 +859,7 @@ class Ingestor:
856
859
  extract_charts=extract_charts,
857
860
  extract_infographics=extract_infographics,
858
861
  extract_page_as_image=extract_page_as_image,
862
+ table_output_format=table_output_format,
859
863
  **kwargs,
860
864
  )
861
865
 
@@ -18,6 +18,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTa
18
18
  from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
19
19
  from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
20
20
  from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
21
+ from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
21
22
  from nv_ingest_client.util.dataset import get_dataset_files
22
23
  from nv_ingest_client.util.dataset import get_dataset_statistics
23
24
 
@@ -199,6 +200,8 @@ class JobSpec:
199
200
  self._tasks.append(ChartExtractionTask())
200
201
  if isinstance(task, ExtractTask) and (task._extract_infographics is True):
201
202
  self._tasks.append(InfographicExtractionTask())
203
+ if isinstance(task, ExtractTask) and (task._extract_method in {"pdfium_hybrid", "ocr"}):
204
+ self._tasks.append(OCRExtractionTask())
202
205
  if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
203
206
  extract_audio_params = task._extract_audio_params or {}
204
207
  self._tasks.append(AudioExtractionTask(**extract_audio_params))
@@ -38,6 +38,7 @@ class EmbedTask(Task):
38
38
  audio_elements_modality: Optional[str] = None,
39
39
  custom_content_field: Optional[str] = None,
40
40
  result_target_field: Optional[str] = None,
41
+ dimensions: Optional[int] = None,
41
42
  ) -> None:
42
43
  """
43
44
  Initialize the EmbedTask configuration.
@@ -80,6 +81,7 @@ class EmbedTask(Task):
80
81
  audio_elements_modality=audio_elements_modality,
81
82
  custom_content_field=custom_content_field,
82
83
  result_target_field=result_target_field,
84
+ dimensions=dimensions,
83
85
  )
84
86
 
85
87
  self._endpoint_url = validated_data.endpoint_url
@@ -92,6 +94,7 @@ class EmbedTask(Task):
92
94
  self._audio_elements_modality = validated_data.audio_elements_modality
93
95
  self._custom_content_field = validated_data.custom_content_field
94
96
  self._result_target_field = validated_data.result_target_field
97
+ self._dimensions = validated_data.dimensions
95
98
 
96
99
  def __str__(self) -> str:
97
100
  """
@@ -124,6 +127,8 @@ class EmbedTask(Task):
124
127
  info += f" custom_content_field: {self._custom_content_field}\n"
125
128
  if self._result_target_field:
126
129
  info += f" result_target_field: {self.result_target_field}\n"
130
+ if self._dimensions:
131
+ info += f" dimensions: {self._dimensions}\n"
127
132
  return info
128
133
 
129
134
  def to_dict(self) -> Dict[str, Any]:
@@ -163,6 +168,9 @@ class EmbedTask(Task):
163
168
  task_properties["custom_content_field"] = self._custom_content_field
164
169
 
165
170
  if self._result_target_field:
166
- task_properties["result_target_field"] = self.result_target_field
171
+ task_properties["result_target_field"] = self._result_target_field
172
+
173
+ if self._dimensions:
174
+ task_properties["dimensions"] = self._dimensions
167
175
 
168
176
  return {"type": "embed", "task_properties": task_properties}
@@ -58,6 +58,7 @@ _Type_Extract_Method_PDF = Literal[
58
58
  "pdfium",
59
59
  "tika",
60
60
  "unstructured_io",
61
+ "ocr",
61
62
  ]
62
63
 
63
64
  _Type_Extract_Images_Method = Literal["group", "yolox"]
@@ -0,0 +1,55 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # pylint: disable=too-few-public-methods
7
+ # pylint: disable=too-many-arguments
8
+
9
+ import logging
10
+ from typing import Dict
11
+
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
13
+ from nv_ingest_client.primitives.tasks.task_base import Task
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class OCRExtractionTask(Task):
19
+ """
20
+ Object for ocr extraction task
21
+ """
22
+
23
+ def __init__(self, params: dict = None) -> None:
24
+ """
25
+ Setup OCR Extraction Task Config
26
+ """
27
+ super().__init__()
28
+
29
+ # Handle None params by converting to empty dict for backward compatibility
30
+ if params is None:
31
+ params = {}
32
+
33
+ # Use the API schema for validation
34
+ validated_data = IngestTaskOCRExtraction(params=params)
35
+
36
+ self._params = validated_data.params
37
+
38
+ def __str__(self) -> str:
39
+ """
40
+ Returns a string with the object's config and run time state
41
+ """
42
+ info = ""
43
+ info += "OCR Extraction Task:\n"
44
+ info += f" params: {self._params}\n"
45
+ return info
46
+
47
+ def to_dict(self) -> Dict:
48
+ """
49
+ Convert to a dict for submission to redis
50
+ """
51
+ task_properties = {
52
+ "params": self._params,
53
+ }
54
+
55
+ return {"type": "ocr_data_extract", "task_properties": task_properties}
@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
51
51
  "txt": DocumentTypeEnum.TXT,
52
52
  "mp3": DocumentTypeEnum.MP3,
53
53
  "wav": DocumentTypeEnum.WAV,
54
+ "mp4": DocumentTypeEnum.MP4,
55
+ "mov": DocumentTypeEnum.MOV,
56
+ "avi": DocumentTypeEnum.AVI,
57
+ "mkv": DocumentTypeEnum.MKV,
54
58
  # Add more as needed
55
59
  }
56
60
 
@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
44
44
  logger = logging.getLogger(__name__)
45
45
 
46
46
  CONSISTENCY = CONSISTENCY_BOUNDED
47
+ DENSE_INDEX_NAME = "dense_index"
47
48
 
48
49
  pandas_reader_map = {
49
50
  ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
93
94
  index_params = MilvusClient.prepare_index_params()
94
95
  index_params.add_index(
95
96
  field_name="vector",
96
- index_name="dense_index",
97
+ index_name=DENSE_INDEX_NAME,
97
98
  index_type="FLAT",
98
99
  metric_type="L2",
99
100
  )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
313
314
  if local_index:
314
315
  index_params.add_index(
315
316
  field_name="vector",
316
- index_name="dense_index",
317
+ index_name=DENSE_INDEX_NAME,
317
318
  index_type="FLAT",
318
319
  metric_type="L2",
319
320
  )
@@ -321,7 +322,7 @@ def create_nvingest_index_params(
321
322
  if gpu_index:
322
323
  index_params.add_index(
323
324
  field_name="vector",
324
- index_name="dense_index",
325
+ index_name=DENSE_INDEX_NAME,
325
326
  index_type="GPU_CAGRA",
326
327
  metric_type="L2",
327
328
  params={
@@ -335,7 +336,7 @@ def create_nvingest_index_params(
335
336
  else:
336
337
  index_params.add_index(
337
338
  field_name="vector",
338
- index_name="dense_index",
339
+ index_name=DENSE_INDEX_NAME,
339
340
  index_type="HNSW",
340
341
  metric_type="L2",
341
342
  params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
493
494
  if isinstance(indexes, dict):
494
495
  # Old Milvus behavior (< 2.5.6)
495
496
  for k, v in indexes.items():
496
- if k[1] == "dense_index" and hasattr(v, "_index_type"):
497
+ if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
497
498
  d_idx = v._index_type
498
499
  if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
499
500
  s_idx = v._index_type
@@ -504,7 +505,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
504
505
  index_name = getattr(idx, "index_name", None)
505
506
  index_type = getattr(idx, "index_type", None)
506
507
 
507
- if index_name == "dense_index":
508
+ if index_name == DENSE_INDEX_NAME:
508
509
  d_idx = index_type
509
510
  if sparse and index_name == "sparse_index":
510
511
  s_idx = index_type
@@ -900,30 +901,32 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
900
901
  (refer to MilvusClient.refresh_load for bulk inserts).
901
902
  """
902
903
  client.flush(collection_name)
903
- index_names = utility.list_indexes(collection_name)
904
+ # index_names = utility.list_indexes(collection_name)
904
905
  indexed_rows = 0
905
- for index_name in index_names:
906
+ # observe dense_index, all indexes get populated simultaneously
907
+ for index_name in [DENSE_INDEX_NAME]:
906
908
  indexed_rows = 0
907
- while indexed_rows < num_elements:
909
+ expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
910
+ while indexed_rows < expected_rows:
908
911
  pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
909
912
  for i in range(20):
910
- new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
913
+ current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
911
914
  time.sleep(1)
912
915
  logger.info(
913
- f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
916
+ f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
914
917
  )
915
- if new_indexed_rows == num_elements:
916
- indexed_rows = new_indexed_rows
918
+ if current_indexed_rows == expected_rows:
919
+ indexed_rows = current_indexed_rows
917
920
  break
918
921
  # check if indexed_rows is staying the same, too many times means something is wrong
919
- if new_indexed_rows == indexed_rows:
922
+ if current_indexed_rows == indexed_rows:
920
923
  pos_movement -= 1
921
924
  else:
922
925
  pos_movement = 10
923
926
  # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
924
927
  if pos_movement == 0:
925
- raise ValueError("Rows are not getting indexed as expected")
926
- indexed_rows = new_indexed_rows
928
+ raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
929
+ indexed_rows = current_indexed_rows
927
930
  return indexed_rows
928
931
 
929
932
 
@@ -2057,3 +2060,24 @@ class Milvus(VDB):
2057
2060
  self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2058
2061
  else:
2059
2062
  raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2063
+ return records
2064
+
2065
+ def run_async(self, records):
2066
+ collection_name, create_params = self.get_connection_params()
2067
+ _, write_params = self.get_write_params()
2068
+ if isinstance(collection_name, str):
2069
+ logger.info(f"creating index - {collection_name}")
2070
+ self.create_index(collection_name=collection_name, **create_params)
2071
+ records = records.result()
2072
+ logger.info(f"writing to index, for collection - {collection_name}")
2073
+ self.write_to_index(records, **write_params)
2074
+ elif isinstance(collection_name, dict):
2075
+ split_params_list = _dict_to_params(collection_name, write_params)
2076
+ for sub_params in split_params_list:
2077
+ coll_name, sub_write_params = sub_params
2078
+ sub_write_params.pop("collection_name", None)
2079
+ self.create_index(collection_name=coll_name, **create_params)
2080
+ self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2081
+ else:
2082
+ raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2083
+ return records
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.11.5.dev20251105
3
+ Version: 2025.11.17.dev20251117
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -8,21 +8,22 @@ nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI
8
8
  nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
9
9
  nv_ingest_client/client/client.py,sha256=3uA54D4Y6lSS-Nvz8R8uzkHkoV8vJu8GPQQRPoc-Uxk,77368
10
10
  nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
11
- nv_ingest_client/client/interface.py,sha256=OCbH_5Q-cv1V4HpLBxLdaPCeaNKNkdEYi1JS4Tu6DGY,54745
11
+ nv_ingest_client/client/interface.py,sha256=Y6JnjaRytlBrhgbU6MJYm2dblLvoYxWEB35TETZDSwk,55022
12
12
  nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
13
13
  nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
14
14
  nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
15
- nv_ingest_client/primitives/jobs/job_spec.py,sha256=teAZbpvxn25jIEUP5YJsAX_E_z9iWhejS-uy5opshFM,15681
15
+ nv_ingest_client/primitives/jobs/job_spec.py,sha256=TBz5u7KRdQjQvqD0mMzwjTK9Jl3p7yTIknQQs0lfnV8,15909
16
16
  nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
17
17
  nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
18
18
  nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
19
19
  nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcYqtesS-HaZzeh4rI,2130
20
20
  nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
21
21
  nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
22
- nv_ingest_client/primitives/tasks/embed.py,sha256=YFnymU1UWID2gSrz1anlaL_SRMmDr3dNTeZv2UDu9kQ,6739
23
- nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
22
+ nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
23
+ nv_ingest_client/primitives/tasks/extract.py,sha256=ec2aKPU9OMOOw-oalQKAPaNRqgkREQ0ByLkFVqutD6E,9339
24
24
  nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
25
25
  nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
26
+ nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
26
27
  nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
27
28
  nv_ingest_client/primitives/tasks/store.py,sha256=nIOnCH8vw4FLCLVBJYnsS5Unc0QmuO_jEtUp7-E9FU4,4199
28
29
  nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
@@ -42,14 +43,14 @@ nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-B
42
43
  nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
43
44
  nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
44
45
  nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
46
+ nv_ingest_client/util/file_processing/extract.py,sha256=Hjtem4bJWum1bbUPw7_TG-0Z2-7PsH4bBuqTF7bLn88,4794
46
47
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
47
48
  nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
48
- nv_ingest_client/util/vdb/milvus.py,sha256=6XWRh2SDJlgVZOKZVXG3cZTB4L-ZHIiiTenuIzkxp2Y,78704
49
+ nv_ingest_client/util/vdb/milvus.py,sha256=LHZ4Z6fHk8vQUGQFJ3FZ5iay0Ike6Zur-K9yMiPxe44,80141
49
50
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
50
- nv_ingest_client-2025.11.5.dev20251105.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
51
- nv_ingest_client-2025.11.5.dev20251105.dist-info/METADATA,sha256=rB92S3YltqT5qi70cDN7VK1wtRDiOKFMe0vU7Av8tQ4,30626
52
- nv_ingest_client-2025.11.5.dev20251105.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
53
- nv_ingest_client-2025.11.5.dev20251105.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
54
- nv_ingest_client-2025.11.5.dev20251105.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
55
- nv_ingest_client-2025.11.5.dev20251105.dist-info/RECORD,,
51
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
52
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/METADATA,sha256=bgCG3WP30zjURzJ_SZEm3fDbby-NoICZDYfbiA3sSjg,30627
53
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
54
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
55
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
56
+ nv_ingest_client-2025.11.17.dev20251117.dist-info/RECORD,,