nv-ingest-client 2025.11.9.dev20251109__py3-none-any.whl → 2025.11.27.dev20251127__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
202
202
  if not self.job_queue_id:
203
203
  logger.warning("job_queue_id is not set; submission of new jobs will fail.")
204
204
 
205
+ # Executor check required for run_async
206
+ if not hasattr(client, "_worker_pool"):
207
+ raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
208
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
209
+ raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
210
+ self._executor = client._worker_pool
211
+
205
212
  # --------------------------------------------------------------------------
206
213
  # Private Methods
207
214
  # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
246
253
  # Attempt to mark state as FAILED locally in the client (best effort)
247
254
  try:
248
255
  # Use a method assumed to safely get the state object
249
- job_state = self.client._get_job_state_object(job_index)
256
+ job_state = self.client._get_and_check_job_state(job_index)
250
257
  # Check state exists and is not already terminal before updating
251
258
  if (
252
259
  job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:
495
502
 
496
503
  return batch_futures_dict, normalized_job_indices
497
504
 
498
- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
505
+ # --------------------------------------------------------------------------
506
+ # Core Processing Logic
507
+ # --------------------------------------------------------------------------
508
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
499
509
  """
500
510
  Executes the main processing loop in batches.
501
511
 
@@ -640,6 +650,44 @@ class _ConcurrentProcessor:
640
650
 
641
651
  return self.results, self.failures, self.traces if self.return_traces else []
642
652
 
653
+ # --------------------------------------------------------------------------
654
+ # Public Methods
655
+ # --------------------------------------------------------------------------
656
+
657
def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
    """
    Run the whole processing loop on the calling thread.

    This is the blocking entry point: it hands control to
    `_process_all_jobs` and does not return until that call returns.
    Any exception raised by `_process_all_jobs` propagates unchanged.

    Returns
    -------
    Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
        The tuple produced by `_process_all_jobs`:
        1. Results of the jobs that were fetched successfully.
        2. `(job_index, error_message)` pairs for the jobs that failed.
        3. Trace dictionaries if `return_traces` was True.
    """
    outcome = self._process_all_jobs()
    return outcome
674
+
675
def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
    """
    Schedule the whole processing loop on the executor and return at once.

    `_process_all_jobs` is submitted as a single task to `self._executor`;
    the caller gets the resulting Future back without waiting for any job.

    Returns
    -------
    Future
        Handle for the background run. `result()` yields the
        `(results, failures, traces)` tuple returned by `_process_all_jobs`,
        or re-raises whatever that call raised.
    """
    pending = self._executor.submit(self._process_all_jobs)
    return pending
690
+
643
691
 
644
692
  class NvIngestClient:
645
693
  """
@@ -1377,6 +1425,68 @@ class NvIngestClient:
1377
1425
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
1378
1426
  return results
1379
1427
 
1428
def process_jobs_concurrently_async(
    self,
    job_indices: Union[str, List[str]],
    job_queue_id: Optional[str] = None,
    batch_size: Optional[int] = None,
    timeout: int = 100,
    max_job_retries: Optional[int] = None,
    retry_delay: float = 0.5,
    initial_fetch_delay: float = 0.3,
    fail_on_submit_error: bool = False,
    completion_callback: Optional[Callable[[Any, str], None]] = None,
    stream_to_callback_only: bool = False,
    return_full_response: bool = False,
    verbose: bool = False,
    return_traces: bool = False,
) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
    """
    Start concurrent submit-and-fetch processing without blocking the caller.

    A `_ConcurrentProcessor` is configured from the arguments and started via
    its `run_async()`; the Future it returns is handed straight back. The
    Future always resolves to the same 3-part tuple
    `(results, failures, traces)`.

    Parameters are identical to `process_jobs_concurrently`. Notes on how a
    few of them are treated here:

    - `job_indices`: a single string is wrapped into a one-element list. An
      empty (falsy) value short-circuits and no processor is created.
    - `batch_size`: passed through `self._validate_batch_size` first.
    - `timeout`: forwarded to the processor as `(int(timeout), None)`.

    Returns
    -------
    Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
        A future that completes when all jobs are done. Its result is a tuple
        containing (successful_results, failures, traces). For empty input the
        future is already resolved with `([], [], [])`.
    """
    indices = [job_indices] if isinstance(job_indices, str) else job_indices

    # Nothing to do: hand back a future that is already complete.
    if not indices:
        resolved: Future = Future()
        resolved.set_result(([], [], []))
        return resolved

    checked_batch_size = self._validate_batch_size(batch_size)
    processor_timeout: Tuple[int, Optional[float]] = (int(timeout), None)

    return _ConcurrentProcessor(
        client=self,
        batch_size=checked_batch_size,
        job_indices=indices,
        job_queue_id=job_queue_id,
        timeout=processor_timeout,
        max_job_retries=max_job_retries,
        retry_delay=retry_delay,
        initial_fetch_delay=initial_fetch_delay,
        completion_callback=completion_callback,
        fail_on_submit_error=fail_on_submit_error,
        stream_to_callback_only=stream_to_callback_only,
        return_full_response=return_full_response,
        verbose=verbose,
        return_traces=return_traces,
    ).run_async()
1489
+
1380
1490
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
1381
1491
  """
1382
1492
  Block until all specified jobs have been marked submitted.
@@ -13,6 +13,7 @@ import os
13
13
  import shutil
14
14
  import tempfile
15
15
  import threading
16
+ from io import BytesIO
16
17
  from concurrent.futures import Future
17
18
  from concurrent.futures import ThreadPoolExecutor
18
19
  from concurrent.futures import as_completed
@@ -224,6 +225,7 @@ class Ingestor:
224
225
  **kwargs,
225
226
  ):
226
227
  self._documents = documents or []
228
+ self._buffers = []
227
229
  self._client = client
228
230
  self._job_queue_id = job_queue_id
229
231
  self._vdb_bulk_upload = None
@@ -352,6 +354,28 @@ class Ingestor:
352
354
 
353
355
  return self
354
356
 
357
def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
    """
    Register in-memory buffers as inputs for this ingestor.

    The new buffers are appended to the ones added by earlier calls, and
    `self._job_specs` is rebuilt as a `BatchJobSpec` over the full
    accumulated list. `self._all_local` is set to True.

    Parameters
    ----------
    buffers : Tuple[str, BytesIO] or List[Tuple[str, BytesIO]]
        Either one `(name, BytesIO)` pair or a list of such pairs. A lone
        pair is recognised by its exact shape (2-tuple of `str` and
        `BytesIO`) and wrapped into a list; anything else is extended as-is.

    Returns
    -------
    Ingestor
        This instance, so calls can be chained.
    """
    is_single_pair = (
        isinstance(buffers, tuple)
        and len(buffers) == 2
        and isinstance(buffers[0], str)
        and isinstance(buffers[1], BytesIO)
    )
    incoming = [buffers] if is_single_pair else buffers

    self._buffers.extend(incoming)
    # Rebuild from every buffer seen so far, not just the new ones.
    self._job_specs = BatchJobSpec(self._buffers)
    self._all_local = True

    return self
378
+
355
379
  def load(self, **kwargs) -> "Ingestor":
356
380
  """
357
381
  Ensure all document files are accessible locally, downloading if necessary.
@@ -669,55 +693,133 @@ class Ingestor:
669
693
 
670
694
  return tuple(returns) if len(returns) > 1 else results
671
695
 
672
- def ingest_async(self, **kwargs: Any) -> Future:
696
+ def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
673
697
  """
674
698
  Asynchronously submits jobs and returns a single future that completes when all jobs have finished.
675
699
 
700
+ The return type of the future's result is dynamic and mirrors the behavior of the synchronous
701
+ `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
702
+ upload is configured, the future will complete *after* the VDB upload finishes.
703
+
676
704
  Parameters
677
705
  ----------
706
+ return_failures : bool, optional
707
+ If True, return a tuple containing failures; otherwise, only return results. Default is False.
708
+ return_traces : bool, optional
709
+ If True, return trace metrics alongside results. Default is False.
678
710
  kwargs : dict
679
- Additional parameters for the `submit_job_async` method.
711
+ Additional parameters passed to the concurrent processor.
712
+ Optional flags include `include_parent_trace_ids=True` to also return
713
+ parent job trace identifiers (V2 API only).
680
714
 
681
715
  Returns
682
716
  -------
683
- Future
684
- A future that completes when all submitted jobs have reached a terminal state.
717
+ Future[Union[List[Any], Tuple[Any, ...]]]
718
+ A future that completes when all jobs and any subsequent VDB upload
719
+ have finished. Its result will be one of the following:
720
+ - Default: list of results
721
+ - return_failures=True: (results, failures)
722
+ - return_traces=True: (results, traces)
723
+ - return_failures=True, return_traces=True: (results, failures, traces)
724
+
685
725
  """
686
- self._prepare_ingest_run()
726
+ try:
727
+ self._prepare_ingest_run()
687
728
 
688
- self._job_ids = self._client.add_job(self._job_specs)
729
+ # Add jobs locally first
730
+ if self._job_specs is None:
731
+ raise RuntimeError("Job specs missing for ingest_async.")
732
+ self._job_ids = self._client.add_job(self._job_specs)
733
+ self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
689
734
 
690
- future_to_job_id = self._client.submit_job_async(self._job_ids, self._job_queue_id, **kwargs)
691
- self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
735
+ proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)
692
736
 
693
- combined_future = Future()
694
- submitted_futures = set(future_to_job_id.keys())
695
- completed_futures = set()
696
- future_results = []
737
+ final_future: Future = Future()
697
738
 
698
- def _done_callback(future):
699
- job_id = future_to_job_id[future]
700
- job_state = self._job_states[job_id]
701
- try:
702
- result = self._client.fetch_job_result(job_id)
703
- if job_state.state != JobStateEnum.COMPLETED:
704
- job_state.state = JobStateEnum.COMPLETED
705
- except Exception:
706
- result = None
707
- if job_state.state != JobStateEnum.FAILED:
708
- job_state.state = JobStateEnum.FAILED
709
- completed_futures.add(future)
710
- future_results.extend(result)
711
- if completed_futures == submitted_futures:
712
- combined_future.set_result(future_results)
739
+ processor_future = self._client.process_jobs_concurrently_async(
740
+ job_indices=self._job_ids,
741
+ job_queue_id=self._job_queue_id,
742
+ return_traces=return_traces,
743
+ **proc_kwargs,
744
+ )
713
745
 
714
- for future in future_to_job_id:
715
- future.add_done_callback(_done_callback)
746
+ include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))
716
747
 
717
- if self._vdb_bulk_upload:
718
- self._vdb_bulk_upload.run(combined_future.result())
748
+ def _processor_done_callback(proc_future: Future):
749
+ """Callback to handle completion, VDB upload, and final result setting."""
750
+ try:
751
+ if proc_future.cancelled():
752
+ if not final_future.done():
753
+ final_future.cancel()
754
+ return
755
+ if proc_future.exception():
756
+ if not final_future.done():
757
+ final_future.set_exception(proc_future.exception())
758
+ return
759
+
760
+ results, failures, traces_list = proc_future.result()
761
+
762
+ failed_job_ids = set()
763
+ for job_id_with_source, error_msg in failures:
764
+ job_id = job_id_with_source.split(":", 1)[0]
765
+ if job_id in self._job_states:
766
+ if self._job_states[job_id].state != JobStateEnum.FAILED:
767
+ self._job_states[job_id].state = JobStateEnum.FAILED
768
+ failed_job_ids.add(job_id)
769
+
770
+ all_submitted_job_ids = set(self._job_ids)
771
+ successful_job_ids = all_submitted_job_ids - failed_job_ids
772
+
773
+ for job_id in successful_job_ids:
774
+ if job_id in self._job_states:
775
+ if self._job_states[job_id].state != JobStateEnum.COMPLETED:
776
+ self._job_states[job_id].state = JobStateEnum.COMPLETED
777
+
778
+ if self._vdb_bulk_upload and results:
779
+ with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
780
+ results_future = Future()
781
+ results_future.set_result(results)
782
+ vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
783
+ vdb_future.result()
784
+
785
+ parent_trace_ids = (
786
+ self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
787
+ )
719
788
 
720
- return combined_future
789
+ returns = [results]
790
+ if return_failures:
791
+ returns.append(failures)
792
+ if return_traces:
793
+ returns.append(traces_list)
794
+ if include_parent_trace_ids:
795
+ returns.append(parent_trace_ids)
796
+
797
+ final_result = tuple(returns) if len(returns) > 1 else results
798
+
799
+ if not final_future.done():
800
+ final_future.set_result(final_result)
801
+
802
+ except Exception as e:
803
+ logger.exception("Error in ingest_async processor callback")
804
+ if not final_future.done():
805
+ final_future.set_exception(e)
806
+ finally:
807
+ final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
808
+ for job_state in self._job_states.values():
809
+ if (
810
+ job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
811
+ and job_state.state != final_state
812
+ ):
813
+ job_state.state = final_state
814
+
815
+ processor_future.add_done_callback(_processor_done_callback)
816
+ return final_future
817
+
818
+ except Exception as setup_err:
819
+ logger.exception("Failed during synchronous setup of ingest_async")
820
+ error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
821
+ error_future.set_exception(setup_err)
822
+ return error_future
721
823
 
722
824
  @ensure_job_specs
723
825
  def _prepare_ingest_run(self):
@@ -834,6 +936,7 @@ class Ingestor:
834
936
  extract_tables = kwargs.pop("extract_tables", True)
835
937
  extract_charts = kwargs.pop("extract_charts", True)
836
938
  extract_page_as_image = kwargs.pop("extract_page_as_image", False)
939
+ table_output_format = kwargs.pop("table_output_format", "markdown")
837
940
 
838
941
  # Defaulting to False since enabling infographic extraction reduces throughput.
839
942
  # Users have to set to True if infographic extraction is required.
@@ -856,6 +959,7 @@ class Ingestor:
856
959
  extract_charts=extract_charts,
857
960
  extract_infographics=extract_infographics,
858
961
  extract_page_as_image=extract_page_as_image,
962
+ table_output_format=table_output_format,
859
963
  **kwargs,
860
964
  )
861
965
 
@@ -10,6 +10,7 @@ from typing import Dict
10
10
  from typing import List
11
11
  from typing import Optional
12
12
  from typing import Union
13
+ from typing import Tuple
13
14
  from uuid import UUID
14
15
 
15
16
  from nv_ingest_client.primitives.tasks import Task
@@ -18,6 +19,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTa
18
19
  from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
19
20
  from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
20
21
  from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
22
+ from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
21
23
  from nv_ingest_client.util.dataset import get_dataset_files
22
24
  from nv_ingest_client.util.dataset import get_dataset_statistics
23
25
 
@@ -199,6 +201,8 @@ class JobSpec:
199
201
  self._tasks.append(ChartExtractionTask())
200
202
  if isinstance(task, ExtractTask) and (task._extract_infographics is True):
201
203
  self._tasks.append(InfographicExtractionTask())
204
+ if isinstance(task, ExtractTask) and (task._extract_method in {"pdfium_hybrid", "ocr"}):
205
+ self._tasks.append(OCRExtractionTask())
202
206
  if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
203
207
  extract_audio_params = task._extract_audio_params or {}
204
208
  self._tasks.append(AudioExtractionTask(**extract_audio_params))
@@ -219,7 +223,9 @@ class BatchJobSpec:
219
223
  A dictionary that maps document types to a list of `JobSpec` instances.
220
224
  """
221
225
 
222
- def __init__(self, job_specs_or_files: Optional[Union[List[JobSpec], List[str]]] = None) -> None:
226
+ def __init__(
227
+ self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
228
+ ) -> None:
223
229
  """
224
230
  Initializes the BatchJobSpec instance.
225
231
 
@@ -236,6 +242,13 @@ class BatchJobSpec:
236
242
  self.from_job_specs(job_specs_or_files)
237
243
  elif isinstance(job_specs_or_files[0], str):
238
244
  self.from_files(job_specs_or_files)
245
+ elif (
246
+ isinstance(job_specs_or_files[0], tuple)
247
+ and len(job_specs_or_files[0]) == 2
248
+ and isinstance(job_specs_or_files[0][0], str)
249
+ and isinstance(job_specs_or_files[0][1], BytesIO)
250
+ ):
251
+ self.from_buffers(job_specs_or_files)
239
252
  else:
240
253
  raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")
241
254
 
@@ -279,6 +292,21 @@ class BatchJobSpec:
279
292
  for job_spec in job_specs:
280
293
  self.add_job_spec(job_spec)
281
294
 
295
def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
    """
    Populate the batch from named in-memory buffers.

    Each buffer is turned into a `JobSpec` by
    `create_job_specs_for_buffers`, and every resulting spec is registered
    through `add_job_spec`.

    Parameters
    ----------
    buffers : List[Tuple[str, BytesIO]]
        `(name, BytesIO)` pairs, one per document to ingest.
    """
    # Imported at call time rather than module level.
    # NOTE(review): presumably avoids a circular import with util — confirm.
    from nv_ingest_client.util.util import create_job_specs_for_buffers

    for spec in create_job_specs_for_buffers(buffers):
        self.add_job_spec(spec)
309
+
282
310
  def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
283
311
  """
284
312
  Internal method to initialize the batch from a dataset.
@@ -58,6 +58,7 @@ _Type_Extract_Method_PDF = Literal[
58
58
  "pdfium",
59
59
  "tika",
60
60
  "unstructured_io",
61
+ "ocr",
61
62
  ]
62
63
 
63
64
  _Type_Extract_Images_Method = Literal["group", "yolox"]
@@ -0,0 +1,55 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # pylint: disable=too-few-public-methods
7
+ # pylint: disable=too-many-arguments
8
+
9
+ import logging
10
+ from typing import Dict
11
+
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
13
+ from nv_ingest_client.primitives.tasks.task_base import Task
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class OCRExtractionTask(Task):
    """
    Task object describing an OCR extraction step.
    """

    def __init__(self, params: dict = None) -> None:
        """
        Configure the OCR extraction task.

        `params` is validated by constructing `IngestTaskOCRExtraction`;
        the validated value is what gets stored. Passing None is treated
        the same as passing an empty dict.
        """
        super().__init__()

        # None is accepted for backward compatibility and means "no params".
        validated = IngestTaskOCRExtraction(params={} if params is None else params)
        self._params = validated.params

    def __str__(self) -> str:
        """
        Return a multi-line, human-readable description of the task config.
        """
        return "".join(
            (
                "OCR Extraction Task:\n",
                f" params: {self._params}\n",
            )
        )

    def to_dict(self) -> Dict:
        """
        Serialise the task into the dict form used for submission to redis.
        """
        return {
            "type": "ocr_data_extract",
            "task_properties": {"params": self._params},
        }
@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
51
51
  "txt": DocumentTypeEnum.TXT,
52
52
  "mp3": DocumentTypeEnum.MP3,
53
53
  "wav": DocumentTypeEnum.WAV,
54
+ "mp4": DocumentTypeEnum.MP4,
55
+ "mov": DocumentTypeEnum.MOV,
56
+ "avi": DocumentTypeEnum.AVI,
57
+ "mkv": DocumentTypeEnum.MKV,
54
58
  # Add more as needed
55
59
  }
56
60
 
@@ -141,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:
141
145
 
142
146
  logger.debug(f"Content extracted from '{path}'")
143
147
  return content, DocumentTypeEnum(document_type)
148
+
149
+
150
def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, DocumentTypeEnum]:
    """
    Extract the payload and document type from a named in-memory buffer.

    The document type is derived from the buffer's name (`buffer[0]`) via
    `get_or_infer_file_type`. TXT, MD and HTML buffers are read through
    `detect_encoding_and_read_text_file`; every other type is passed to
    `serialize_to_base64`.

    Parameters
    ----------
    buffer : Tuple[str, BytesIO]
        `(name, stream)` pair. The name is used only for type inference and
        for log/error messages.

    Returns
    -------
    Tuple[str, DocumentTypeEnum]
        The extracted content and the document type, mirroring the return
        shape of `extract_file_content`.

    Raises
    ------
    ValueError
        If reading or encoding the buffer fails; the original exception is
        chained as the cause. Errors raised while inferring the type happen
        before the guarded section and propagate unchanged.
    """
    document_type = get_or_infer_file_type(buffer[0])
    try:
        if document_type in [
            DocumentTypeEnum.TXT,
            DocumentTypeEnum.MD,
            DocumentTypeEnum.HTML,
        ]:
            content = detect_encoding_and_read_text_file(buffer[1])
        else:
            content = serialize_to_base64(buffer[1])
    except Exception as e:
        logger.error(f"Error processing buffer {buffer[0]}: {e}")

        raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e

    logger.debug(f"Content extracted from '{buffer[0]}'")
    return content, DocumentTypeEnum(document_type)
@@ -12,10 +12,12 @@ import math
12
12
  import heapq
13
13
  from typing import Dict
14
14
  from typing import List
15
+ from typing import Tuple
16
+ from io import BytesIO
15
17
 
16
18
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
17
19
  from nv_ingest_client.primitives.jobs.job_spec import JobSpec
18
- from nv_ingest_client.util.file_processing.extract import extract_file_content
20
+ from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer
19
21
 
20
22
  logger = logging.getLogger(__name__)
21
23
 
@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
350
352
  return job_specs
351
353
 
352
354
 
355
def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
    """
    Create job specifications (JobSpecs) for a list of buffers.

    Each buffer is processed with `extract_content_from_buffer` to obtain its
    content and document type, and one JobSpec is built from that. The buffer
    name is used as both `source_id` and `source_name`.

    Parameters
    ----------
    buffers : List[Tuple[str, BytesIO]]
        A list of tuples containing the name of the buffer and the BytesIO object.

    Returns
    -------
    List[JobSpec]
        One JobSpec per input buffer, in input order.
    """

    def _spec_for(name, stream):
        # Build the JobSpec for a single named buffer.
        payload, doc_type = extract_content_from_buffer((name, stream))
        return JobSpec(
            document_type=doc_type,
            payload=payload,
            source_id=name,
            source_name=name,
        )

    return [_spec_for(name, stream) for name, stream in buffers]
384
+
385
+
353
386
  def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
354
387
  """
355
388
  Apply PDF split configuration to a list of JobSpec objects.
@@ -1,27 +1,243 @@
1
1
"""Abstract Vector Database (VDB) operator API.

This module defines the `VDB` abstract base class which specifies the
interface that custom vector-database operators must implement to integrate
with NV-Ingest.

The implementation details and an example OpenSearch operator are described
in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
production-ready OpenSearch implementation is available at
`client/src/nv_ingest_client/util/vdb/opensearch.py`.

Design goals:
- Provide a small, well-documented interface that supports common vector
  database operations: index creation, batch ingestion, nearest-neighbor
  retrieval, and a simple `run` orchestration entry-point used by the
  NV-Ingest pipeline.
- Keep the API flexible by accepting `**kwargs` on methods so implementers can
  pass database-specific options without changing the interface.

Typical implementation notes (inferred from the example OpenSearch operator):
- Constructor accepts connection and index configuration parameters such as
  `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
  types (e.g. `enable_text`, `enable_images`).
- `create_index` should be able to create (and optionally recreate) an
  index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
- `write_to_index` should accept batches of NV-Ingest records, perform
  validation/transformation, and write documents into the database efficiently
  (bulk APIs are recommended).
- `retrieval` should accept a list of textual queries, convert them to
  embeddings (by calling an external embedding service or model), perform a
  vector search (top-k), and return cleaned results (e.g., removing stored
  dense vectors from returned payloads).

"""

# The docstring above must stay the first statement in the file: a string
# literal placed after an import is a discarded expression, not `__doc__`.
from abc import ABC, abstractmethod
38
+
39
+
4
40
class VDB(ABC):
    """Interface that every NV-Ingest vector-database operator implements.

    A concrete operator subclasses `VDB` and supplies the five abstract
    members: `__init__`, `create_index`, `write_to_index`, `retrieval` and
    `run`. The surface is deliberately small, and methods take `**kwargs` so
    an operator can accept database-specific options without changing the
    shared signatures.

    Sketch of a concrete operator::

        class OpenSearch(VDB):
            def __init__(self, **kwargs):
                # read settings from kwargs, build the client,
                # then call super().__init__(**kwargs)
                ...

            def create_index(self, **kwargs):
                # create the index with its mappings and settings
                ...

            def write_to_index(self, records: list, **kwargs):
                # convert NV-Ingest records and write them to the database
                ...

            def retrieval(self, queries: list, **kwargs):
                # embed the queries, run a k-NN search, format the hits
                ...

            def run(self, records):
                # create_index followed by write_to_index
                ...

    Constructor parameters operators commonly accept (suggestions only; this
    ABC does not enforce any of them):

    - host (str): database hostname (default: 'localhost')
    - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
    - index_name (str): base index name used by the operator
    - dense_dim (int): dimensionality of the stored dense embeddings
    - enable_text / enable_images / ... (bool): content-type toggles applied
      when pulling text out of NV-Ingest records before indexing

    Operators may take further options (username, password, SSL settings,
    client-specific flags); passing them through `**kwargs` is the intended
    pattern.
    """

    @abstractmethod
    def __init__(self, **kwargs):
        """Set up the operator from keyword configuration.

        Subclasses must override this. The base implementation, reachable via
        `super().__init__(**kwargs)`, copies every keyword argument onto the
        instance as an attribute of the same name; calling it is optional.

        Suggested / common keywords:

        - host (str): database host
        - port (int): database port
        - index_name (str): base name for created indices
        - dense_dim (int): embedding vector dimension
        - enable_text (bool): whether text content is extracted and indexed
        - enable_images (bool), enable_audio (bool), etc.: other toggles

        Implementations typically also open whatever client connection the
        target database needs. Heavy work such as creating indices is better
        left to `create_index`, which keeps the operator easier to test.
        """
        vars(self).update(kwargs)

    @abstractmethod
    def create_index(self, **kwargs):
        """Make sure the index (or indices) this operator writes to exist.

        Must be usable before any ingestion. For vector indexes this usually
        means creating settings and mappings that enable k-NN / vector search
        (for example an HNSW/FAISS engine, the vector `dimension`, and any
        engine-specific parameters).

        Common, operator-specific keywords:

        - recreate (bool): delete and recreate the index even if it already
          exists (default: False)
        - index_name (str): override the configured index name for this call

        Returns:
            Implementation-specific (a boolean, the index name, the raw
            client response, ...). No particular value is required; concrete
            operators should document what they return.
        """

    @abstractmethod
    def write_to_index(self, records: list, **kwargs):
        """Ingest a batch of NV-Ingest records into the vector database.

        `records` arrives in the shape NV-Ingest produces (commonly a list of
        record-sets, i.e. possibly nested lists of record dicts). The
        implementation is responsible for:

        - converting each record into the target document format (fields such
          as `dense` for the vector, `text` for the content and `metadata`
          for auxiliary data are common in the repository examples);
        - skipping records that lack required fields (for example a missing
          embedding) and logging or reporting those failures as appropriate;
        - using batching / bulk APIs to keep overhead low on large volumes.

        Parameters:
            - records (list): NV-Ingest records (see repository examples for
              the structure)
            - batch_size (int, optional): hint for how many documents to send
              per bulk request

        Returns:
            Implementation-specific (number of documents indexed, bulk API
            response, ...). Concrete operators should document the exact
            return value and failure semantics.
        """

    @abstractmethod
    def retrieval(self, queries: list, **kwargs):
        """Run a similarity search for each text query.

        Typical flow for operators in this ecosystem:

        1. Turn each query string into a dense embedding with an external
           model or service (the example uses an NVIDIA embedding model via
           `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
        2. Issue a vector (k-NN) search with that embedding, asking for the
           top-k neighbours (configurable).
        3. Post-process the hits (for example drop stored dense vectors to
           shrink the payload) and return a list-of-lists aligned with the
           input `queries`.

        Common keywords:

        - index_name (str): index to search (default: the operator's
          configured index_name)
        - top_k (int): number of nearest neighbours to return (default: 10)
        - embedding_endpoint / model_name / nvidia_api_key: settings needed
          when the operator calls an external embedding service

        Parameters:
            - queries (list[str]): text queries to vectorise and search

        Returns:
            - results (list[list[dict]]): one list of hit documents per
              query. Concrete operators should specify the document shape
              and strip large binary/vector fields where possible.
        """

    @abstractmethod
    def run(self, records):
        """Entry point invoked by the NV-Ingest pipeline.

        Intended as a thin orchestration layer: ensure the index exists, then
        ingest the records. A minimal recommended implementation is::

            def run(self, records):
                self.create_index()
                self.write_to_index(records)

        Pre/post hooks, metrics, retries or error handling can be added for
        production use, but keeping `run` simple keeps the pipeline
        orchestration predictable.

        Parameters:
            - records: NV-Ingest records to index (format follows repository
              conventions)

        Returns:
            - Implementation-specific (for example a summary dict or a
              boolean success flag).
        """

    def reindex(self, records: list, **kwargs):
        """Optional hook for rebuilding or re-populating an index.

        Not abstract: the base implementation does nothing and returns None,
        and subclasses may override it. A typical override will:

        - optionally delete and recreate the index (via
          `create_index(recreate=True)`)
        - call `write_to_index(records)` to populate the new index

        Parameters:
            - records (list): records used to populate the index
            - recreate (bool, optional): whether to delete and recreate the
              index before writing

        Returns:
            - Implementation-specific result
        """
@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
44
44
  logger = logging.getLogger(__name__)
45
45
 
46
46
  CONSISTENCY = CONSISTENCY_BOUNDED
47
+ DENSE_INDEX_NAME = "dense_index"
47
48
 
48
49
  pandas_reader_map = {
49
50
  ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
93
94
  index_params = MilvusClient.prepare_index_params()
94
95
  index_params.add_index(
95
96
  field_name="vector",
96
- index_name="dense_index",
97
+ index_name=DENSE_INDEX_NAME,
97
98
  index_type="FLAT",
98
99
  metric_type="L2",
99
100
  )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
313
314
  if local_index:
314
315
  index_params.add_index(
315
316
  field_name="vector",
316
- index_name="dense_index",
317
+ index_name=DENSE_INDEX_NAME,
317
318
  index_type="FLAT",
318
319
  metric_type="L2",
319
320
  )
@@ -321,7 +322,7 @@ def create_nvingest_index_params(
321
322
  if gpu_index:
322
323
  index_params.add_index(
323
324
  field_name="vector",
324
- index_name="dense_index",
325
+ index_name=DENSE_INDEX_NAME,
325
326
  index_type="GPU_CAGRA",
326
327
  metric_type="L2",
327
328
  params={
@@ -335,7 +336,7 @@ def create_nvingest_index_params(
335
336
  else:
336
337
  index_params.add_index(
337
338
  field_name="vector",
338
- index_name="dense_index",
339
+ index_name=DENSE_INDEX_NAME,
339
340
  index_type="HNSW",
340
341
  metric_type="L2",
341
342
  params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
493
494
  if isinstance(indexes, dict):
494
495
  # Old Milvus behavior (< 2.5.6)
495
496
  for k, v in indexes.items():
496
- if k[1] == "dense_index" and hasattr(v, "_index_type"):
497
+ if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
497
498
  d_idx = v._index_type
498
499
  if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
499
500
  s_idx = v._index_type
@@ -504,7 +505,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
504
505
  index_name = getattr(idx, "index_name", None)
505
506
  index_type = getattr(idx, "index_type", None)
506
507
 
507
- if index_name == "dense_index":
508
+ if index_name == DENSE_INDEX_NAME:
508
509
  d_idx = index_type
509
510
  if sparse and index_name == "sparse_index":
510
511
  s_idx = index_type
@@ -891,7 +892,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
891
892
  logger.info(f"streamed {count} records")
892
893
 
893
894
 
894
- def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
895
+ def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
895
896
  """
896
897
  This function waits for the index to be built. It checks
897
898
  the indexed_rows of the index and waits for it to be equal
@@ -900,30 +901,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
900
901
  (refer to MilvusClient.refresh_load for bulk inserts).
901
902
  """
902
903
  client.flush(collection_name)
903
- index_names = utility.list_indexes(collection_name)
904
904
  indexed_rows = 0
905
- for index_name in index_names:
906
- indexed_rows = 0
907
- while indexed_rows < num_elements:
908
- pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
905
+ # poll each index listed in expected_rows_dict until it reports its expected row count
906
+ for index_name, rows_expected in expected_rows_dict.items():
907
+ indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
908
+ while indexed_rows < rows_expected:
909
+ # 0.5% of rows expected allowed without noticing an increase in indexed_rows
910
+ pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
909
911
  for i in range(20):
910
- new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
912
+ prev_indexed_rows = indexed_rows
913
+ indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
911
914
  time.sleep(1)
912
- logger.info(
913
- f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
914
- )
915
- if new_indexed_rows == num_elements:
916
- indexed_rows = new_indexed_rows
915
+ logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
916
+ if indexed_rows == rows_expected:
917
917
  break
918
918
  # check if indexed_rows is staying the same, too many times means something is wrong
919
- if new_indexed_rows == indexed_rows:
919
+ if indexed_rows == prev_indexed_rows:
920
920
  pos_movement -= 1
921
921
  else:
922
- pos_movement = 10
922
+ pos_movement = start_pos_movement
923
923
  # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
924
924
  if pos_movement == 0:
925
- raise ValueError("Rows are not getting indexed as expected")
926
- indexed_rows = new_indexed_rows
925
+ raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
927
926
  return indexed_rows
928
927
 
929
928
 
@@ -1043,6 +1042,13 @@ def write_to_nvingest_collection(
1043
1042
  if num_elements < threshold:
1044
1043
  stream = True
1045
1044
  if stream:
1045
+ # must be accessed/saved before adding new records
1046
+ index_names = utility.list_indexes(collection_name)
1047
+ expected_rows = {}
1048
+ for index_name in index_names:
1049
+ expected_rows[index_name] = (
1050
+ int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
1051
+ )
1046
1052
  stream_insert_milvus(
1047
1053
  cleaned_records,
1048
1054
  client,
@@ -1051,7 +1057,7 @@ def write_to_nvingest_collection(
1051
1057
  if not local_index:
1052
1058
  # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
1053
1059
  # know how long this should take, it is num_elements dependent.
1054
- wait_for_index(collection_name, num_elements, client)
1060
+ wait_for_index(collection_name, expected_rows, client)
1055
1061
  else:
1056
1062
  minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1057
1063
  bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
@@ -2002,6 +2008,12 @@ class Milvus(VDB):
2002
2008
  """
2003
2009
  kwargs = locals().copy()
2004
2010
  kwargs.pop("self", None)
2011
+ bucket_name = kwargs.get("bucket_name", None)
2012
+ if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
2013
+ raise ValueError(
2014
+ "You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
2015
+ f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
2016
+ )
2005
2017
  super().__init__(**kwargs)
2006
2018
 
2007
2019
  def create_index(self, **kwargs):
@@ -2057,3 +2069,24 @@ class Milvus(VDB):
2057
2069
  self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2058
2070
  else:
2059
2071
  raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2072
+ return records
2073
+
2074
+ def run_async(self, records):
2075
+ collection_name, create_params = self.get_connection_params()
2076
+ _, write_params = self.get_write_params()
2077
+ if isinstance(collection_name, str):
2078
+ logger.info(f"creating index - {collection_name}")
2079
+ self.create_index(collection_name=collection_name, **create_params)
2080
+ records = records.result()
2081
+ logger.info(f"writing to index, for collection - {collection_name}")
2082
+ self.write_to_index(records, **write_params)
2083
+ elif isinstance(collection_name, dict):
2084
+ split_params_list = _dict_to_params(collection_name, write_params)
2085
+ for sub_params in split_params_list:
2086
+ coll_name, sub_write_params = sub_params
2087
+ sub_write_params.pop("collection_name", None)
2088
+ self.create_index(collection_name=coll_name, **create_params)
2089
+ self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2090
+ else:
2091
+ raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2092
+ return records
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.11.9.dev20251109
3
+ Version: 2025.11.27.dev20251127
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -6,13 +6,13 @@ nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T
6
6
  nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
7
7
  nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
8
8
  nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
9
- nv_ingest_client/client/client.py,sha256=3uA54D4Y6lSS-Nvz8R8uzkHkoV8vJu8GPQQRPoc-Uxk,77368
9
+ nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
10
10
  nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
11
- nv_ingest_client/client/interface.py,sha256=OCbH_5Q-cv1V4HpLBxLdaPCeaNKNkdEYi1JS4Tu6DGY,54745
11
+ nv_ingest_client/client/interface.py,sha256=XQ2hHNBsL-Nnsk_w48UMxFqxfkO0CdQ2AOQZEdXU3OA,59990
12
12
  nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
13
13
  nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
14
14
  nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
15
- nv_ingest_client/primitives/jobs/job_spec.py,sha256=teAZbpvxn25jIEUP5YJsAX_E_z9iWhejS-uy5opshFM,15681
15
+ nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
16
16
  nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
17
17
  nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
18
18
  nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
@@ -20,9 +20,10 @@ nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcY
20
20
  nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
21
21
  nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
22
22
  nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
23
- nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
23
+ nv_ingest_client/primitives/tasks/extract.py,sha256=ec2aKPU9OMOOw-oalQKAPaNRqgkREQ0ByLkFVqutD6E,9339
24
24
  nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
25
25
  nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
26
+ nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
26
27
  nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
27
28
  nv_ingest_client/primitives/tasks/store.py,sha256=nIOnCH8vw4FLCLVBJYnsS5Unc0QmuO_jEtUp7-E9FU4,4199
28
29
  nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
@@ -39,17 +40,17 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
39
40
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
40
41
  nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
41
42
  nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
42
- nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
43
+ nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
43
44
  nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
44
45
  nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
46
+ nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
46
47
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
47
- nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
48
- nv_ingest_client/util/vdb/milvus.py,sha256=6XWRh2SDJlgVZOKZVXG3cZTB4L-ZHIiiTenuIzkxp2Y,78704
48
+ nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
49
+ nv_ingest_client/util/vdb/milvus.py,sha256=jCQyWb6xoQ6utGNccASmN09eJbwF2HlgrGGIkpoUfI8,80792
49
50
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
50
- nv_ingest_client-2025.11.9.dev20251109.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
51
- nv_ingest_client-2025.11.9.dev20251109.dist-info/METADATA,sha256=3R6ABapewDWxuap4Se2ovqRlEIrte-XjL_1AhqMHKN4,30626
52
- nv_ingest_client-2025.11.9.dev20251109.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
53
- nv_ingest_client-2025.11.9.dev20251109.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
54
- nv_ingest_client-2025.11.9.dev20251109.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
55
- nv_ingest_client-2025.11.9.dev20251109.dist-info/RECORD,,
51
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
52
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/METADATA,sha256=jul59WHL8-9IYR27iL9ilxkw7IQRnqb7EMqBfJh7IGk,30627
53
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
54
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
55
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
56
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD,,