nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.11.27.dev20251127__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
  if not self.job_queue_id:
      logger.warning("job_queue_id is not set; submission of new jobs will fail.")

+ # Executor check required for run_async
+ if not hasattr(client, "_worker_pool"):
+     raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
+     raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+ self._executor = client._worker_pool
+
  # --------------------------------------------------------------------------
  # Private Methods
  # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
  # Attempt to mark state as FAILED locally in the client (best effort)
  try:
      # Use a method assumed to safely get the state object
-     job_state = self.client._get_job_state_object(job_index)
+     job_state = self.client._get_and_check_job_state(job_index)
      # Check state exists and is not already terminal before updating
      if (
          job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:

  return batch_futures_dict, normalized_job_indices

- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+ # --------------------------------------------------------------------------
+ # Core Processing Logic
+ # --------------------------------------------------------------------------
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
      """
      Executes the main processing loop in batches.

@@ -640,6 +650,44 @@ class _ConcurrentProcessor:

  return self.results, self.failures, self.traces if self.return_traces else []

+ # --------------------------------------------------------------------------
+ # Public Methods
+ # --------------------------------------------------------------------------
+
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+     """
+     Executes the main processing loop synchronously.
+
+     This method orchestrates the job processing by maintaining a constant
+     pool of in-flight jobs, handling submissions, fetches, and retries until
+     all jobs are complete. It blocks until all jobs are processed.
+
+     Returns
+     -------
+     Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+         A tuple containing:
+         1. A list of successfully fetched job results.
+         2. A list of tuples for failed jobs (job_index, error_message).
+         3. A list of trace dictionaries if `return_traces` was True.
+     """
+     return self._process_all_jobs()
+
+ def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+     """
+     Executes the main processing loop asynchronously.
+
+     Submits the entire processing logic to the client's background
+     thread pool and returns a Future that resolves with the final
+     results, failures, and traces once all jobs are complete.
+
+     Returns
+     -------
+     Future
+         A future representing the asynchronous execution. Its result()
+         will be a tuple containing (results, failures, traces).
+     """
+     return self._executor.submit(self._process_all_jobs)
+

  class NvIngestClient:
      """
@@ -1377,6 +1425,68 @@ class NvIngestClient:
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
  return results

+ def process_jobs_concurrently_async(
+     self,
+     job_indices: Union[str, List[str]],
+     job_queue_id: Optional[str] = None,
+     batch_size: Optional[int] = None,
+     timeout: int = 100,
+     max_job_retries: Optional[int] = None,
+     retry_delay: float = 0.5,
+     initial_fetch_delay: float = 0.3,
+     fail_on_submit_error: bool = False,
+     completion_callback: Optional[Callable[[Any, str], None]] = None,
+     stream_to_callback_only: bool = False,
+     return_full_response: bool = False,
+     verbose: bool = False,
+     return_traces: bool = False,
+ ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+     """
+     Submit and fetch multiple jobs concurrently and asynchronously.
+
+     This method initializes the processing and returns a Future immediately. The Future
+     will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+     jobs have completed.
+
+     Parameters are identical to `process_jobs_concurrently`.
+
+     Returns
+     -------
+     Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+         A future that completes when all jobs are done. Its result is a tuple
+         containing (successful_results, failures, traces).
+     """
+     if isinstance(job_indices, str):
+         job_indices = [job_indices]
+
+     if not job_indices:
+         immediate_future: Future = Future()
+         immediate_future.set_result(([], [], []))
+         return immediate_future
+
+     validated_batch_size = self._validate_batch_size(batch_size)
+     effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+     processor = _ConcurrentProcessor(
+         client=self,
+         batch_size=validated_batch_size,
+         job_indices=job_indices,
+         job_queue_id=job_queue_id,
+         timeout=effective_timeout,
+         max_job_retries=max_job_retries,
+         retry_delay=retry_delay,
+         initial_fetch_delay=initial_fetch_delay,
+         completion_callback=completion_callback,
+         fail_on_submit_error=fail_on_submit_error,
+         stream_to_callback_only=stream_to_callback_only,
+         return_full_response=return_full_response,
+         verbose=verbose,
+         return_traces=return_traces,
+     )
+
+     # Asynchronous call
+     return processor.run_async()
+
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
      """
      Block until all specified jobs have been marked submitted.
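A usage sketch for the new public entry point (illustrative only: the client construction and job specs are assumed to exist already; parameter values are examples, not defaults):

    client = NvIngestClient()
    job_ids = client.add_job(job_specs)  # job_specs assumed built elsewhere
    future = client.process_jobs_concurrently_async(
        job_indices=job_ids,
        job_queue_id="ingest_task_queue",  # hypothetical queue name
        return_traces=True,
    )
    results, failures, traces = future.result(timeout=300)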
@@ -13,6 +13,7 @@ import os
  import shutil
  import tempfile
  import threading
+ from io import BytesIO
  from concurrent.futures import Future
  from concurrent.futures import ThreadPoolExecutor
  from concurrent.futures import as_completed
@@ -224,6 +225,7 @@ class Ingestor:
      **kwargs,
  ):
      self._documents = documents or []
+     self._buffers = []
      self._client = client
      self._job_queue_id = job_queue_id
      self._vdb_bulk_upload = None
@@ -352,6 +354,28 @@ class Ingestor:

  return self

+ def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+     """
+     Add buffers for processing.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         List of tuples containing the name of the buffer and the BytesIO object.
+     """
+     if (
+         isinstance(buffers, tuple)
+         and len(buffers) == 2
+         and isinstance(buffers[0], str)
+         and isinstance(buffers[1], BytesIO)
+     ):
+         buffers = [buffers]
+     self._buffers.extend(buffers)
+     self._job_specs = BatchJobSpec(self._buffers)
+     self._all_local = True
+
+     return self
+
  def load(self, **kwargs) -> "Ingestor":
      """
      Ensure all document files are accessible locally, downloading if necessary.
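The tuple form lets a single `(name, BytesIO)` pair be passed without wrapping it in a list. A short sketch of the new in-memory entry point (hedged: the surrounding pipeline calls are illustrative):

    from io import BytesIO
    from nv_ingest_client.client import Ingestor

    ingestor = Ingestor().buffers([
        ("notes.txt", BytesIO(b"plain text payload")),
        ("report.pdf", BytesIO(pdf_bytes)),  # pdf_bytes assumed loaded elsewhere
    ])
    results = ingestor.extract().ingest()  # typical follow-on calls; adjust to your pipeline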
@@ -669,57 +693,133 @@ class Ingestor:

  return tuple(returns) if len(returns) > 1 else results

- def ingest_async(self, **kwargs: Any) -> Future:
+ def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
      """
      Asynchronously submits jobs and returns a single future that completes when all jobs have finished.

+     The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+     `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+     upload is configured, the future will complete *after* the VDB upload finishes.
+
      Parameters
      ----------
+     return_failures : bool, optional
+         If True, return a tuple containing failures; otherwise, only return results. Default is False.
+     return_traces : bool, optional
+         If True, return trace metrics alongside results. Default is False.
      kwargs : dict
-         Additional parameters for the `submit_job_async` method.
+         Additional parameters passed to the concurrent processor.
+         Optional flags include `include_parent_trace_ids=True` to also return
+         parent job trace identifiers (V2 API only).

      Returns
      -------
-     Future
-         A future that completes when all submitted jobs have reached a terminal state.
+     Future[Union[List[Any], Tuple[Any, ...]]]
+         A future that completes when all jobs and any subsequent VDB upload
+         have finished. Its result will be one of the following:
+         - Default: list of results
+         - return_failures=True: (results, failures)
+         - return_traces=True: (results, traces)
+         - return_failures=True, return_traces=True: (results, failures, traces)
+
      """
-     self._prepare_ingest_run()
+     try:
+         self._prepare_ingest_run()

-     self._job_ids = self._client.add_job(self._job_specs)
+         # Add jobs locally first
+         if self._job_specs is None:
+             raise RuntimeError("Job specs missing for ingest_async.")
+         self._job_ids = self._client.add_job(self._job_specs)
+         self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}

-     future_to_job_id = self._client.submit_job_async(self._job_ids, self._job_queue_id, **kwargs)
-     self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)

-     combined_future = Future()
-     submitted_futures = set(future_to_job_id.keys())
-     completed_futures = set()
-     future_results = []
-     vdb_future = None
+         final_future: Future = Future()

-     def _done_callback(future):
-         job_id = future_to_job_id[future]
-         job_state = self._job_states[job_id]
-         try:
-             result = self._client.fetch_job_result(job_id)
-             if job_state.state != JobStateEnum.COMPLETED:
-                 job_state.state = JobStateEnum.COMPLETED
-         except Exception:
-             result = None
-             if job_state.state != JobStateEnum.FAILED:
-                 job_state.state = JobStateEnum.FAILED
-         completed_futures.add(future)
-         future_results.extend(result)
-         if completed_futures == submitted_futures:
-             combined_future.set_result(future_results)
+         processor_future = self._client.process_jobs_concurrently_async(
+             job_indices=self._job_ids,
+             job_queue_id=self._job_queue_id,
+             return_traces=return_traces,
+             **proc_kwargs,
+         )

-     for future in future_to_job_id:
-         future.add_done_callback(_done_callback)
+         include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))

-     if self._vdb_bulk_upload:
-         executor = ThreadPoolExecutor(max_workers=1)
-         vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
+         def _processor_done_callback(proc_future: Future):
+             """Callback to handle completion, VDB upload, and final result setting."""
+             try:
+                 if proc_future.cancelled():
+                     if not final_future.done():
+                         final_future.cancel()
+                     return
+                 if proc_future.exception():
+                     if not final_future.done():
+                         final_future.set_exception(proc_future.exception())
+                     return
+
+                 results, failures, traces_list = proc_future.result()
+
+                 failed_job_ids = set()
+                 for job_id_with_source, error_msg in failures:
+                     job_id = job_id_with_source.split(":", 1)[0]
+                     if job_id in self._job_states:
+                         if self._job_states[job_id].state != JobStateEnum.FAILED:
+                             self._job_states[job_id].state = JobStateEnum.FAILED
+                     failed_job_ids.add(job_id)
+
+                 all_submitted_job_ids = set(self._job_ids)
+                 successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+                 for job_id in successful_job_ids:
+                     if job_id in self._job_states:
+                         if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+                             self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+                 if self._vdb_bulk_upload and results:
+                     with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+                         results_future = Future()
+                         results_future.set_result(results)
+                         vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+                         vdb_future.result()
+
+                 parent_trace_ids = (
+                     self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+                 )

-     return combined_future if not vdb_future else vdb_future
+                 returns = [results]
+                 if return_failures:
+                     returns.append(failures)
+                 if return_traces:
+                     returns.append(traces_list)
+                 if include_parent_trace_ids:
+                     returns.append(parent_trace_ids)
+
+                 final_result = tuple(returns) if len(returns) > 1 else results
+
+                 if not final_future.done():
+                     final_future.set_result(final_result)
+
+             except Exception as e:
+                 logger.exception("Error in ingest_async processor callback")
+                 if not final_future.done():
+                     final_future.set_exception(e)
+             finally:
+                 final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+                 for job_state in self._job_states.values():
+                     if (
+                         job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+                         and job_state.state != final_state
+                     ):
+                         job_state.state = final_state
+
+         processor_future.add_done_callback(_processor_done_callback)
+         return final_future
+
+     except Exception as setup_err:
+         logger.exception("Failed during synchronous setup of ingest_async")
+         error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+         error_future.set_exception(setup_err)
+         return error_future

  @ensure_job_specs
  def _prepare_ingest_run(self):
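A usage sketch for the reworked method (names illustrative; the unpacking shape follows the flag table in the docstring above):

    ingestor = Ingestor().files("data/*.pdf").extract()  # setup assumed
    future = ingestor.ingest_async(return_failures=True, return_traces=True)
    results, failures, traces = future.result()  # resolves after jobs and any VDB upload finish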
@@ -10,6 +10,7 @@ from typing import Dict
  from typing import List
  from typing import Optional
  from typing import Union
+ from typing import Tuple
  from uuid import UUID

  from nv_ingest_client.primitives.tasks import Task
@@ -222,7 +223,9 @@ class BatchJobSpec:
      A dictionary that maps document types to a list of `JobSpec` instances.
  """

- def __init__(self, job_specs_or_files: Optional[Union[List[JobSpec], List[str]]] = None) -> None:
+ def __init__(
+     self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+ ) -> None:
      """
      Initializes the BatchJobSpec instance.

@@ -239,6 +242,13 @@ class BatchJobSpec:
      self.from_job_specs(job_specs_or_files)
  elif isinstance(job_specs_or_files[0], str):
      self.from_files(job_specs_or_files)
+ elif (
+     isinstance(job_specs_or_files[0], tuple)
+     and len(job_specs_or_files[0]) == 2
+     and isinstance(job_specs_or_files[0][0], str)
+     and isinstance(job_specs_or_files[0][1], BytesIO)
+ ):
+     self.from_buffers(job_specs_or_files)
  else:
      raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")

@@ -282,6 +292,21 @@ class BatchJobSpec:
      for job_spec in job_specs:
          self.add_job_spec(job_spec)

+ def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+     """
+     Initializes the batch from a list of buffers.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         A list of tuples containing the name of the buffer and the BytesIO object.
+     """
+     from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+     job_specs = create_job_specs_for_buffers(buffers)
+     for job_spec in job_specs:
+         self.add_job_spec(job_spec)
+
  def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
      """
      Internal method to initialize the batch from a dataset.
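With `from_buffers` wired into the constructor dispatch above, all three accepted input forms route correctly; a sketch (import path assumed from the package layout):

    from io import BytesIO
    from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec

    batch = BatchJobSpec([("readme.md", BytesIO(b"# hello"))])  # tuple form -> from_buffers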
@@ -145,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:

  logger.debug(f"Content extracted from '{path}'")
  return content, DocumentTypeEnum(document_type)
+
+
+ def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+     """
+     Extracts the content and type from a buffer.
+     """
+     document_type = get_or_infer_file_type(buffer[0])
+     try:
+         if document_type in [
+             DocumentTypeEnum.TXT,
+             DocumentTypeEnum.MD,
+             DocumentTypeEnum.HTML,
+         ]:
+             content = detect_encoding_and_read_text_file(buffer[1])
+         else:
+             content = serialize_to_base64(buffer[1])
+     except Exception as e:
+         logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+         raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+     logger.debug(f"Content extracted from '{buffer[0]}'")
+     return content, DocumentTypeEnum(document_type)
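Behavior in brief: text-like document types (TXT, MD, HTML) are decoded to text, everything else is serialized to base64. A sketch (assuming the helpers infer the type from the buffer name):

    from io import BytesIO

    content, doc_type = extract_content_from_buffer(("notes.txt", BytesIO(b"hello")))
    # doc_type is DocumentTypeEnum.TXT here; content is decoded text, not base64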
@@ -12,10 +12,12 @@ import math
  import heapq
  from typing import Dict
  from typing import List
+ from typing import Tuple
+ from io import BytesIO

  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
  from nv_ingest_client.primitives.jobs.job_spec import JobSpec
- from nv_ingest_client.util.file_processing.extract import extract_file_content
+ from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer

  logger = logging.getLogger(__name__)

@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
  return job_specs


+ def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
+     """
+     Create job specifications (JobSpecs) for a list of buffers.
+
+     This function takes a list of buffers, processes each buffer to extract its content and type,
+     and creates a job specification (JobSpec) for each buffer.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         A list of tuples containing the name of the buffer and the BytesIO object.
+
+     Returns
+     -------
+     List[JobSpec]
+         A list of JobSpecs.
+     """
+
+     job_specs = []
+     for name, buffer in buffers:
+         content, file_type = extract_content_from_buffer((name, buffer))
+         job_spec = JobSpec(
+             document_type=file_type,
+             payload=content,
+             source_id=name,
+             source_name=name,
+         )
+         job_specs.append(job_spec)
+
+     return job_specs
+
+
  def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
      """
      Apply PDF split configuration to a list of JobSpec objects.
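A sketch of the helper in isolation (buffer names and payloads illustrative):

    from io import BytesIO

    specs = create_job_specs_for_buffers([
        ("a.txt", BytesIO(b"alpha")),
        ("b.txt", BytesIO(b"beta")),
    ])
    # two JobSpecs, with source_id/source_name taken from the buffer names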
@@ -1,27 +1,243 @@
  from abc import ABC, abstractmethod


+ """Abstract Vector Database (VDB) operator API.
+
+ This module defines the `VDB` abstract base class which specifies the
+ interface that custom vector-database operators must implement to integrate
+ with NV-Ingest.
+
+ The implementation details and an example OpenSearch operator are described
+ in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+ production-ready OpenSearch implementation is available at
+ `client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+ Design goals:
+ - Provide a small, well-documented interface that supports common vector
+   database operations: index creation, batch ingestion, nearest-neighbor
+   retrieval, and a simple `run` orchestration entry-point used by the
+   NV-Ingest pipeline.
+ - Keep the API flexible by accepting `**kwargs` on methods so implementers can
+   pass database-specific options without changing the interface.
+
+ Typical implementation notes (inferred from the example OpenSearch operator):
+ - Constructor accepts connection and index configuration parameters such as
+   `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+   types (e.g. `enable_text`, `enable_images`).
+ - `create_index` should be able to create (and optionally recreate) an
+   index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+ - `write_to_index` should accept batches of NV-Ingest records, perform
+   validation/transformation, and write documents into the database efficiently
+   (bulk APIs are recommended).
+ - `retrieval` should accept a list of textual queries, convert them to
+   embeddings (by calling an external embedding service or model), perform a
+   vector search (top-k), and return cleaned results (e.g., removing stored
+   dense vectors from returned payloads).
+ """
+
+
  class VDB(ABC):
+     """Abstract base class for Vector Database operators.
+
+     Subclasses must implement the abstract methods below. The interface is
+     intentionally small and uses `**kwargs` to allow operator-specific
+     configuration without changing the common API.
+
+     Example (high level):
+
+         class OpenSearch(VDB):
+             def __init__(self, **kwargs):
+                 # parse kwargs, initialize client, call super().__init__(**kwargs)
+                 ...
+
+             def create_index(self, **kwargs):
+                 # create index, mappings, settings
+                 ...
+
+             def write_to_index(self, records: list, **kwargs):
+                 # transform NV-Ingest records and write to database
+                 ...
+
+             def retrieval(self, queries: list, **kwargs):
+                 # convert queries to embeddings, k-NN search, format results
+                 ...
+
+             def run(self, records):
+                 # orchestrate create_index + write_to_index
+                 ...
+
+     Notes on recommended constructor parameters (not enforced by this ABC):
+     - host (str): database hostname (default: 'localhost')
+     - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+     - index_name (str): base index name used by the operator
+     - dense_dim (int): dimensionality of stored dense embeddings
+     - enable_text/enable_images/... (bool): content-type toggles used when
+       extracting text from NV-Ingest records before indexing
+
+     The concrete operator may accept additional parameters (username,
+     password, ssl options, client-specific flags). Passing these via
+     `**kwargs` is the intended pattern.
+     """

      @abstractmethod
      def __init__(self, **kwargs):
+         """Initialize the VDB operator.
+
+         Implementations should extract configuration values from `kwargs`
+         (or use defaults) and initialize any client connections required to
+         talk to the target vector database. Implementations are encouraged to
+         call `super().__init__(**kwargs)` only if they want the base-class
+         behavior of storing kwargs on the instance (the base class itself does
+         not require that behavior).
+
+         Parameters (suggested/common):
+         - host (str): database host
+         - port (int): database port
+         - index_name (str): base name for created indices
+         - dense_dim (int): embedding vector dimension
+         - enable_text (bool): whether text content should be extracted/indexed
+         - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+         The constructor should not perform heavy operations (like creating
+         indices) unless explicitly desired; prefer leaving that work to
+         `create_index` to make the operator easier to test.
+         """
          self.__dict__.update(kwargs)

      @abstractmethod
      def create_index(self, **kwargs):
+         """Create and configure the index(es) required by this operator.
+
+         Implementations must ensure an appropriate index (or indices) exist
+         before data ingestion. For vector indexes this typically means
+         creating settings and mappings that enable k-NN/vector search (for
+         example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+         engine-specific parameters).
+
+         Common keyword arguments (operator-specific):
+         - recreate (bool): if True, delete and recreate the index even if it
+           already exists (default: False)
+         - index_name (str): override the operator's configured index name for
+           this call
+
+         Returns:
+             implementation-specific result (e.g., a boolean, the created
+             index name, or the raw response from the database client). There
+             is no strict requirement here because different DB clients return
+             different values; document behavior in concrete implementations.
+         """
          pass

      @abstractmethod
      def write_to_index(self, records: list, **kwargs):
+         """Write a batch of NV-Ingest records to the vector database.
+
+         This method receives `records` formatted as NV-Ingest provides them
+         (commonly a list of record-sets). Implementations are responsible for
+         transforming each record into the target database document format,
+         validating the presence of embeddings and content, and using the most
+         efficient ingestion API available (for example a bulk endpoint).
+
+         Expected behavior:
+         - Iterate over the provided `records` (which can be nested lists of
+           record dictionaries) and transform each record to the DB document
+           structure (fields such as `dense` for the vector, `text` for the
+           content, and `metadata` for auxiliary fields are common in the
+           repository examples).
+         - Skip records missing required fields (for example, missing
+           embeddings) and log or report failures as appropriate.
+         - Use batching / bulk APIs to reduce overhead when writing large
+           volumes of documents.
+
+         Parameters:
+         - records (list): NV-Ingest records (see repository examples for
+           structure)
+         - batch_size (int, optional): how many documents to send per bulk
+           request; database-specific implementations can use this hint
+
+         Returns:
+             implementation-specific result (e.g., number of documents
+             indexed, client response for bulk API). Concrete implementations
+             should document exact return values and failure semantics.
+         """
          pass

      @abstractmethod
      def retrieval(self, queries: list, **kwargs):
+         """Perform similarity search for a list of text queries.
+
+         The typical retrieval flow implemented by operators in this ecosystem
+         is:
+         1. Convert each textual `query` into a dense embedding using an
+            external embedding model or service (the example uses an NVIDIA
+            embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+         2. Issue a vector (k-NN) search to the database using the generated
+            embedding, requesting the top-k (configurable) neighbors.
+         3. Post-process results (for example, remove stored dense vectors
+            from returned documents to reduce payload size) and return a
+            list-of-lists of result documents aligned with the input `queries`.
+
+         Keyword arguments (common):
+         - index_name (str): index to search (default: operator's configured
+           index_name)
+         - top_k (int): number of nearest neighbors to return (default: 10)
+         - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+           when the operator integrates with an external embedding service.
+
+         Parameters:
+         - queries (list[str]): list of text queries to be vectorized and
+           searched
+
+         Returns:
+         - results (list[list[dict]]): for each query, a list of hit documents
+           (concrete implementations should specify the document shape they
+           return). Operators should remove large binary/vector fields from
+           responses where possible.
+         """
          pass

      @abstractmethod
      def run(self, records):
+         """Main entry point used by the NV-Ingest pipeline.
+
+         The `run` method is intended to be a simple orchestration layer that
+         ensures the index exists and then ingests provided records. A minimal
+         recommended implementation is::
+
+             def run(self, records):
+                 self.create_index()
+                 self.write_to_index(records)
+
+         Implementers can add pre/post hooks, metrics, retries, or error
+         handling as needed for production readiness. Keep `run` simple so the
+         pipeline orchestration remains predictable.
+
+         Parameters:
+         - records: NV-Ingest records to index (format follows repository
+           conventions)
+
+         Returns:
+         - implementation-specific result (for example, a summary dict or
+           boolean success flag).
+         """
          pass

      def reindex(self, records: list, **kwargs):
+         """Optional helper to rebuild or re-populate indexes with new data.
+
+         This non-abstract method is provided as an optional hook that concrete
+         classes may override. A typical reindex implementation will:
+         - optionally delete the existing index and recreate it (via
+           `create_index(recreate=True)`)
+         - call `write_to_index(records)` to populate the new index
+
+         Parameters:
+         - records (list): records used to populate the index
+         - recreate (bool, optional): whether to delete and recreate the
+           index before writing
+
+         Returns:
+         - implementation-specific result
+         """
          pass
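To make the contract concrete, here is a minimal in-memory operator that satisfies the ABC (a toy sketch for illustration only; the record shape with a `metadata.embedding` field is an assumption based on the notes above, not a guarantee of this module):

    class InMemoryVDB(VDB):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)  # stores kwargs on the instance
            self._docs = []

        def create_index(self, **kwargs):
            # "recreate" mirrors the optional flag described in the docstrings above
            if kwargs.get("recreate", False):
                self._docs = []
            return True

        def write_to_index(self, records: list, **kwargs):
            # flatten nested record-sets and keep only records with embeddings
            for record_set in records:
                for record in record_set:
                    if record.get("metadata", {}).get("embedding") is not None:
                        self._docs.append(record)
            return len(self._docs)

        def retrieval(self, queries: list, **kwargs):
            top_k = kwargs.get("top_k", 10)
            # no real vector math here; return the first top_k docs per query
            return [self._docs[:top_k] for _ in queries]

        def run(self, records):
            self.create_index()
            self.write_to_index(records)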
@@ -892,7 +892,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
  logger.info(f"streamed {count} records")


- def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
+ def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
      """
      This function waits for the index to be built. It checks
      the indexed_rows of the index and waits for it to be equal
@@ -901,32 +901,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
      (refer to MilvusClient.refresh_load for bulk inserts).
      """
      client.flush(collection_name)
-     # index_names = utility.list_indexes(collection_name)
      indexed_rows = 0
      # observe dense_index, all indexes get populated simultaneously
-     for index_name in [DENSE_INDEX_NAME]:
-         indexed_rows = 0
-         expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
-         while indexed_rows < expected_rows:
-             pos_movement = 10  # number of iterations allowed without noticing an increase in indexed_rows
+     for index_name, rows_expected in expected_rows_dict.items():
+         indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+         while indexed_rows < rows_expected:
+             # 0.5% of rows expected allowed without noticing an increase in indexed_rows
+             pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
              for i in range(20):
-                 current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+                 prev_indexed_rows = indexed_rows
+                 indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                  time.sleep(1)
-                 logger.info(
-                     f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
-                 )
-                 if current_indexed_rows == expected_rows:
-                     indexed_rows = current_indexed_rows
+                 logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
+                 if indexed_rows == rows_expected:
                      break
                  # check if indexed_rows is staying the same; too many stagnant checks means something is wrong
-                 if current_indexed_rows == indexed_rows:
+                 if indexed_rows == prev_indexed_rows:
                      pos_movement -= 1
                  else:
-                     pos_movement = 10
+                     pos_movement = start_pos_movement
                  # if pos_movement hits 0, raise an error: the rows are not getting indexed as expected
                  if pos_movement == 0:
                      raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
-             indexed_rows = current_indexed_rows
      return indexed_rows

@@ -1046,6 +1042,13 @@ def write_to_nvingest_collection(
  if num_elements < threshold:
      stream = True
  if stream:
+     # must be accessed/saved before adding new records
+     index_names = utility.list_indexes(collection_name)
+     expected_rows = {}
+     for index_name in index_names:
+         expected_rows[index_name] = (
+             int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
+         )
      stream_insert_milvus(
          cleaned_records,
          client,
@@ -1054,7 +1057,7 @@ def write_to_nvingest_collection(
      if not local_index:
          # Make sure all rows are indexed; decided not to wrap in a timeout because we don't
          # know how long this should take, it is num_elements dependent.
-         wait_for_index(collection_name, num_elements, client)
+         wait_for_index(collection_name, expected_rows, client)
  else:
      minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
      bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
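To make the new stall budget concrete: with 100,000 rows still expected, `pos_movement` starts at max(100000 * 0.005, 10) = 500 stagnant polls, while a nearly finished index (say 1,000 rows remaining) falls back to the floor of 10, matching the old fixed budget:

    rows_expected, indexed_rows = 100_000, 0
    assert max((rows_expected - indexed_rows) * 0.005, 10) == 500
    assert max((1_000 - 0) * 0.005, 10) == 10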
@@ -2005,6 +2008,12 @@ class Milvus(VDB):
      """
      kwargs = locals().copy()
      kwargs.pop("self", None)
+     bucket_name = kwargs.get("bucket_name", None)
+     if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
+         raise ValueError(
+             "You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
+             f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
+         )
      super().__init__(**kwargs)

  def create_index(self, **kwargs):
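In practice a custom bucket must now come from the environment rather than the constructor; a sketch (assuming `ClientConfigSchema` reads `MINIO_BUCKET` when instantiated):

    import os
    os.environ["MINIO_BUCKET"] = "my-ingest-bucket"  # set before the client config is built
    milvus = Milvus()  # passing bucket_name="my-ingest-bucket" would also pass the check, since it matches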
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nv-ingest-client
- Version: 2025.11.17.dev20251117
+ Version: 2025.11.27.dev20251127
  Summary: Python client for the nv-ingest service
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
  License: Apache License
@@ -6,13 +6,13 @@ nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T
  nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
  nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
  nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
- nv_ingest_client/client/client.py,sha256=3uA54D4Y6lSS-Nvz8R8uzkHkoV8vJu8GPQQRPoc-Uxk,77368
+ nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
  nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
- nv_ingest_client/client/interface.py,sha256=Y6JnjaRytlBrhgbU6MJYm2dblLvoYxWEB35TETZDSwk,55022
+ nv_ingest_client/client/interface.py,sha256=XQ2hHNBsL-Nnsk_w48UMxFqxfkO0CdQ2AOQZEdXU3OA,59990
  nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
  nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
  nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
- nv_ingest_client/primitives/jobs/job_spec.py,sha256=TBz5u7KRdQjQvqD0mMzwjTK9Jl3p7yTIknQQs0lfnV8,15909
+ nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
  nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
  nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
  nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
@@ -40,17 +40,17 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
  nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
  nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
- nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
+ nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
  nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
  nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- nv_ingest_client/util/file_processing/extract.py,sha256=Hjtem4bJWum1bbUPw7_TG-0Z2-7PsH4bBuqTF7bLn88,4794
+ nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
- nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
- nv_ingest_client/util/vdb/milvus.py,sha256=LHZ4Z6fHk8vQUGQFJ3FZ5iay0Ike6Zur-K9yMiPxe44,80141
+ nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
+ nv_ingest_client/util/vdb/milvus.py,sha256=jCQyWb6xoQ6utGNccASmN09eJbwF2HlgrGGIkpoUfI8,80792
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
- nv_ingest_client-2025.11.17.dev20251117.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- nv_ingest_client-2025.11.17.dev20251117.dist-info/METADATA,sha256=bgCG3WP30zjURzJ_SZEm3fDbby-NoICZDYfbiA3sSjg,30627
- nv_ingest_client-2025.11.17.dev20251117.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- nv_ingest_client-2025.11.17.dev20251117.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
- nv_ingest_client-2025.11.17.dev20251117.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
- nv_ingest_client-2025.11.17.dev20251117.dist-info/RECORD,,
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/METADATA,sha256=jul59WHL8-9IYR27iL9ilxkw7IQRnqb7EMqBfJh7IGk,30627
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD,,