nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.11.27.dev20251127__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
  if not self.job_queue_id:
      logger.warning("job_queue_id is not set; submission of new jobs will fail.")

+ # Executor check required for run_async
+ if not hasattr(client, "_worker_pool"):
+     raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
+     raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+ self._executor = client._worker_pool
+
  # --------------------------------------------------------------------------
  # Private Methods
  # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
  # Attempt to mark state as FAILED locally in the client (best effort)
  try:
      # Use a method assumed to safely get the state object
-     job_state = self.client._get_job_state_object(job_index)
+     job_state = self.client._get_and_check_job_state(job_index)
      # Check state exists and is not already terminal before updating
      if (
          job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:

  return batch_futures_dict, normalized_job_indices

- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+ # --------------------------------------------------------------------------
+ # Core Processing Logic
+ # --------------------------------------------------------------------------
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
      """
      Executes the main processing loop in batches.

@@ -640,6 +650,44 @@ class _ConcurrentProcessor:

  return self.results, self.failures, self.traces if self.return_traces else []

+ # --------------------------------------------------------------------------
+ # Public Methods
+ # --------------------------------------------------------------------------
+
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+     """
+     Executes the main processing loop synchronously.
+
+     This method orchestrates the job processing by maintaining a constant
+     pool of in-flight jobs, handling submissions, fetches, and retries until
+     all jobs are complete. It blocks until all jobs are processed.
+
+     Returns
+     -------
+     Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+         A tuple containing:
+         1. A list of successfully fetched job results.
+         2. A list of tuples for failed jobs (job_index, error_message).
+         3. A list of trace dictionaries if `return_traces` was True.
+     """
+     return self._process_all_jobs()
+
+ def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+     """
+     Executes the main processing loop asynchronously.
+
+     Submits the entire processing logic to the client's background
+     thread pool and returns a Future that resolves with the final
+     results, failures, and traces once all jobs are complete.
+
+     Returns
+     -------
+     Future
+         A future representing the asynchronous execution. Its result()
+         will be a tuple containing (results, failures, traces).
+     """
+     return self._executor.submit(self._process_all_jobs)
+

  class NvIngestClient:
      """
@@ -1377,6 +1425,68 @@ class NvIngestClient:
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
  return results

+ def process_jobs_concurrently_async(
+     self,
+     job_indices: Union[str, List[str]],
+     job_queue_id: Optional[str] = None,
+     batch_size: Optional[int] = None,
+     timeout: int = 100,
+     max_job_retries: Optional[int] = None,
+     retry_delay: float = 0.5,
+     initial_fetch_delay: float = 0.3,
+     fail_on_submit_error: bool = False,
+     completion_callback: Optional[Callable[[Any, str], None]] = None,
+     stream_to_callback_only: bool = False,
+     return_full_response: bool = False,
+     verbose: bool = False,
+     return_traces: bool = False,
+ ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+     """
+     Submit and fetch multiple jobs concurrently and asynchronously.
+
+     This method initializes the processing and returns a Future immediately. The Future
+     will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+     jobs have completed.
+
+     Parameters are identical to `process_jobs_concurrently`.
+
+     Returns
+     -------
+     Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+         A future that completes when all jobs are done. Its result is a tuple
+         containing (successful_results, failures, traces).
+     """
+     if isinstance(job_indices, str):
+         job_indices = [job_indices]
+
+     if not job_indices:
+         immediate_future: Future = Future()
+         immediate_future.set_result(([], [], []))
+         return immediate_future
+
+     validated_batch_size = self._validate_batch_size(batch_size)
+     effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+     processor = _ConcurrentProcessor(
+         client=self,
+         batch_size=validated_batch_size,
+         job_indices=job_indices,
+         job_queue_id=job_queue_id,
+         timeout=effective_timeout,
+         max_job_retries=max_job_retries,
+         retry_delay=retry_delay,
+         initial_fetch_delay=initial_fetch_delay,
+         completion_callback=completion_callback,
+         fail_on_submit_error=fail_on_submit_error,
+         stream_to_callback_only=stream_to_callback_only,
+         return_full_response=return_full_response,
+         verbose=verbose,
+         return_traces=return_traces,
+     )
+
+     # Asynchronous call
+     return processor.run_async()
+
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
      """
      Block until all specified jobs have been marked submitted.
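A usage sketch for the new public entry point (illustrative only: the client construction and job specs are assumed to exist already; parameter values are examples, not defaults):

    client = NvIngestClient()
    job_ids = client.add_job(job_specs)  # job_specs assumed built elsewhere
    future = client.process_jobs_concurrently_async(
        job_indices=job_ids,
        job_queue_id="ingest_task_queue",  # hypothetical queue name
        return_traces=True,
    )
    results, failures, traces = future.result(timeout=300)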
@@ -13,6 +13,7 @@ import os
  import shutil
  import tempfile
  import threading
+ from io import BytesIO
  from concurrent.futures import Future
  from concurrent.futures import ThreadPoolExecutor
  from concurrent.futures import as_completed
@@ -224,6 +225,7 @@ class Ingestor:
      **kwargs,
  ):
      self._documents = documents or []
+     self._buffers = []
      self._client = client
      self._job_queue_id = job_queue_id
      self._vdb_bulk_upload = None
@@ -352,6 +354,28 @@ class Ingestor:

  return self

+ def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+     """
+     Add buffers for processing.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         List of tuples containing the name of the buffer and the BytesIO object.
+     """
+     if (
+         isinstance(buffers, tuple)
+         and len(buffers) == 2
+         and isinstance(buffers[0], str)
+         and isinstance(buffers[1], BytesIO)
+     ):
+         buffers = [buffers]
+     self._buffers.extend(buffers)
+     self._job_specs = BatchJobSpec(self._buffers)
+     self._all_local = True
+
+     return self
+
  def load(self, **kwargs) -> "Ingestor":
      """
      Ensure all document files are accessible locally, downloading if necessary.
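The tuple form lets a single `(name, BytesIO)` pair be passed without wrapping it in a list. A short sketch of the new in-memory entry point (hedged: the surrounding pipeline calls are illustrative):

    from io import BytesIO
    from nv_ingest_client.client import Ingestor

    ingestor = Ingestor().buffers([
        ("notes.txt", BytesIO(b"plain text payload")),
        ("report.pdf", BytesIO(pdf_bytes)),  # pdf_bytes assumed loaded elsewhere
    ])
    results = ingestor.extract().ingest()  # typical follow-on calls; adjust to your pipeline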
@@ -669,57 +693,133 @@ class Ingestor:

  return tuple(returns) if len(returns) > 1 else results

- def ingest_async(self, **kwargs: Any) -> Future:
+ def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
      """
      Asynchronously submits jobs and returns a single future that completes when all jobs have finished.

+     The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+     `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+     upload is configured, the future will complete *after* the VDB upload finishes.
+
      Parameters
      ----------
+     return_failures : bool, optional
+         If True, return a tuple containing failures; otherwise, only return results. Default is False.
+     return_traces : bool, optional
+         If True, return trace metrics alongside results. Default is False.
      kwargs : dict
-         Additional parameters for the `submit_job_async` method.
+         Additional parameters passed to the concurrent processor.
+         Optional flags include `include_parent_trace_ids=True` to also return
+         parent job trace identifiers (V2 API only).

      Returns
      -------
-     Future
-         A future that completes when all submitted jobs have reached a terminal state.
+     Future[Union[List[Any], Tuple[Any, ...]]]
+         A future that completes when all jobs and any subsequent VDB upload
+         have finished. Its result will be one of the following:
+         - Default: list of results
+         - return_failures=True: (results, failures)
+         - return_traces=True: (results, traces)
+         - return_failures=True, return_traces=True: (results, failures, traces)
+
      """
-     self._prepare_ingest_run()
+     try:
+         self._prepare_ingest_run()

-     self._job_ids = self._client.add_job(self._job_specs)
+         # Add jobs locally first
+         if self._job_specs is None:
+             raise RuntimeError("Job specs missing for ingest_async.")
+         self._job_ids = self._client.add_job(self._job_specs)
+         self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}

-     future_to_job_id = self._client.submit_job_async(self._job_ids, self._job_queue_id, **kwargs)
-     self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)

-     combined_future = Future()
-     submitted_futures = set(future_to_job_id.keys())
-     completed_futures = set()
-     future_results = []
-     vdb_future = None
+         final_future: Future = Future()

-     def _done_callback(future):
-         job_id = future_to_job_id[future]
-         job_state = self._job_states[job_id]
-         try:
-             result = self._client.fetch_job_result(job_id)
-             if job_state.state != JobStateEnum.COMPLETED:
-                 job_state.state = JobStateEnum.COMPLETED
-         except Exception:
-             result = None
-             if job_state.state != JobStateEnum.FAILED:
-                 job_state.state = JobStateEnum.FAILED
-         completed_futures.add(future)
-         future_results.extend(result)
-         if completed_futures == submitted_futures:
-             combined_future.set_result(future_results)
+         processor_future = self._client.process_jobs_concurrently_async(
+             job_indices=self._job_ids,
+             job_queue_id=self._job_queue_id,
+             return_traces=return_traces,
+             **proc_kwargs,
+         )

-     for future in future_to_job_id:
-         future.add_done_callback(_done_callback)
+         include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))

-     if self._vdb_bulk_upload:
-         executor = ThreadPoolExecutor(max_workers=1)
-         vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
+         def _processor_done_callback(proc_future: Future):
+             """Callback to handle completion, VDB upload, and final result setting."""
+             try:
+                 if proc_future.cancelled():
+                     if not final_future.done():
+                         final_future.cancel()
+                     return
+                 if proc_future.exception():
+                     if not final_future.done():
+                         final_future.set_exception(proc_future.exception())
+                     return
+
+                 results, failures, traces_list = proc_future.result()
+
+                 failed_job_ids = set()
+                 for job_id_with_source, error_msg in failures:
+                     job_id = job_id_with_source.split(":", 1)[0]
+                     if job_id in self._job_states:
+                         if self._job_states[job_id].state != JobStateEnum.FAILED:
+                             self._job_states[job_id].state = JobStateEnum.FAILED
+                     failed_job_ids.add(job_id)
+
+                 all_submitted_job_ids = set(self._job_ids)
+                 successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+                 for job_id in successful_job_ids:
+                     if job_id in self._job_states:
+                         if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+                             self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+                 if self._vdb_bulk_upload and results:
+                     with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+                         results_future = Future()
+                         results_future.set_result(results)
+                         vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+                         vdb_future.result()
+
+                 parent_trace_ids = (
+                     self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+                 )

-     return combined_future if not vdb_future else vdb_future
+                 returns = [results]
+                 if return_failures:
+                     returns.append(failures)
+                 if return_traces:
+                     returns.append(traces_list)
+                 if include_parent_trace_ids:
+                     returns.append(parent_trace_ids)
+
+                 final_result = tuple(returns) if len(returns) > 1 else results
+
+                 if not final_future.done():
+                     final_future.set_result(final_result)
+
+             except Exception as e:
+                 logger.exception("Error in ingest_async processor callback")
+                 if not final_future.done():
+                     final_future.set_exception(e)
+             finally:
+                 final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+                 for job_state in self._job_states.values():
+                     if (
+                         job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+                         and job_state.state != final_state
+                     ):
+                         job_state.state = final_state
+
+         processor_future.add_done_callback(_processor_done_callback)
+         return final_future
+
+     except Exception as setup_err:
+         logger.exception("Failed during synchronous setup of ingest_async")
+         error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+         error_future.set_exception(setup_err)
+         return error_future

  @ensure_job_specs
  def _prepare_ingest_run(self):
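A usage sketch for the reworked method (names illustrative; the unpacking shape follows the flag table in the docstring above):

    ingestor = Ingestor().files("data/*.pdf").extract()  # setup assumed
    future = ingestor.ingest_async(return_failures=True, return_traces=True)
    results, failures, traces = future.result()  # resolves after jobs and any VDB upload finish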
@@ -10,6 +10,7 @@ from typing import Dict
  from typing import List
  from typing import Optional
  from typing import Union
+ from typing import Tuple
  from uuid import UUID

  from nv_ingest_client.primitives.tasks import Task
@@ -222,7 +223,9 @@ class BatchJobSpec:
      A dictionary that maps document types to a list of `JobSpec` instances.
  """

- def __init__(self, job_specs_or_files: Optional[Union[List[JobSpec], List[str]]] = None) -> None:
+ def __init__(
+     self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+ ) -> None:
      """
      Initializes the BatchJobSpec instance.

@@ -239,6 +242,13 @@ class BatchJobSpec:
      self.from_job_specs(job_specs_or_files)
  elif isinstance(job_specs_or_files[0], str):
      self.from_files(job_specs_or_files)
+ elif (
+     isinstance(job_specs_or_files[0], tuple)
+     and len(job_specs_or_files[0]) == 2
+     and isinstance(job_specs_or_files[0][0], str)
+     and isinstance(job_specs_or_files[0][1], BytesIO)
+ ):
+     self.from_buffers(job_specs_or_files)
  else:
      raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")

@@ -282,6 +292,21 @@ class BatchJobSpec:
      for job_spec in job_specs:
          self.add_job_spec(job_spec)

+ def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+     """
+     Initializes the batch from a list of buffers.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         A list of tuples containing the name of the buffer and the BytesIO object.
+     """
+     from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+     job_specs = create_job_specs_for_buffers(buffers)
+     for job_spec in job_specs:
+         self.add_job_spec(job_spec)
+
  def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
      """
      Internal method to initialize the batch from a dataset.
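With `from_buffers` wired into the constructor dispatch above, all three accepted input forms route correctly; a sketch (import path assumed from the package layout):

    from io import BytesIO
    from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec

    batch = BatchJobSpec([("readme.md", BytesIO(b"# hello"))])  # tuple form -> from_buffers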
@@ -145,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:

  logger.debug(f"Content extracted from '{path}'")
  return content, DocumentTypeEnum(document_type)
+
+
+ def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+     """
+     Extracts the content and type from a buffer.
+     """
+     document_type = get_or_infer_file_type(buffer[0])
+     try:
+         if document_type in [
+             DocumentTypeEnum.TXT,
+             DocumentTypeEnum.MD,
+             DocumentTypeEnum.HTML,
+         ]:
+             content = detect_encoding_and_read_text_file(buffer[1])
+         else:
+             content = serialize_to_base64(buffer[1])
+     except Exception as e:
+         logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+         raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+     logger.debug(f"Content extracted from '{buffer[0]}'")
+     return content, DocumentTypeEnum(document_type)
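Behavior in brief: text-like document types (TXT, MD, HTML) are decoded to text, everything else is serialized to base64. A sketch (assuming the helpers infer the type from the buffer name):

    from io import BytesIO

    content, doc_type = extract_content_from_buffer(("notes.txt", BytesIO(b"hello")))
    # doc_type is DocumentTypeEnum.TXT here; content is decoded text, not base64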
@@ -12,10 +12,12 @@ import math
  import heapq
  from typing import Dict
  from typing import List
+ from typing import Tuple
+ from io import BytesIO

  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
  from nv_ingest_client.primitives.jobs.job_spec import JobSpec
- from nv_ingest_client.util.file_processing.extract import extract_file_content
+ from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer

  logger = logging.getLogger(__name__)

@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
  return job_specs


+ def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
+     """
+     Create job specifications (JobSpecs) for a list of buffers.
+
+     This function takes a list of buffers, processes each buffer to extract its content and type,
+     and creates a job specification (JobSpec) for each buffer.
+
+     Parameters
+     ----------
+     buffers : List[Tuple[str, BytesIO]]
+         A list of tuples containing the name of the buffer and the BytesIO object.
+
+     Returns
+     -------
+     List[JobSpec]
+         A list of JobSpecs.
+     """
+
+     job_specs = []
+     for name, buffer in buffers:
+         content, file_type = extract_content_from_buffer((name, buffer))
+         job_spec = JobSpec(
+             document_type=file_type,
+             payload=content,
+             source_id=name,
+             source_name=name,
+         )
+         job_specs.append(job_spec)
+
+     return job_specs
+
+
  def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
      """
      Apply PDF split configuration to a list of JobSpec objects.
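A sketch of the helper in isolation (buffer names and payloads illustrative):

    from io import BytesIO

    specs = create_job_specs_for_buffers([
        ("a.txt", BytesIO(b"alpha")),
        ("b.txt", BytesIO(b"beta")),
    ])
    # two JobSpecs, with source_id/source_name taken from the buffer names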
@@ -1,27 +1,243 @@
  from abc import ABC, abstractmethod


+ """Abstract Vector Database (VDB) operator API.
+
+ This module defines the `VDB` abstract base class which specifies the
+ interface that custom vector-database operators must implement to integrate
+ with NV-Ingest.
+
+ The implementation details and an example OpenSearch operator are described
+ in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+ production-ready OpenSearch implementation is available at
+ `client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+ Design goals:
+ - Provide a small, well-documented interface that supports common vector
+   database operations: index creation, batch ingestion, nearest-neighbor
+   retrieval, and a simple `run` orchestration entry-point used by the
+   NV-Ingest pipeline.
+ - Keep the API flexible by accepting `**kwargs` on methods so implementers can
+   pass database-specific options without changing the interface.
+
+ Typical implementation notes (inferred from the example OpenSearch operator):
+ - Constructor accepts connection and index configuration parameters such as
+   `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+   types (e.g. `enable_text`, `enable_images`).
+ - `create_index` should be able to create (and optionally recreate) an
+   index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+ - `write_to_index` should accept batches of NV-Ingest records, perform
+   validation/transformation, and write documents into the database efficiently
+   (bulk APIs are recommended).
+ - `retrieval` should accept a list of textual queries, convert them to
+   embeddings (by calling an external embedding service or model), perform a
+   vector search (top-k), and return cleaned results (e.g., removing stored
+   dense vectors from returned payloads).
+ """
+
+
  class VDB(ABC):
+     """Abstract base class for Vector Database operators.
+
+     Subclasses must implement the abstract methods below. The interface is
+     intentionally small and uses `**kwargs` to allow operator-specific
+     configuration without changing the common API.
+
+     Example (high level):
+
+         class OpenSearch(VDB):
+             def __init__(self, **kwargs):
+                 # parse kwargs, initialize client, call super().__init__(**kwargs)
+                 ...
+
+             def create_index(self, **kwargs):
+                 # create index, mappings, settings
+                 ...
+
+             def write_to_index(self, records: list, **kwargs):
+                 # transform NV-Ingest records and write to database
+                 ...
+
+             def retrieval(self, queries: list, **kwargs):
+                 # convert queries to embeddings, k-NN search, format results
+                 ...
+
+             def run(self, records):
+                 # orchestrate create_index + write_to_index
+                 ...
+
+     Notes on recommended constructor parameters (not enforced by this ABC):
+     - host (str): database hostname (default: 'localhost')
+     - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+     - index_name (str): base index name used by the operator
+     - dense_dim (int): dimensionality of stored dense embeddings
+     - enable_text/enable_images/... (bool): content-type toggles used when
+       extracting text from NV-Ingest records before indexing
+
+     The concrete operator may accept additional parameters (username,
+     password, ssl options, client-specific flags). Passing these via
+     `**kwargs` is the intended pattern.
+     """

      @abstractmethod
      def __init__(self, **kwargs):
+         """Initialize the VDB operator.
+
+         Implementations should extract configuration values from `kwargs`
+         (or use defaults) and initialize any client connections required to
+         talk to the target vector database. Implementations are encouraged to
+         call `super().__init__(**kwargs)` only if they want the base-class
+         behavior of storing kwargs on the instance (the base class itself does
+         not require that behavior).
+
+         Parameters (suggested/common):
+         - host (str): database host
+         - port (int): database port
+         - index_name (str): base name for created indices
+         - dense_dim (int): embedding vector dimension
+         - enable_text (bool): whether text content should be extracted/indexed
+         - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+         The constructor should not perform heavy operations (like creating
+         indices) unless explicitly desired; prefer leaving that work to
+         `create_index` to make the operator easier to test.
+         """
          self.__dict__.update(kwargs)

      @abstractmethod
      def create_index(self, **kwargs):
+         """Create and configure the index(es) required by this operator.
+
+         Implementations must ensure an appropriate index (or indices) exist
+         before data ingestion. For vector indexes this typically means
+         creating settings and mappings that enable k-NN/vector search (for
+         example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+         engine-specific parameters).
+
+         Common keyword arguments (operator-specific):
+         - recreate (bool): if True, delete and recreate the index even if it
+           already exists (default: False)
+         - index_name (str): override the operator's configured index name for
+           this call
+
+         Returns:
+             implementation-specific result (e.g., a boolean, the created
+             index name, or the raw response from the database client). There
+             is no strict requirement here because different DB clients return
+             different values; document behavior in concrete implementations.
+         """
          pass

      @abstractmethod
      def write_to_index(self, records: list, **kwargs):
+         """Write a batch of NV-Ingest records to the vector database.
+
+         This method receives `records` formatted as NV-Ingest provides them
+         (commonly a list of record-sets). Implementations are responsible for
+         transforming each record into the target database document format,
+         validating the presence of embeddings and content, and using the most
+         efficient ingestion API available (for example a bulk endpoint).
+
+         Expected behavior:
+         - Iterate over the provided `records` (which can be nested lists of
+           record dictionaries) and transform each record to the DB document
+           structure (fields such as `dense` for the vector, `text` for the
+           content, and `metadata` for auxiliary fields are common in the
+           repository examples).
+         - Skip records missing required fields (for example, missing
+           embeddings) and log or report failures as appropriate.
+         - Use batching / bulk APIs to reduce overhead when writing large
+           volumes of documents.
+
+         Parameters:
+         - records (list): NV-Ingest records (see repository examples for
+           structure)
+         - batch_size (int, optional): how many documents to send per bulk
+           request; database-specific implementations can use this hint
+
+         Returns:
+             implementation-specific result (e.g., number of documents
+             indexed, client response for bulk API). Concrete implementations
+             should document exact return values and failure semantics.
+         """
          pass

      @abstractmethod
      def retrieval(self, queries: list, **kwargs):
+         """Perform similarity search for a list of text queries.
+
+         The typical retrieval flow implemented by operators in this ecosystem
+         is:
+         1. Convert each textual `query` into a dense embedding using an
+            external embedding model or service (the example uses an NVIDIA
+            embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+         2. Issue a vector (k-NN) search to the database using the generated
+            embedding, requesting the top-k (configurable) neighbors.
+         3. Post-process results (for example, remove stored dense vectors
+            from returned documents to reduce payload size) and return a
+            list-of-lists of result documents aligned with the input `queries`.
+
+         Keyword arguments (common):
+         - index_name (str): index to search (default: operator's configured
+           index_name)
+         - top_k (int): number of nearest neighbors to return (default: 10)
+         - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+           when the operator integrates with an external embedding service.
+
+         Parameters:
+         - queries (list[str]): list of text queries to be vectorized and
+           searched
+
+         Returns:
+         - results (list[list[dict]]): for each query, a list of hit documents
+           (concrete implementations should specify the document shape they
+           return). Operators should remove large binary/vector fields from
+           responses where possible.
+         """
          pass

      @abstractmethod
      def run(self, records):
+         """Main entry point used by the NV-Ingest pipeline.
+
+         The `run` method is intended to be a simple orchestration layer that
+         ensures the index exists and then ingests provided records. A minimal
+         recommended implementation is::
+
+             def run(self, records):
+                 self.create_index()
+                 self.write_to_index(records)
+
+         Implementers can add pre/post hooks, metrics, retries, or error
+         handling as needed for production readiness. Keep `run` simple so the
+         pipeline orchestration remains predictable.
+
+         Parameters:
+         - records: NV-Ingest records to index (format follows repository
+           conventions)
+
+         Returns:
+         - implementation-specific result (for example, a summary dict or
+           boolean success flag).
+         """
          pass

      def reindex(self, records: list, **kwargs):
+         """Optional helper to rebuild or re-populate indexes with new data.
+
+         This non-abstract method is provided as an optional hook that concrete
+         classes may override. A typical reindex implementation will:
+         - optionally delete the existing index and recreate it (via
+           `create_index(recreate=True)`)
+         - call `write_to_index(records)` to populate the new index
+
+         Parameters:
+         - records (list): records used to populate the index
+         - recreate (bool, optional): whether to delete and recreate the
+           index before writing
+
+         Returns:
+         - implementation-specific result
+         """
          pass
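To make the contract concrete, here is a minimal in-memory operator that satisfies the ABC (a toy sketch for illustration only; the record shape with a `metadata.embedding` field is an assumption based on the notes above, not a guarantee of this module):

    class InMemoryVDB(VDB):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)  # stores kwargs on the instance
            self._docs = []

        def create_index(self, **kwargs):
            # "recreate" mirrors the optional flag described in the docstrings above
            if kwargs.get("recreate", False):
                self._docs = []
            return True

        def write_to_index(self, records: list, **kwargs):
            # flatten nested record-sets and keep only records with embeddings
            for record_set in records:
                for record in record_set:
                    if record.get("metadata", {}).get("embedding") is not None:
                        self._docs.append(record)
            return len(self._docs)

        def retrieval(self, queries: list, **kwargs):
            top_k = kwargs.get("top_k", 10)
            # no real vector math here; return the first top_k docs per query
            return [self._docs[:top_k] for _ in queries]

        def run(self, records):
            self.create_index()
            self.write_to_index(records)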
@@ -892,7 +892,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
  logger.info(f"streamed {count} records")


- def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
+ def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
      """
      This function waits for the index to be built. It checks
      the indexed_rows of the index and waits for it to be equal
@@ -901,32 +901,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
      (refer to MilvusClient.refresh_load for bulk inserts).
      """
      client.flush(collection_name)
-     # index_names = utility.list_indexes(collection_name)
      indexed_rows = 0
      # observe dense_index, all indexes get populated simultaneously
-     for index_name in [DENSE_INDEX_NAME]:
-         indexed_rows = 0
-         expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
-         while indexed_rows < expected_rows:
-             pos_movement = 10  # number of iterations allowed without noticing an increase in indexed_rows
+     for index_name, rows_expected in expected_rows_dict.items():
+         indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+         while indexed_rows < rows_expected:
+             # 0.5% of rows expected allowed without noticing an increase in indexed_rows
+             pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
              for i in range(20):
-                 current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+                 prev_indexed_rows = indexed_rows
+                 indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                  time.sleep(1)
-                 logger.info(
-                     f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
-                 )
-                 if current_indexed_rows == expected_rows:
-                     indexed_rows = current_indexed_rows
+                 logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
+                 if indexed_rows == rows_expected:
                      break
                  # check if indexed_rows is staying the same; too many stagnant checks means something is wrong
-                 if current_indexed_rows == indexed_rows:
+                 if indexed_rows == prev_indexed_rows:
                      pos_movement -= 1
                  else:
-                     pos_movement = 10
+                     pos_movement = start_pos_movement
                  # if pos_movement hits 0, raise an error: the rows are not getting indexed as expected
                  if pos_movement == 0:
                      raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
-             indexed_rows = current_indexed_rows
      return indexed_rows

@@ -1046,6 +1042,13 @@ def write_to_nvingest_collection(
  if num_elements < threshold:
      stream = True
  if stream:
+     # must be accessed/saved before adding new records
+     index_names = utility.list_indexes(collection_name)
+     expected_rows = {}
+     for index_name in index_names:
+         expected_rows[index_name] = (
+             int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
+         )
      stream_insert_milvus(
          cleaned_records,
          client,
@@ -1054,7 +1057,7 @@ def write_to_nvingest_collection(
      if not local_index:
          # Make sure all rows are indexed; decided not to wrap in a timeout because we don't
          # know how long this should take, it is num_elements dependent.
-         wait_for_index(collection_name, num_elements, client)
+         wait_for_index(collection_name, expected_rows, client)
  else:
      minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
      bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
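To make the new stall budget concrete: with 100,000 rows still expected, `pos_movement` starts at max(100000 * 0.005, 10) = 500 stagnant polls, while a nearly finished index (say 1,000 rows remaining) falls back to the floor of 10, matching the old fixed budget:

    rows_expected, indexed_rows = 100_000, 0
    assert max((rows_expected - indexed_rows) * 0.005, 10) == 500
    assert max((1_000 - 0) * 0.005, 10) == 10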
@@ -2005,6 +2008,12 @@ class Milvus(VDB):
      """
      kwargs = locals().copy()
      kwargs.pop("self", None)
+     bucket_name = kwargs.get("bucket_name", None)
+     if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
+         raise ValueError(
+             "You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
+             f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
+         )
      super().__init__(**kwargs)

  def create_index(self, **kwargs):
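In practice a custom bucket must now come from the environment rather than the constructor; a sketch (assuming `ClientConfigSchema` reads `MINIO_BUCKET` when instantiated):

    import os
    os.environ["MINIO_BUCKET"] = "my-ingest-bucket"  # set before the client config is built
    milvus = Milvus()  # passing bucket_name="my-ingest-bucket" would also pass the check, since it matches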
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nv-ingest-client
- Version: 2025.11.17.dev20251117
+ Version: 2025.11.27.dev20251127
  Summary: Python client for the nv-ingest service
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
  License: Apache License
@@ -6,13 +6,13 @@ nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T
  nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
  nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
  nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
- nv_ingest_client/client/client.py,sha256=3uA54D4Y6lSS-Nvz8R8uzkHkoV8vJu8GPQQRPoc-Uxk,77368
+ nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
  nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
- nv_ingest_client/client/interface.py,sha256=Y6JnjaRytlBrhgbU6MJYm2dblLvoYxWEB35TETZDSwk,55022
+ nv_ingest_client/client/interface.py,sha256=XQ2hHNBsL-Nnsk_w48UMxFqxfkO0CdQ2AOQZEdXU3OA,59990
  nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
  nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
  nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
- nv_ingest_client/primitives/jobs/job_spec.py,sha256=TBz5u7KRdQjQvqD0mMzwjTK9Jl3p7yTIknQQs0lfnV8,15909
+ nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
  nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
  nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
  nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
@@ -40,17 +40,17 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
  nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
  nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
- nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
+ nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
  nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
  nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- nv_ingest_client/util/file_processing/extract.py,sha256=Hjtem4bJWum1bbUPw7_TG-0Z2-7PsH4bBuqTF7bLn88,4794
+ nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
- nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
- nv_ingest_client/util/vdb/milvus.py,sha256=LHZ4Z6fHk8vQUGQFJ3FZ5iay0Ike6Zur-K9yMiPxe44,80141
+ nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
+ nv_ingest_client/util/vdb/milvus.py,sha256=jCQyWb6xoQ6utGNccASmN09eJbwF2HlgrGGIkpoUfI8,80792
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
- nv_ingest_client-2025.11.17.dev20251117.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- nv_ingest_client-2025.11.17.dev20251117.dist-info/METADATA,sha256=bgCG3WP30zjURzJ_SZEm3fDbby-NoICZDYfbiA3sSjg,30627
- nv_ingest_client-2025.11.17.dev20251117.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- nv_ingest_client-2025.11.17.dev20251117.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
- nv_ingest_client-2025.11.17.dev20251117.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
- nv_ingest_client-2025.11.17.dev20251117.dist-info/RECORD,,
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/METADATA,sha256=jul59WHL8-9IYR27iL9ilxkw7IQRnqb7EMqBfJh7IGk,30627
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+ nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD,,