nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.12.17.dev20251217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/client.py +112 -2
- nv_ingest_client/client/interface.py +301 -83
- nv_ingest_client/nv_ingest_cli.py +2 -2
- nv_ingest_client/primitives/jobs/job_spec.py +26 -1
- nv_ingest_client/primitives/tasks/caption.py +12 -1
- nv_ingest_client/primitives/tasks/extract.py +50 -2
- nv_ingest_client/primitives/tasks/store.py +18 -13
- nv_ingest_client/util/file_processing/extract.py +23 -0
- nv_ingest_client/util/util.py +34 -1
- nv_ingest_client/util/vdb/adt_vdb.py +216 -0
- nv_ingest_client/util/vdb/lancedb.py +276 -0
- nv_ingest_client/util/vdb/milvus.py +44 -21
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA +2 -1
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD +18 -17
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/top_level.txt +0 -0

nv_ingest_client/primitives/jobs/job_spec.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Union
+from typing import Tuple
 from uuid import UUID
 
 from nv_ingest_client.primitives.tasks import Task
@@ -222,7 +223,9 @@ class BatchJobSpec:
         A dictionary that maps document types to a list of `JobSpec` instances.
     """
 
-    def __init__(
+    def __init__(
+        self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+    ) -> None:
         """
         Initializes the BatchJobSpec instance.
 
@@ -239,6 +242,13 @@ class BatchJobSpec:
            self.from_job_specs(job_specs_or_files)
         elif isinstance(job_specs_or_files[0], str):
             self.from_files(job_specs_or_files)
+        elif (
+            isinstance(job_specs_or_files[0], tuple)
+            and len(job_specs_or_files[0]) == 2
+            and isinstance(job_specs_or_files[0][0], str)
+            and isinstance(job_specs_or_files[0][1], BytesIO)
+        ):
+            self.from_buffers(job_specs_or_files)
         else:
             raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")
 
@@ -282,6 +292,21 @@ class BatchJobSpec:
         for job_spec in job_specs:
             self.add_job_spec(job_spec)
 
+    def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+        """
+        Initializes the batch from a list of buffers.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            A list of tuples containing the name of the buffer and the BytesIO object.
+        """
+        from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+        job_specs = create_job_specs_for_buffers(buffers)
+        for job_spec in job_specs:
+            self.add_job_spec(job_spec)
+
     def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
         """
         Internal method to initialize the batch from a dataset.
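
For reference, a minimal sketch of the new buffer-based path; the (name, BytesIO) tuple form and import path are taken from this diff, while the buffer contents are illustrative:

    from io import BytesIO

    from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec

    # Each buffer is a (name, BytesIO) pair; the name drives file-type inference.
    buffers = [("notes.txt", BytesIO(b"hello nv-ingest"))]
    batch = BatchJobSpec(buffers)  # dispatches to the new from_buffers()
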
nv_ingest_client/primitives/tasks/caption.py
CHANGED

@@ -22,18 +22,24 @@ class CaptionTask(Task):
         api_key: str = None,
         endpoint_url: str = None,
         prompt: str = None,
+        system_prompt: str = None,
         model_name: str = None,
     ) -> None:
         super().__init__()
 
         # Use the API schema for validation
         validated_data = IngestTaskCaptionSchema(
-            api_key=api_key,
+            api_key=api_key,
+            endpoint_url=endpoint_url,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            model_name=model_name,
         )
 
         self._api_key = validated_data.api_key
         self._endpoint_url = validated_data.endpoint_url
         self._prompt = validated_data.prompt
+        self._system_prompt = validated_data.system_prompt
         self._model_name = validated_data.model_name
 
     def __str__(self) -> str:
@@ -49,6 +55,8 @@ class CaptionTask(Task):
             info += f"  endpoint_url: {self._endpoint_url}\n"
         if self._prompt:
             info += f"  prompt: {self._prompt}\n"
+        if self._system_prompt:
+            info += f"  system_prompt: {self._system_prompt}\n"
         if self._model_name:
             info += f"  model_name: {self._model_name}\n"
 
@@ -69,6 +77,9 @@ class CaptionTask(Task):
         if self._prompt:
             task_properties["prompt"] = self._prompt
 
+        if self._system_prompt:
+            task_properties["system_prompt"] = self._system_prompt
+
         if self._model_name:
             task_properties["model_name"] = self._model_name
 
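
A short usage sketch of the new system_prompt parameter; the constructor arguments are as shown in this diff, and the prompt strings are illustrative:

    from nv_ingest_client.primitives.tasks.caption import CaptionTask

    task = CaptionTask(
        prompt="Describe this image in one sentence.",           # illustrative
        system_prompt="You are a concise technical captioner.",  # new in this release
    )
    print(task)  # __str__ now includes a system_prompt line when it is set
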
nv_ingest_client/primitives/tasks/extract.py
CHANGED

@@ -8,6 +8,8 @@
 
 import logging
 import os
+import warnings
+from typing import get_args
 from typing import Any
 from typing import Dict
 from typing import Literal
@@ -52,15 +54,27 @@ _DEFAULT_EXTRACTOR_MAP = {
 
 _Type_Extract_Method_PDF = Literal[
     "adobe",
-    "nemoretriever_parse",
+    "nemotron_parse",
     "haystack",
     "llama_parse",
     "pdfium",
     "tika",
     "unstructured_io",
+    "unstructured_local",
+    "pdfium_hybrid",
     "ocr",
 ]
 
+_Type_Extract_Method_DOCX = Literal[
+    "python_docx",
+    "render_as_pdf",
+]
+
+_Type_Extract_Method_PPTX = Literal[
+    "python_pptx",
+    "render_as_pdf",
+]
+
 _Type_Extract_Images_Method = Literal["group", "yolox"]
 
 _Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
@@ -74,7 +88,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method:
+        extract_method: Optional[str] = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -109,6 +123,12 @@ class ExtractTask(Task):
             )
             extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
 
+        if extract_method == "nemoretriever_parse":
+            logger.warning("'nemoretriever_parse' is deprecated. Please use 'nemotron_parse' instead.")
+            extract_method = "nemotron_parse"
+
+        self._validate_extract_method(document_type, extract_method)
+
         # Set default extract_charts if None
         if extract_charts is None:
             extract_charts = extract_tables
@@ -240,3 +260,31 @@ class ExtractTask(Task):
     @property
     def document_type(self):
         return self._document_type.value
+
+    def _validate_extract_method(self, document_type: str, extract_method: str):
+        doc_type = document_type.lower()
+
+        valid_docx = set(get_args(_Type_Extract_Method_DOCX))
+        valid_pptx = set(get_args(_Type_Extract_Method_PPTX))
+        valid_pdf = set(get_args(_Type_Extract_Method_PDF))
+
+        if doc_type == "docx" and extract_method not in valid_docx:
+            raise ValueError(f"'{extract_method}' is invalid for DOCX. Options: {valid_docx}")
+
+        elif doc_type == "pptx" and extract_method not in valid_pptx:
+            raise ValueError(f"'{extract_method}' is invalid for PPTX. Options: {valid_pptx}")
+
+        elif doc_type == "pdf" and extract_method not in valid_pdf:
+            raise ValueError(f"'{extract_method}' is invalid for PDF. Options: {valid_pdf}")
+
+        elif doc_type not in ["docx", "pptx", "pdf"]:
+            is_docx_method = extract_method in valid_docx
+            is_pptx_method = extract_method in valid_pptx
+            is_pdf_method = extract_method in valid_pdf
+
+            if (is_docx_method or is_pptx_method) and not is_pdf_method:
+                warnings.warn(
+                    f"extract_method '{extract_method}' is valid for Office documents but NOT for PDFs. "
+                    "If your batch includes PDFs, extraction may fail for those files. "
+                    "Consider leaving extract_method=None for mixed batches."
+                )
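
A sketch of how the renamed method and the new per-document-type validation behave, based on the logic in these hunks; the import path is inferred from the file layout, and passing document_type as a plain string is assumed:

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    # The old name still works, but logs a deprecation warning and is
    # mapped to "nemotron_parse" internally.
    task = ExtractTask(document_type="pdf", extract_method="nemoretriever_parse")

    # A DOCX-only method on a PDF now fails fast with a ValueError.
    try:
        ExtractTask(document_type="pdf", extract_method="python_docx")
    except ValueError as err:
        print(err)
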
nv_ingest_client/primitives/tasks/store.py
CHANGED

@@ -7,8 +7,7 @@
 # pylint: disable=too-many-arguments
 
 import logging
-from typing import Dict
-from typing import Literal
+from typing import Dict, Literal, Optional
 
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
@@ -17,23 +16,19 @@ from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_STORE_METHOD = "minio"
-
 
 class StoreTask(Task):
     """
     Object for image storage task.
     """
 
-    _Type_Content_Type = Literal["image",]
-
-    _Type_Store_Method = Literal["minio",]
-
     def __init__(
         self,
         structured: bool = True,
         images: bool = False,
-
+        storage_uri: Optional[str] = None,
+        storage_options: Optional[dict] = None,
+        public_base_url: Optional[str] = None,
         params: dict = None,
         **extra_params,
     ) -> None:
@@ -51,12 +46,19 @@ class StoreTask(Task):
 
         # Use the API schema for validation
         validated_data = IngestTaskStoreSchema(
-            structured=structured,
+            structured=structured,
+            images=images,
+            storage_uri=storage_uri,
+            storage_options=storage_options or {},
+            public_base_url=public_base_url,
+            params=merged_params,
         )
 
         self._structured = validated_data.structured
         self._images = validated_data.images
-        self.
+        self._storage_uri = validated_data.storage_uri
+        self._storage_options = validated_data.storage_options
+        self._public_base_url = validated_data.public_base_url
         self._params = validated_data.params
         self._extra_params = extra_params
 
@@ -68,7 +70,8 @@ class StoreTask(Task):
         info += "Store Task:\n"
         info += f"  store structured types: {self._structured}\n"
         info += f"  store image types: {self._images}\n"
-        info += f"
+        info += f"  storage uri: {self._storage_uri}\n"
+        info += f"  public base url: {self._public_base_url}\n"
         for key, value in self._extra_params.items():
             info += f"  {key}: {value}\n"
         for key, value in self._params.items():
@@ -81,9 +84,11 @@ class StoreTask(Task):
         """
 
         task_properties = {
-            "method": self._store_method,
             "structured": self._structured,
             "images": self._images,
+            "storage_uri": self._storage_uri,
+            "storage_options": self._storage_options,
+            "public_base_url": self._public_base_url,
            "params": self._params,
             **self._extra_params,
         }
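
A sketch of the new storage parameters; the parameter names come from this diff, while the URI, options, and URL values are placeholders:

    from nv_ingest_client.primitives.tasks.store import StoreTask

    task = StoreTask(
        structured=True,
        images=True,
        storage_uri="s3://example-bucket/nv-ingest",  # placeholder URI
        storage_options={"anon": False},              # backend-specific options (placeholder)
        public_base_url="https://cdn.example.com",    # placeholder
    )
    # The serialized task properties now carry storage_uri, storage_options,
    # and public_base_url instead of the removed "method" field.
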
nv_ingest_client/util/file_processing/extract.py
CHANGED

@@ -145,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:
 
     logger.debug(f"Content extracted from '{path}'")
     return content, DocumentTypeEnum(document_type)
+
+
+def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+    """
+    Extracts the content and type from a buffer.
+    """
+    document_type = get_or_infer_file_type(buffer[0])
+    try:
+        if document_type in [
+            DocumentTypeEnum.TXT,
+            DocumentTypeEnum.MD,
+            DocumentTypeEnum.HTML,
+        ]:
+            content = detect_encoding_and_read_text_file(buffer[1])
+        else:
+            content = serialize_to_base64(buffer[1])
+    except Exception as e:
+        logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+        raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+    logger.debug(f"Content extracted from '{buffer[0]}'")
+    return content, DocumentTypeEnum(document_type)
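
A quick sketch of the new helper; the signature is taken from this diff, and the buffer contents are illustrative:

    from io import BytesIO

    from nv_ingest_client.util.file_processing.extract import extract_content_from_buffer

    content, doc_type = extract_content_from_buffer(("readme.md", BytesIO(b"# Title")))
    # Text-like types (TXT, MD, HTML) are decoded as text; everything else
    # is serialized to base64.
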
nv_ingest_client/util/util.py
CHANGED

@@ -12,10 +12,12 @@ import math
 import heapq
 from typing import Dict
 from typing import List
+from typing import Tuple
+from io import BytesIO
 
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 from nv_ingest_client.primitives.jobs.job_spec import JobSpec
-from nv_ingest_client.util.file_processing.extract import extract_file_content
+from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer
 
 logger = logging.getLogger(__name__)
 
@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     return job_specs
 
 
+def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
+    """
+    Create job specifications (JobSpecs) for a list of buffers.
+    This function takes a list of buffers, processes each buffer to extract its content and type,
+    and creates a job specification (JobSpec) for each buffer.
+
+    Parameters
+    ----------
+    buffers : List[Tuple[str, BytesIO]]
+        A list of tuples containing the name of the buffer and the BytesIO object.
+
+    Returns
+    -------
+    List[JobSpec]
+        A list of JobSpecs.
+    """
+
+    job_specs = []
+    for name, buffer in buffers:
+        content, file_type = extract_content_from_buffer((name, buffer))
+        job_spec = JobSpec(
+            document_type=file_type,
+            payload=content,
+            source_id=name,
+            source_name=name,
+        )
+        job_specs.append(job_spec)
+
+    return job_specs
+
+
 def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
     """
     Apply PDF split configuration to a list of JobSpec objects.
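
The lower-level helper can also be used directly when JobSpecs are wanted without a BatchJobSpec; a minimal sketch with illustrative payload contents:

    from io import BytesIO

    from nv_ingest_client.util.util import create_job_specs_for_buffers

    specs = create_job_specs_for_buffers([("notes.txt", BytesIO(b"hello"))])
    assert len(specs) == 1  # one JobSpec per (name, buffer) pair
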
nv_ingest_client/util/vdb/adt_vdb.py
CHANGED

@@ -1,27 +1,243 @@
 from abc import ABC, abstractmethod
 
 
+"""Abstract Vector Database (VDB) operator API.
+
+This module defines the `VDB` abstract base class which specifies the
+interface that custom vector-database operators must implement to integrate
+with NV-Ingest.
+
+The implementation details and an example OpenSearch operator are described
+in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+production-ready OpenSearch implementation is available at
+`client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+Design goals:
+- Provide a small, well-documented interface that supports common vector
+  database operations: index creation, batch ingestion, nearest-neighbor
+  retrieval, and a simple `run` orchestration entry-point used by the
+  NV-Ingest pipeline.
+- Keep the API flexible by accepting `**kwargs` on methods so implementers can
+  pass database-specific options without changing the interface.
+
+Typical implementation notes (inferred from the example OpenSearch operator):
+- Constructor accepts connection and index configuration parameters such as
+  `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+  types (e.g. `enable_text`, `enable_images`).
+- `create_index` should be able to create (and optionally recreate) an
+  index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+- `write_to_index` should accept batches of NV-Ingest records, perform
+  validation/transformation, and write documents into the database efficiently
+  (bulk APIs are recommended).
+- `retrieval` should accept a list of textual queries, convert them to
+  embeddings (by calling an external embedding service or model), perform a
+  vector search (top-k), and return cleaned results (e.g., removing stored
+  dense vectors from returned payloads).
+
+"""
+
+
 class VDB(ABC):
+    """Abstract base class for Vector Database operators.
+
+    Subclasses must implement the abstract methods below. The interface is
+    intentionally small and uses `**kwargs` to allow operator-specific
+    configuration without changing the common API.
+
+    Example (high level):
+
+        class OpenSearch(VDB):
+            def __init__(self, **kwargs):
+                # parse kwargs, initialize client, call super().__init__(**kwargs)
+                ...
+
+            def create_index(self, **kwargs):
+                # create index, mappings, settings
+                ...
+
+            def write_to_index(self, records: list, **kwargs):
+                # transform NV-Ingest records and write to database
+                ...
+
+            def retrieval(self, queries: list, **kwargs):
+                # convert queries to embeddings, k-NN search, format results
+                ...
+
+            def run(self, records):
+                # orchestrate create_index + write_to_index
+                ...
+
+    Notes on recommended constructor parameters (not enforced by this ABC):
+    - host (str): database hostname (default: 'localhost')
+    - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+    - index_name (str): base index name used by the operator
+    - dense_dim (int): dimensionality of stored dense embeddings
+    - enable_text/enable_images/... (bool): content-type toggles used when
+      extracting text from NV-Ingest records before indexing
+
+    The concrete operator may accept additional parameters (username,
+    password, ssl options, client-specific flags). Passing these via
+    `**kwargs` is the intended pattern.
+    """
 
     @abstractmethod
     def __init__(self, **kwargs):
+        """Initialize the VDB operator.
+
+        Implementations should extract configuration values from `kwargs`
+        (or use defaults) and initialize any client connections required to
+        talk to the target vector database. Implementations are encouraged to
+        call `super().__init__(**kwargs)` only if they want the base-class
+        behavior of storing kwargs on the instance (the base class itself does
+        not require that behavior).
+
+        Parameters (suggested/common):
+        - host (str): database host
+        - port (int): database port
+        - index_name (str): base name for created indices
+        - dense_dim (int): embedding vector dimension
+        - enable_text (bool): whether text content should be extracted/indexed
+        - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+        The constructor should not perform heavy operations (like creating
+        indices) unless explicitly desired; prefer leaving that work to
+        `create_index` to make the operator easier to test.
+        """
         self.__dict__.update(kwargs)
 
     @abstractmethod
     def create_index(self, **kwargs):
+        """Create and configure the index(es) required by this operator.
+
+        Implementations must ensure an appropriate index (or indices) exist
+        before data ingestion. For vector indexes this typically means
+        creating settings and mappings that enable k-NN/vector search (for
+        example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+        engine-specific parameters).
+
+        Common keyword arguments (operator-specific):
+        - recreate (bool): if True, delete and recreate the index even if it
+          already exists (default: False)
+        - index_name (str): override the operator's configured index name for
+          this call
+
+        Returns:
+            implementation-specific result (e.g., a boolean, the created
+            index name, or the raw response from the database client). There
+            is no strict requirement here because different DB clients return
+            different values; document behavior in concrete implementations.
+        """
         pass
 
     @abstractmethod
     def write_to_index(self, records: list, **kwargs):
+        """Write a batch of NV-Ingest records to the vector database.
+
+        This method receives `records` formatted as NV-Ingest provides them
+        (commonly a list of record-sets). Implementations are responsible for
+        transforming each record into the target database document format,
+        validating the presence of embeddings and content, and using the most
+        efficient ingestion API available (for example a bulk endpoint).
+
+        Expected behavior:
+        - Iterate over the provided `records` (which can be nested lists of
+          record dictionaries) and transform each record to the DB document
+          structure (fields such as `dense` for the vector, `text` for the
+          content, and `metadata` for auxiliary fields are common in the
+          repository examples).
+        - Skip records missing required fields (for example, missing
+          embeddings) and log or report failures as appropriate.
+        - Use batching / bulk APIs to reduce overhead when writing large
+          volumes of documents.
+
+        Parameters:
+        - records (list): NV-Ingest records (see repository examples for
+          structure)
+        - batch_size (int, optional): how many documents to send per bulk
+          request; database-specific implementations can use this hint
+
+        Returns:
+            implementation-specific result (e.g., number of documents
+            indexed, client response for bulk API). Concrete implementations
+            should document exact return values and failure semantics.
+        """
         pass
 
     @abstractmethod
     def retrieval(self, queries: list, **kwargs):
+        """Perform similarity search for a list of text queries.
+
+        The typical retrieval flow implemented by operators in this ecosystem
+        is:
+        1. Convert each textual `query` into a dense embedding using an
+           external embedding model or service (the example uses an NVIDIA
+           embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+        2. Issue a vector (k-NN) search to the database using the generated
+           embedding, requesting the top-k (configurable) neighbors.
+        3. Post-process results (for example, remove stored dense vectors
+           from returned documents to reduce payload size) and return a
+           list-of-lists of result documents aligned with the input `queries`.
+
+        Keyword arguments (common):
+        - index_name (str): index to search (default: operator's configured
+          index_name)
+        - top_k (int): number of nearest neighbors to return (default: 10)
+        - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+          when the operator integrates with an external embedding service.
+
+        Parameters:
+        - queries (list[str]): list of text queries to be vectorized and
+          searched
+
+        Returns:
+        - results (list[list[dict]]): for each query, a list of hit documents
+          (concrete implementations should specify the document shape they
+          return). Operators should remove large binary/vector fields from
+          responses where possible.
+        """
         pass
 
     @abstractmethod
     def run(self, records):
+        """Main entry point used by the NV-Ingest pipeline.
+
+        The `run` method is intended to be a simple orchestration layer that
+        ensures the index exists and then ingests provided records. A minimal
+        recommended implementation is::
+
+            def run(self, records):
+                self.create_index()
+                self.write_to_index(records)
+
+        Implementers can add pre/post hooks, metrics, retries, or error
+        handling as needed for production readiness. Keep `run` simple so the
+        pipeline orchestration remains predictable.
+
+        Parameters:
+        - records: NV-Ingest records to index (format follows repository
+          conventions)
+
+        Returns:
+        - implementation-specific result (for example, a summary dict or
+          boolean success flag).
+        """
         pass
 
     def reindex(self, records: list, **kwargs):
+        """Optional helper to rebuild or re-populate indexes with new data.
+
+        This non-abstract method is provided as an optional hook that concrete
+        classes may override. A typical reindex implementation will:
+        - optionally delete the existing index and recreate it (via
+          `create_index(recreate=True)`)
+        - call `write_to_index(records)` to populate the new index
+
+        Parameters:
+        - records (list): records used to populate the index
+        - recreate (bool, optional): whether to delete and recreate the
+          index before writing
+
+        Returns:
+        - implementation-specific result
+        """
         pass