nv-ingest-client 2025.11.27.dev20251127__py3-none-any.whl → 2025.12.17.dev20251217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/interface.py +171 -53
- nv_ingest_client/nv_ingest_cli.py +2 -2
- nv_ingest_client/primitives/tasks/caption.py +12 -1
- nv_ingest_client/primitives/tasks/extract.py +50 -2
- nv_ingest_client/primitives/tasks/store.py +18 -13
- nv_ingest_client/util/vdb/lancedb.py +276 -0
- nv_ingest_client/util/vdb/milvus.py +18 -4
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA +2 -1
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD +13 -12
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/interface.py

@@ -53,6 +53,7 @@ from nv_ingest_client.primitives.tasks import SplitTask
 from nv_ingest_client.primitives.tasks import StoreTask
 from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
 from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
@@ -421,6 +422,92 @@ class Ingestor:
 
         return self
 
+    def _resolve_source_name(self, job_id: str, results_data: Optional[Union[List, Dict]] = None) -> str:
+        """
+        Resolves the source name for a given job ID using available metadata or fallback options.
+
+        Parameters
+        ----------
+        job_id : str
+            The job identifier.
+        results_data : Any, optional
+            The data associated with the job result, which might contain metadata.
+
+        Returns
+        -------
+        str
+            The resolved source name.
+        """
+        source_name = "unknown_source"
+        job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+        if job_spec:
+            source_name = job_spec.source_name
+        else:
+            try:
+                if results_data:
+                    first_item = results_data[0] if isinstance(results_data, list) and results_data else results_data
+                    if isinstance(first_item, dict):
+                        source_name = first_item.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
+                if not source_name:
+                    source_name = f"{job_id}"
+            except (IndexError, KeyError, TypeError):
+                source_name = f"{job_id}"
+
+        return source_name
+
+    def _write_results_to_disk(self, doc_data: Any, source_name: str, job_id: str) -> Optional[LazyLoadedList]:
+        """
+        Writes the results for a single job to a JSONL file and returns a LazyLoadedList.
+
+        Parameters
+        ----------
+        doc_data : Any
+            The result data to save.
+        source_name : str
+            The name of the source document.
+        job_id : str
+            The job identifier.
+
+        Returns
+        -------
+        Optional[LazyLoadedList]
+            A proxy object to the saved file, or None if the save failed.
+        """
+        if not self._output_config:
+            logger.warning("Attempted to write results to disk without output configuration.")
+            return None
+
+        try:
+            output_dir = self._output_config["output_directory"]
+            clean_source_basename = get_valid_filename(os.path.basename(source_name))
+            file_name, file_ext = os.path.splitext(clean_source_basename)
+            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
+            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
+
+            data_to_save = doc_data if isinstance(doc_data, list) else [doc_data]
+
+            num_items_saved = save_document_results_to_jsonl(
+                data_to_save,
+                jsonl_filepath,
+                source_name,
+                ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
+            )
+
+            if num_items_saved > 0:
+                return LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
+        except Exception as e_save:
+            logger.error(
+                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
+                exc_info=True,
+            )
+        return None
+
     def ingest(
         self,
         show_progress: bool = False,
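With an output directory configured, each job's records are persisted as `<source-name>.<ext>.results.jsonl` (plus `.gz` when compression is "gzip") and handed back as a `LazyLoadedList` proxy. A minimal sketch of reading one of these files back using only the standard library; the path is a hypothetical example of the naming scheme above:

```python
import gzip
import json


def read_results_jsonl(path: str) -> list:
    """Load one JSON record per line; gzip is inferred from the file suffix."""
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Hypothetical output file produced for a source named "report.pdf".
records = read_results_jsonl("output/report.pdf.results.jsonl.gz")
print(len(records), "records")
```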
@@ -488,52 +575,19 @@ class Ingestor:
 
         def _perform_save_task(doc_data, job_id, source_name):
             # This function runs in the io_executor
-
-
-
-
-            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
-            if self._output_config["compression"] == "gzip":
-                file_suffix += ".gz"
-            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
-
-            num_items_saved = save_document_results_to_jsonl(
-                doc_data,
-                jsonl_filepath,
-                source_name,
-                ensure_parent_dir_exists=False,
-                compression=self._output_config["compression"],
-            )
-
-            if num_items_saved > 0:
-                results = LazyLoadedList(
-                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
-                )
-                if results_lock:
-                    with results_lock:
-            else:  # Should not happen if io_executor is used
+            results = self._write_results_to_disk(doc_data, source_name, job_id)
+            if results:
+                if results_lock:
+                    with results_lock:
                         final_results_payload_list.append(results)
-
-
-                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
-                exc_info=True,
-            )
+                else:  # Should not happen if io_executor is used
+                    final_results_payload_list.append(results)
 
         def _disk_save_callback(
             results_data: Dict[str, Any],
             job_id: str,
         ):
-            source_name =
-            job_spec = self._client._job_index_to_job_spec.get(job_id)
-            if job_spec:
-                source_name = job_spec.source_name
-            else:
-                try:
-                    if results_data:
-                        source_name = results_data[0]["metadata"]["source_metadata"]["source_id"]
-                except (IndexError, KeyError, TypeError):
-                    source_name = f"{job_id}"
+            source_name = self._resolve_source_name(job_id, results_data)
 
             if not results_data:
                 logger.warning(f"No data in response for job {job_id} (source: {source_name}). Skipping save.")
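`_resolve_source_name` now centralizes the fallback: when the job spec is no longer indexed, the name is pulled from the first record's nested metadata, and failing that the job id itself is used. A small sketch of that lookup against an illustrative (hypothetical) record:

```python
record = {
    "metadata": {
        "source_metadata": {"source_id": "reports/q3-summary.pdf"},
        "content_metadata": {"page_number": 1},
    }
}

# Same defensive chain the helper uses: any missing level resolves to "".
source_name = record.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
print(source_name or "fallback-to-job-id")
```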
@@ -734,12 +788,49 @@ class Ingestor:
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)
 
+        stream_to_callback_only = False
+        completion_callback = None
+        async_results_map = {}
+
+        io_executor = None
+        io_futures = []
+
+        if self._output_config:
+            stream_to_callback_only = True
+            output_dir = self._output_config["output_directory"]
+
+            os.makedirs(output_dir, exist_ok=True)
+
+            io_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="IngestAsyncIO")
+
+            def _io_task(data: Dict[str, Any], job_id: str):
+                try:
+                    source_name = self._resolve_source_name(job_id, data)
+                    result = self._write_results_to_disk(data, source_name, job_id)
+                    if result:
+                        # Store the LazyLoadedList in our map using job_id as key
+                        async_results_map[job_id] = result
+                except Exception as e:
+                    logger.error(f"Error in async I/O task for job {job_id}: {e}", exc_info=True)
+
+            def _composite_callback(data: Dict[str, Any], job_id: str):
+                """Callback executed by worker threads to save data to disk."""
+                try:
+                    future = io_executor.submit(_io_task, data, job_id)
+                    io_futures.append(future)
+                except Exception as e:
+                    logger.error(f"Error in async callback for job {job_id}: {e}", exc_info=True)
+
+            completion_callback = _composite_callback
+
         final_future: Future = Future()
 
         processor_future = self._client.process_jobs_concurrently_async(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             return_traces=return_traces,
+            completion_callback=completion_callback,
+            stream_to_callback_only=stream_to_callback_only,
             **proc_kwargs,
         )
 
@@ -759,6 +850,20 @@ class Ingestor:
 
             results, failures, traces_list = proc_future.result()
 
+            if io_executor:
+                for f in as_completed(io_futures):
+                    if f.exception():
+                        logger.error(f"Async I/O task failed: {f.exception()}")
+                io_executor.shutdown(wait=True)
+
+            final_results_list = []
+            if self._output_config:
+                for item in results:
+                    if isinstance(item, str) and item in async_results_map:
+                        final_results_list.append(async_results_map[item])
+            else:
+                final_results_list = results
+
             failed_job_ids = set()
             for job_id_with_source, error_msg in failures:
                 job_id = job_id_with_source.split(":", 1)[0]
@@ -775,18 +880,22 @@ class Ingestor:
                 if self._job_states[job_id].state != JobStateEnum.COMPLETED:
                     self._job_states[job_id].state = JobStateEnum.COMPLETED
 
-            if self._vdb_bulk_upload and
+            if self._vdb_bulk_upload and final_results_list:
                 with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
                     results_future = Future()
-                    results_future.set_result(
+                    results_future.set_result(final_results_list)
                     vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
                     vdb_future.result()
 
+                if self._purge_results_after_vdb_upload and self._output_config:
+                    logger.info("Purging saved results from disk after successful VDB upload.")
+                    self._purge_saved_results(final_results_list)
+
             parent_trace_ids = (
                 self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
             )
 
-            returns = [
+            returns = [final_results_list]
             if return_failures:
                 returns.append(failures)
             if return_traces:
@@ -794,7 +903,7 @@ class Ingestor:
             if include_parent_trace_ids:
                 returns.append(parent_trace_ids)
 
-            final_result = tuple(returns) if len(returns) > 1 else
+            final_result = tuple(returns) if len(returns) > 1 else final_results_list
 
             if not final_future.done():
                 final_future.set_result(final_result)
@@ -812,6 +921,9 @@ class Ingestor:
             ):
                 job_state.state = final_state
 
+            if io_executor:
+                io_executor.shutdown(wait=False)
+
         processor_future.add_done_callback(_processor_done_callback)
         return final_future
 
@@ -963,11 +1075,18 @@ class Ingestor:
             **kwargs,
         )
 
+        api_document_type = EXTENSION_TO_DOCUMENT_TYPE.get(document_type.lower(), document_type)
+
         # Extract method from task_options for API schema
         method = task_options.pop("extract_method", None)
         if method is None:
             # Let ExtractTask constructor handle default method selection
-
+            if api_document_type == "docx":
+                method = "python_docx"
+            elif api_document_type == "pptx":
+                method = "python_pptx"
+            else:
+                method = "pdfium"  # Default fallback
 
         # Build params dict for API schema
         params = {k: v for k, v in task_options.items() if k != "document_type"}
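The default method now keys off the resolved document type rather than a single global fallback. A standalone sketch of that selection; the function and mapping names here are illustrative, not part of the package:

```python
EXTENSION_DEFAULTS = {"docx": "python_docx", "pptx": "python_pptx"}


def default_extract_method(api_document_type: str) -> str:
    """Per-type default shown above; PDFs and anything else fall back to pdfium."""
    return EXTENSION_DEFAULTS.get(api_document_type, "pdfium")


assert default_extract_method("pptx") == "python_pptx"
assert default_extract_method("pdf") == "pdfium"
```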
@@ -1088,13 +1207,9 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-
-        if
-
-
-        # Provide default method if not specified (matching client StoreTask behavior)
-        if "method" not in kwargs:
-            kwargs["method"] = "minio"
+        deprecated_method = kwargs.pop("store_method", None)
+        if deprecated_method is not None:
+            logger.warning("`store_method` is deprecated and no longer used. Configure storage_uri instead.")
 
         task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
 
@@ -1102,7 +1217,9 @@ class Ingestor:
         store_params = {
             "structured": task_options.structured,
             "images": task_options.images,
-            "
+            "storage_uri": task_options.storage_uri,
+            "storage_options": task_options.storage_options,
+            "public_base_url": task_options.public_base_url,
             "params": task_options.params,
         }
         store_task = StoreTask(**store_params)
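On the fluent interface, storage is therefore configured with a URI plus backend options instead of a named `store_method` (passing `store_method` now only logs a deprecation warning). A hedged sketch of a chained call; the bucket, endpoint, and the pre-configured `ingestor` variable are placeholders:

```python
# `ingestor` is an already-configured Ingestor instance (assumed); store() returns self for chaining.
ingestor.store(
    structured=True,
    images=True,
    storage_uri="s3://my-bucket/nv-ingest-artifacts",       # placeholder bucket
    storage_options={"endpoint_url": "http://minio:9000"},  # placeholder backend options
    public_base_url="http://minio:9000/my-bucket",          # placeholder public URL
)
```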
@@ -1347,6 +1464,7 @@ class Ingestor:
             "api_key": task_options.api_key,
             "endpoint_url": task_options.endpoint_url,
             "prompt": task_options.prompt,
+            "system_prompt": task_options.system_prompt,
             "model_name": task_options.model_name,
         }
         caption_task = CaptionTask(**caption_params)
nv_ingest_client/nv_ingest_cli.py

@@ -76,7 +76,7 @@ logger = logging.getLogger(__name__)
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
 @click.option(
     "--api_version",
-    default="
+    default="v2",
     type=click.Choice(["v1", "v2"], case_sensitive=False),
     help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
 )
@@ -120,7 +120,7 @@ Each task must be specified with its type and corresponding options in the '[tas
 Example:
     --task 'split:{"split_by":"page", "split_length":10}'
     --task 'extract:{"document_type":"pdf", "extract_text":true}'
-    --task 'extract:{"document_type":"pdf", "extract_method":"
+    --task 'extract:{"document_type":"pdf", "extract_method":"nemotron_parse"}'
     --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
nv_ingest_client/primitives/tasks/caption.py

@@ -22,18 +22,24 @@ class CaptionTask(Task):
         api_key: str = None,
         endpoint_url: str = None,
         prompt: str = None,
+        system_prompt: str = None,
         model_name: str = None,
     ) -> None:
         super().__init__()
 
         # Use the API schema for validation
         validated_data = IngestTaskCaptionSchema(
-            api_key=api_key,
+            api_key=api_key,
+            endpoint_url=endpoint_url,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            model_name=model_name,
         )
 
         self._api_key = validated_data.api_key
         self._endpoint_url = validated_data.endpoint_url
         self._prompt = validated_data.prompt
+        self._system_prompt = validated_data.system_prompt
         self._model_name = validated_data.model_name
 
     def __str__(self) -> str:
@@ -49,6 +55,8 @@ class CaptionTask(Task):
             info += f"  endpoint_url: {self._endpoint_url}\n"
         if self._prompt:
             info += f"  prompt: {self._prompt}\n"
+        if self._system_prompt:
+            info += f"  system_prompt: {self._system_prompt}\n"
         if self._model_name:
             info += f"  model_name: {self._model_name}\n"
 
@@ -69,6 +77,9 @@ class CaptionTask(Task):
         if self._prompt:
             task_properties["prompt"] = self._prompt
 
+        if self._system_prompt:
+            task_properties["system_prompt"] = self._system_prompt
+
         if self._model_name:
             task_properties["model_name"] = self._model_name
 
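A system prompt can now ride alongside the user prompt on the caption task. A minimal construction example; the endpoint URL and model id are illustrative values, not defaults shipped with the package:

```python
from nv_ingest_client.primitives.tasks.caption import CaptionTask

caption_task = CaptionTask(
    endpoint_url="https://example.invalid/v1/chat/completions",  # illustrative endpoint
    prompt="Describe the key elements of this image in one sentence.",
    system_prompt="You are a terse, factual image captioner.",
    model_name="example/vision-language-model",                  # illustrative model id
)
print(caption_task)  # __str__ now reports system_prompt when it is set
```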
nv_ingest_client/primitives/tasks/extract.py

@@ -8,6 +8,8 @@
 
 import logging
 import os
+import warnings
+from typing import get_args
 from typing import Any
 from typing import Dict
 from typing import Literal
@@ -52,15 +54,27 @@ _DEFAULT_EXTRACTOR_MAP = {
 
 _Type_Extract_Method_PDF = Literal[
     "adobe",
-    "
+    "nemotron_parse",
     "haystack",
     "llama_parse",
     "pdfium",
     "tika",
     "unstructured_io",
+    "unstructured_local",
+    "pdfium_hybrid",
     "ocr",
 ]
 
+_Type_Extract_Method_DOCX = Literal[
+    "python_docx",
+    "render_as_pdf",
+]
+
+_Type_Extract_Method_PPTX = Literal[
+    "python_pptx",
+    "render_as_pdf",
+]
+
 _Type_Extract_Images_Method = Literal["group", "yolox"]
 
 _Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
@@ -74,7 +88,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method:
+        extract_method: Optional[str] = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -109,6 +123,12 @@ class ExtractTask(Task):
             )
             extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
 
+        if extract_method == "nemoretriever_parse":
+            logger.warning("'nemoretriever_parse' is deprecated. Please use 'nemotron_parse' instead.")
+            extract_method = "nemotron_parse"
+
+        self._validate_extract_method(document_type, extract_method)
+
         # Set default extract_charts if None
         if extract_charts is None:
             extract_charts = extract_tables
@@ -240,3 +260,31 @@
     @property
     def document_type(self):
         return self._document_type.value
+
+    def _validate_extract_method(self, document_type: str, extract_method: str):
+        doc_type = document_type.lower()
+
+        valid_docx = set(get_args(_Type_Extract_Method_DOCX))
+        valid_pptx = set(get_args(_Type_Extract_Method_PPTX))
+        valid_pdf = set(get_args(_Type_Extract_Method_PDF))
+
+        if doc_type == "docx" and extract_method not in valid_docx:
+            raise ValueError(f"'{extract_method}' is invalid for DOCX. Options: {valid_docx}")
+
+        elif doc_type == "pptx" and extract_method not in valid_pptx:
+            raise ValueError(f"'{extract_method}' is invalid for PPTX. Options: {valid_pptx}")
+
+        elif doc_type == "pdf" and extract_method not in valid_pdf:
+            raise ValueError(f"'{extract_method}' is invalid for PDF. Options: {valid_pdf}")
+
+        elif doc_type not in ["docx", "pptx", "pdf"]:
+            is_docx_method = extract_method in valid_docx
+            is_pptx_method = extract_method in valid_pptx
+            is_pdf_method = extract_method in valid_pdf
+
+            if (is_docx_method or is_pptx_method) and not is_pdf_method:
+                warnings.warn(
+                    f"extract_method '{extract_method}' is valid for Office documents but NOT for PDFs. "
+                    "If your batch includes PDFs, extraction may fail for those files. "
+                    "Consider leaving extract_method=None for mixed batches."
+                )
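The constructor now normalizes the deprecated `nemoretriever_parse` value and validates the method against the per-type sets above. A hedged sketch of the resulting behavior; only arguments shown in this diff are used, and other constructor options are assumed to keep their defaults:

```python
from nv_ingest_client.primitives.tasks.extract import ExtractTask

# Valid: python_docx is in the DOCX method set.
docx_task = ExtractTask(document_type="docx", extract_method="python_docx", extract_text=True)

# Invalid: a PDF-only method on a DOCX document now raises ValueError.
try:
    ExtractTask(document_type="docx", extract_method="pdfium")
except ValueError as err:
    print(err)
```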
nv_ingest_client/primitives/tasks/store.py

@@ -7,8 +7,7 @@
 # pylint: disable=too-many-arguments
 
 import logging
-from typing import Dict
-from typing import Literal
+from typing import Dict, Literal, Optional
 
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
@@ -17,23 +16,19 @@ from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_STORE_METHOD = "minio"
-
 
 class StoreTask(Task):
     """
    Object for image storage task.
     """
 
-    _Type_Content_Type = Literal["image",]
-
-    _Type_Store_Method = Literal["minio",]
-
     def __init__(
         self,
         structured: bool = True,
         images: bool = False,
-
+        storage_uri: Optional[str] = None,
+        storage_options: Optional[dict] = None,
+        public_base_url: Optional[str] = None,
        params: dict = None,
        **extra_params,
     ) -> None:
@@ -51,12 +46,19 @@ class StoreTask(Task):
 
         # Use the API schema for validation
         validated_data = IngestTaskStoreSchema(
-            structured=structured,
+            structured=structured,
+            images=images,
+            storage_uri=storage_uri,
+            storage_options=storage_options or {},
+            public_base_url=public_base_url,
+            params=merged_params,
         )
 
         self._structured = validated_data.structured
         self._images = validated_data.images
-        self.
+        self._storage_uri = validated_data.storage_uri
+        self._storage_options = validated_data.storage_options
+        self._public_base_url = validated_data.public_base_url
         self._params = validated_data.params
         self._extra_params = extra_params
 
@@ -68,7 +70,8 @@ class StoreTask(Task):
         info += "Store Task:\n"
         info += f"  store structured types: {self._structured}\n"
         info += f"  store image types: {self._images}\n"
-        info += f"
+        info += f"  storage uri: {self._storage_uri}\n"
+        info += f"  public base url: {self._public_base_url}\n"
         for key, value in self._extra_params.items():
             info += f"  {key}: {value}\n"
         for key, value in self._params.items():
@@ -81,9 +84,11 @@ class StoreTask(Task):
         """
 
         task_properties = {
-            "method": self._store_method,
             "structured": self._structured,
             "images": self._images,
+            "storage_uri": self._storage_uri,
+            "storage_options": self._storage_options,
+            "public_base_url": self._public_base_url,
             "params": self._params,
             **self._extra_params,
         }
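`StoreTask` now validates `storage_uri`, `storage_options`, and `public_base_url` through `IngestTaskStoreSchema` and serializes them into the task properties in place of the old `method` field. A minimal construction sketch; the URI, options, and URL values are placeholders:

```python
from nv_ingest_client.primitives.tasks.store import StoreTask

store_task = StoreTask(
    structured=True,
    images=True,
    storage_uri="file:///tmp/nv-ingest-store",          # placeholder local URI
    storage_options={"anon": False},                    # placeholder backend options
    public_base_url="http://localhost:8080/artifacts",  # placeholder public URL
)
print(store_task)  # __str__ now reports the storage uri and public base url
```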
nv_ingest_client/util/vdb/lancedb.py (new file)

@@ -0,0 +1,276 @@
+import logging
+
+
+from nv_ingest_client.util.vdb.adt_vdb import VDB
+from datetime import timedelta
+from functools import partial
+from urllib.parse import urlparse
+from nv_ingest_client.util.transport import infer_microservice
+import lancedb
+import pyarrow as pa
+
+logger = logging.getLogger(__name__)
+
+
+def create_lancedb_results(results):
+    """Transform NV-Ingest pipeline results into LanceDB ingestible rows.
+
+    The NV-Ingest pipeline provides nested lists of record dictionaries. This
+    helper extracts the inner `metadata` dict for each record, filters out
+    entries without an embedding, and returns a list of dictionaries with the
+    exact fields expected by the LanceDB table schema used in
+    `LanceDB.create_index`.
+
+    Parameters
+    ----------
+    results : list
+        Nested list-of-lists containing record dicts in the NV-Ingest format.
+
+    Returns
+    -------
+    list
+        List of dictionaries with keys: `vector` (embedding list), `text`
+        (string content), `metadata` (page number) and `source` (source id).
+
+    Notes
+    -----
+    - The function expects each inner record to have a `metadata` mapping
+      containing `embedding`, `content`, `content_metadata.page_number`, and
+      `source_metadata.source_id`.
+    - Records with `embedding is None` are skipped.
+    """
+    old_results = [res["metadata"] for result in results for res in result]
+    results = []
+    for result in old_results:
+        if result["embedding"] is None:
+            continue
+        results.append(
+            {
+                "vector": result["embedding"],
+                "text": result["content"],
+                "metadata": result["content_metadata"]["page_number"],
+                "source": result["source_metadata"]["source_id"],
+            }
+        )
+    return results
+
+
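An illustrative (hypothetical) input/output pair for this transform; real embeddings are 2048-dimensional, truncated here to three values for brevity:

```python
from nv_ingest_client.util.vdb.lancedb import create_lancedb_results

nested_records = [
    [
        {
            "metadata": {
                "embedding": [0.1, 0.2, 0.3],  # placeholder vector
                "content": "First chunk of text.",
                "content_metadata": {"page_number": 1},
                "source_metadata": {"source_id": "doc-1.pdf"},
            }
        },
        # Records without an embedding are dropped before any other field is read.
        {"metadata": {"embedding": None}},
    ]
]

rows = create_lancedb_results(nested_records)
# rows == [{"vector": [0.1, 0.2, 0.3], "text": "First chunk of text.",
#           "metadata": 1, "source": "doc-1.pdf"}]
```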
+class LanceDB(VDB):
+    """LanceDB operator implementing the VDB interface.
+
+    This class adapts NV-Ingest records to LanceDB, providing index creation,
+    ingestion, and retrieval hooks. The implementation is intentionally small
+    and focuses on the example configuration used in NV-Ingest evaluation
+    scripts.
+    """
+
+    def __init__(
+        self,
+        uri=None,
+        overwrite=True,
+        table_name="nv-ingest",
+        index_type="IVF_HNSW_SQ",
+        metric="l2",
+        num_partitions=16,
+        num_sub_vectors=256,
+        **kwargs
+    ):
+        """Initialize the LanceDB VDB operator.
+
+        Parameters
+        ----------
+        uri : str, optional
+            LanceDB connection URI (default is "lancedb" for local file-based
+            storage).
+        overwrite : bool, optional
+            If True, existing tables will be overwritten during index creation.
+            If False, new data will be appended to existing tables.
+        table_name : str, optional
+            Name of the LanceDB table to create/use (default is "nv-ingest").
+        index_type : str, optional
+            Type of vector index to create (default is "IVF_HNSW_SQ").
+        metric : str, optional
+            Distance metric for the vector index (default is "l2").
+        num_partitions : int, optional
+            Number of partitions for the vector index (default is 16).
+        num_sub_vectors : int, optional
+            Number of sub-vectors for the vector index (default is 256).
+        **kwargs : dict
+            Forwarded configuration options. This implementation does not
+            actively consume specific keys, but passing parameters such as
+            `uri`, `index_name`, or security options is supported by the
+            interface pattern and may be used by future enhancements.
+        """
+        self.uri = uri or "lancedb"
+        self.overwrite = overwrite
+        self.table_name = table_name
+        self.index_type = index_type
+        self.metric = metric
+        self.num_partitions = num_partitions
+        self.num_sub_vectors = num_sub_vectors
+        super().__init__(**kwargs)
+
+    def create_index(self, records=None, table_name="nv-ingest", **kwargs):
+        """Create a LanceDB table and populate it with transformed records.
+
+        This method connects to LanceDB, transforms NV-Ingest records using
+        `create_lancedb_results`, builds a PyArrow schema that matches the
+        expected table layout, and creates/overwrites a table named `bo`.
+
+        Parameters
+        ----------
+        records : list, optional
+            NV-Ingest records in nested list format (the same structure passed
+            to `run`). If ``None``, an empty table will be created.
+
+        table_name : str, optional
+            Name of the LanceDB table to create (default is "nv-ingest").
+
+        Returns
+        -------
+        table
+            The LanceDB table object returned by `db.create_table`.
+        """
+        db = lancedb.connect(uri=self.uri)
+        results = create_lancedb_results(records)
+        schema = pa.schema(
+            [
+                pa.field("vector", pa.list_(pa.float32(), 2048)),
+                pa.field("text", pa.string()),
+                pa.field("metadata", pa.string()),
+                pa.field("source", pa.string()),
+            ]
+        )
+        table = db.create_table(
+            table_name, data=results, schema=schema, mode="overwrite" if self.overwrite else "append"
+        )
+        return table
+
+    def write_to_index(
+        self,
+        records,
+        table=None,
+        index_type="IVF_HNSW_SQ",
+        metric="l2",
+        num_partitions=16,
+        num_sub_vectors=256,
+        **kwargs
+    ):
+        """Create an index on the LanceDB table and wait for it to become ready.
+
+        This function calls `table.create_index` with an IVF+HNSW+SQ index
+        configuration used in NV-Ingest benchmarks. After requesting index
+        construction it lists available indices and waits for each one to
+        reach a ready state using `table.wait_for_index`.
+
+        Parameters
+        ----------
+        records : list
+            The original records being indexed (not used directly in this
+            implementation but kept in the signature for consistency).
+        table : object
+            LanceDB table object returned by `create_index`.
+        """
+        table.create_index(
+            index_type=index_type,
+            metric=metric,
+            num_partitions=num_partitions,
+            num_sub_vectors=num_sub_vectors,
+            # accelerator="cuda",
+            vector_column_name="vector",
+        )
+        for index_stub in table.list_indices():
+            table.wait_for_index([index_stub.name], timeout=timedelta(seconds=600))
+
+    def retrieval(
+        self,
+        queries,
+        table=None,
+        embedding_endpoint="http://localhost:8012/v1",
+        nvidia_api_key=None,
+        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
+        result_fields=["text", "metadata", "source"],
+        top_k=10,
+        **kwargs
+    ):
+        """Run similarity search for a list of text queries.
+
+        This method converts textual queries to embeddings by calling the
+        transport helper `infer_microservice` (configured to use an NVIDIA
+        embedding model in the example) and performs a vector search against
+        the LanceDB `table`.
+
+        Parameters
+        ----------
+        queries : list[str]
+            Text queries to be embedded and searched.
+        table : object
+            LanceDB table object with a built vector index.
+        embedding_endpoint : str, optional
+            URL of the embedding microservice (default is
+            "http://localhost:8012/v1").
+        nvidia_api_key : str, optional
+            NVIDIA API key for authentication with the embedding service. If
+            ``None``, no authentication is used.
+        model_name : str, optional
+            Name of the embedding model to use (default is
+            "nvidia/llama-3.2-nv-embedqa-1b-v2").
+        result_fields : list, optional
+            List of field names to retrieve from each hit document (default is
+            `["text", "metadata", "source"]`).
+        top_k : int, optional
+            Number of top results to return per query (default is 10).
+
+        Returns
+        -------
+        list[list[dict]]
+            For each input query, a list of hit documents (each document is a
+            dict with fields such as `text`, `metadata`, and `source`). The
+            example limits each query to 20 results.
+        """
+        embed_model = partial(
+            infer_microservice,
+            model_name=model_name,
+            embedding_endpoint=embedding_endpoint,
+            nvidia_api_key=nvidia_api_key,
+            input_type="query",
+            output_names=["embeddings"],
+            grpc=not ("http" in urlparse(embedding_endpoint).scheme),
+        )
+        results = []
+        query_embeddings = embed_model(queries)
+        for query_embed in query_embeddings:
+            results.append(
+                table.search([query_embed], vector_column_name="vector").select(result_fields).limit(top_k).to_list()
+            )
+        return results
+
+    def run(self, records):
+        """Orchestrate index creation and data ingestion.
+
+        The `run` method is the public entry point used by NV-Ingest pipeline
+        tasks. A minimal implementation first ensures the table exists by
+        calling `create_index` and then kicks off index construction with
+        `write_to_index`.
+
+        Parameters
+        ----------
+        records : list
+            NV-Ingest records to index.
+
+        Returns
+        -------
+        list
+            The original `records` list is returned unchanged to make the
+            operator composable in pipelines.
+        """
+        table = self.create_index(records=records, table_name=self.table_name)
+        self.write_to_index(
+            records,
+            table=table,
+            index_type=self.index_type,
+            metric=self.metric,
+            num_partitions=self.num_partitions,
+            num_sub_vectors=self.num_sub_vectors,
+        )
+        return records
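A hedged end-to-end sketch of the new operator: ingest pipeline records into a local LanceDB table, then query it. The `records` variable, local path, and embedding endpoint are assumptions for illustration:

```python
import lancedb

from nv_ingest_client.util.vdb.lancedb import LanceDB

vdb = LanceDB(uri="./lancedb", table_name="nv-ingest", overwrite=True)

# `records` is the nested list-of-lists returned by an NV-Ingest run (assumed available).
vdb.run(records)

# Re-open the populated table and run retrieval; retrieval() embeds the query text
# through the configured embedding microservice before searching.
table = lancedb.connect("./lancedb").open_table("nv-ingest")
hits = vdb.retrieval(
    ["What does the report say about quarterly revenue?"],
    table=table,
    embedding_endpoint="http://localhost:8012/v1",  # assumed local embedding service
    top_k=5,
)
print(hits[0][0]["text"])
```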
nv_ingest_client/util/vdb/milvus.py

@@ -287,6 +287,10 @@ def create_nvingest_index_params(
     gpu_index: bool = True,
     gpu_search: bool = False,
     local_index: bool = True,
+    intermediate_graph_degree: int = 128,
+    graph_degree: int = 100,
+    m: int = 64,
+    ef_construction: int = 512,
 ) -> IndexParams:
     """
     Creates index params necessary to create an index for a collection. At a minimum,
@@ -326,8 +330,8 @@ def create_nvingest_index_params(
             index_type="GPU_CAGRA",
             metric_type="L2",
             params={
-                "intermediate_graph_degree":
-                "graph_degree":
+                "intermediate_graph_degree": intermediate_graph_degree,
+                "graph_degree": graph_degree,
                 "build_algo": "NN_DESCENT",
                 "cache_dataset_on_device": "true",
                 "adapt_for_cpu": "false" if gpu_search else "true",
@@ -339,7 +343,7 @@ def create_nvingest_index_params(
             index_name=DENSE_INDEX_NAME,
             index_type="HNSW",
             metric_type="L2",
-            params={"M":
+            params={"M": m, "efConstruction": ef_construction},
         )
     if sparse and local_index:
         index_params.add_index(
@@ -407,6 +411,10 @@ def create_nvingest_collection(
     recreate_meta: bool = False,
     username: str = None,
     password: str = None,
+    intermediate_graph_degree: int = 128,
+    graph_degree: int = 100,
+    m: int = 64,
+    ef_construction: int = 512,
 ) -> CollectionSchema:
     """
     Creates a milvus collection with an nv-ingest compatible schema under
@@ -457,6 +465,10 @@ def create_nvingest_collection(
         gpu_index=gpu_index,
         gpu_search=gpu_search,
         local_index=local_index,
+        intermediate_graph_degree=intermediate_graph_degree,
+        graph_degree=graph_degree,
+        m=m,
+        ef_construction=ef_construction,
     )
     create_collection(client, collection_name, schema, index_params, recreate=recreate)
     d_idx, s_idx = _get_index_types(index_params, sparse=sparse)
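The CAGRA and HNSW build parameters that were previously hard-coded are now arguments, and `create_nvingest_collection` forwards them to `create_nvingest_index_params`. A hedged sketch of calling the latter directly; parameters not shown in this diff are assumed to keep their defaults:

```python
from nv_ingest_client.util.vdb.milvus import create_nvingest_index_params

# GPU path: tune the CAGRA graph build.
gpu_index_params = create_nvingest_index_params(
    gpu_index=True,
    gpu_search=False,
    intermediate_graph_degree=128,
    graph_degree=100,
)

# CPU/local path: tune the HNSW index instead.
cpu_index_params = create_nvingest_index_params(
    gpu_index=False,
    local_index=True,
    m=64,
    ef_construction=512,
)
```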
@@ -949,6 +961,7 @@ def write_to_nvingest_collection(
     stream: bool = False,
     username: str = None,
     password: str = None,
+    no_wait_index: bool = False,
     **kwargs,
 ):
     """
@@ -1054,7 +1067,7 @@ def write_to_nvingest_collection(
         client,
         collection_name,
     )
-    if not local_index:
+    if not local_index and not no_wait_index:
         # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
         # know how long this should take, it is num_elements dependent.
         wait_for_index(collection_name, expected_rows, client)
@@ -1971,6 +1984,7 @@ class Milvus(VDB):
         threshold: int = 1000,
         username: str = None,
         password: str = None,
+        no_wait_index: bool = False,
         **kwargs,
     ):
         """
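`no_wait_index=True` skips the blocking `wait_for_index` step after writing, on both `write_to_nvingest_collection` and the `Milvus` operator. A hedged construction sketch; the collection name and `milvus_uri` parameter are assumptions for illustration — only the keyword added in this diff is confirmed by it:

```python
from nv_ingest_client.util.vdb.milvus import Milvus

vdb = Milvus(
    collection_name="nv_ingest_collection",  # assumed parameter / placeholder name
    milvus_uri="http://localhost:19530",     # assumed parameter / placeholder endpoint
    no_wait_index=True,                      # return as soon as rows are written
)
```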
{nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.11.27.dev20251127
+Version: 2025.12.17.dev20251217
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
 Requires-Dist: requests>=2.28.2
 Requires-Dist: setuptools>=78.1.1
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: lancedb>=0.25.3
 Provides-Extra: milvus
 Requires-Dist: pymilvus==2.5.10; extra == "milvus"
 Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
{nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD

@@ -1,5 +1,5 @@
 nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest_client/nv_ingest_cli.py,sha256=
+nv_ingest_client/nv_ingest_cli.py,sha256=qeZJZq_ltnNFiytQNwMY3VAL7nBUXW2HnwMzBGaKQJ0,14452
 nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
@@ -8,7 +8,7 @@ nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
 nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
 nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
-nv_ingest_client/client/interface.py,sha256=
+nv_ingest_client/client/interface.py,sha256=1gmFQ7bVQDiEweChN_Divv1Y87a4cNkEgH2Shp4tIMw,64915
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
@@ -16,16 +16,16 @@ nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQb
 nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
 nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
 nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
-nv_ingest_client/primitives/tasks/caption.py,sha256=
+nv_ingest_client/primitives/tasks/caption.py,sha256=w-xPKN77zruUel0md4OA-x2ciELSLY-8Px1ds76gak0,2498
 nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
 nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
 nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
-nv_ingest_client/primitives/tasks/extract.py,sha256=
+nv_ingest_client/primitives/tasks/extract.py,sha256=jTCOSQG1MG0RoQg4DxPgmYgeHQR7O24hmysygkWYyIY,11270
 nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
 nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
 nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
 nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
-nv_ingest_client/primitives/tasks/store.py,sha256=
+nv_ingest_client/primitives/tasks/store.py,sha256=UeIspL_RDPBbUV3gv8SK3tIoYNun8r4cSSMxXvBSaks,4575
 nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
 nv_ingest_client/primitives/tasks/task_base.py,sha256=Mrx6kgePJHolYd3Im6mVISXcVgdulLst2MYG5gPov9I,1687
 nv_ingest_client/primitives/tasks/task_factory.py,sha256=uvGQXjgWmeF015jPWmBhiclzfrUf3_yD2PPeirQBczM,3218
@@ -46,11 +46,12 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
 nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
 nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
-nv_ingest_client/util/vdb/
+nv_ingest_client/util/vdb/lancedb.py,sha256=mLykdOFkLC5-SpRvHAvt0do9rhyQDqy_H48D6hEtegw,10037
+nv_ingest_client/util/vdb/milvus.py,sha256=NLlsYU5LdESh0r_Psvn0vzGiNN-70iouOGr3RgZaMVg,81316
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
+nv_ingest_client-2025.12.17.dev20251217.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.12.17.dev20251217.dist-info/METADATA,sha256=EbEZoUk3-GvCBAB2z0hqZjgMOGasw75hZCWTDk7yxpk,30658
+nv_ingest_client-2025.12.17.dev20251217.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.12.17.dev20251217.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.12.17.dev20251217.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.12.17.dev20251217.dist-info/RECORD,,