PyPI - nv-ingest-client - Versions diffs - 2025.9.8.dev20250908__tar.gz → 2025.9.9.dev20250909__tar.gz - Mend

nv-ingest-client 2025.9.8.dev20250908.tar.gz → 2025.9.9.dev20250909.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (60) hide show

{nv_ingest_client-2025.9.8.dev20250908/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.9.dev20250909}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.9.8.dev20250908
+Version: 2025.9.9.dev20250909
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client/util/image_disk_utils.py ADDED Viewed

@@ -0,0 +1,300 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Utility functions for saving images from ingestion results to disk as actual image files.
+This module provides comprehensive utilities for extracting and saving base64-encoded
+images from nv-ingest results to local filesystem. Features include:
+- Configurable filtering by image type (charts, tables, infographics, etc.)
+- Descriptive filename generation with source and page information
+- Organized directory structure by image type
+- Detailed image counting and statistics
+Typical use cases:
+- Debugging and visual inspection of extracted content
+- Quality assessment of image extraction pipeline
+"""
+import logging
+import os
+from typing import Any, Dict, List
+from nv_ingest_client.client.util.processing import get_valid_filename
+from nv_ingest_api.util.image_processing.transforms import save_image_to_disk, _detect_base64_image_format
+logger = logging.getLogger(__name__)
+def _detect_extension_from_content(image_content: str) -> str:
+    """
+    Pick a file extension based on the detected format of the image content.
+    Parameters
+    ----------
+    image_content : str
+        Encoded image data, handed unchanged to ``_detect_base64_image_format``.
+    Returns
+    -------
+    str
+        ``"png"`` for a detected PNG, ``"jpg"`` for a detected JPEG/JPG. If detection
+        raises, or reports any other format, a warning is logged and ``"jpg"`` is returned.
+    """
+    fallback_ext = "jpg"  # must be either "jpg" or "png"
+    extension_by_format = {"JPEG": "jpg", "JPG": "jpg", "PNG": "png"}
+    try:
+        fmt = _detect_base64_image_format(image_content).upper()
+    except Exception:
+        # Detection is best-effort: any failure degrades to the default instead of propagating.
+        logger.warning("Image format detection failed; falling back to default '%s'.", fallback_ext)
+        return fallback_ext
+    extension = extension_by_format.get(fmt)
+    if extension is None:
+        logger.warning("Unsupported image format '%s'; falling back to default '%s'.", fmt, fallback_ext)
+        return fallback_ext
+    return extension
+def save_images_to_disk(
+    response_data: List[Dict[str, Any]],
+    output_directory: str,
+    save_charts: bool = True,
+    save_tables: bool = True,
+    save_infographics: bool = True,
+    save_page_images: bool = False,
+    save_raw_images: bool = False,
+    organize_by_type: bool = True,
+    output_format: str = "auto",
+) -> Dict[str, int]:
+    """
+    Write the images found in ingestion results to disk and report how many were saved.
+    Each document's ``metadata["content"]`` is treated as the image payload. Documents are
+    selected by ``content_metadata["subtype"]`` together with the ``save_*`` flags, then
+    written through ``save_image_to_disk`` under a filename built from the source id's
+    base name, the page number and the document's position in ``response_data``.
+    Parameters
+    ----------
+    response_data : List[Dict[str, Any]]
+        Document results from ingestion; each entry is read via its "metadata" and
+        "document_type" keys.
+    output_directory : str
+        Base directory for the saved images. Created if it does not exist.
+    save_charts : bool, optional
+        Save documents whose subtype is "chart". Default is True.
+    save_tables : bool, optional
+        Save documents whose subtype is "table". Default is True.
+    save_infographics : bool, optional
+        Save documents whose subtype is "infographic". Default is True.
+    save_page_images : bool, optional
+        Save documents whose subtype is "page_image". Default is False.
+    save_raw_images : bool, optional
+        Save documents of type "image" whose subtype is none of the above; these are
+        counted and stored under the subtype "image". Default is False.
+    organize_by_type : bool, optional
+        When True, files go into one subdirectory per subtype; when False, all files share
+        ``output_directory`` and the subtype is embedded in the filename. Default is True.
+    output_format : str, optional
+        Target image format (case-insensitive). Default is "auto".
+        - "auto": detect the format from the content; PNG stays PNG, anything else is JPEG
+        - "jpeg" / "jpg": save as JPEG (files get the ".jpeg" extension)
+        - "png": save as PNG
+    Returns
+    -------
+    Dict[str, int]
+        Counts of saved images keyed by "chart", "table", "infographic", "page_image",
+        "image" and "total". An empty dict is returned when ``response_data`` is empty.
+    Raises
+    ------
+    ValueError
+        If ``output_format`` is not a supported value. The check runs only when
+        ``response_data`` is non-empty.
+    Examples
+    --------
+    >>> from nv_ingest_client.util.image_disk_utils import save_images_to_disk
+    >>>
+    >>> # Save only charts and tables
+    >>> counts = save_images_to_disk(
+    ...     response_data,
+    ...     "./output/images",
+    ...     save_charts=True,
+    ...     save_tables=True,
+    ...     save_page_images=False
+    ... )
+    >>> print(f"Saved {counts['chart']} charts and {counts['table']} tables")
+    """
+    if not response_data:
+        logger.warning("No response data provided")
+        return {}
+    # Reject an unknown format before touching the filesystem.
+    requested_format = output_format.lower()
+    if requested_format not in ("auto", "png", "jpeg", "jpg"):
+        raise ValueError(
+            f"Unsupported output format: '{output_format}'. Supported formats: 'auto', 'png', 'jpeg', 'jpg'"
+        )
+    image_counts = dict.fromkeys(("chart", "table", "infographic", "page_image", "image", "total"), 0)
+    # Subtypes that have a dedicated flag; any other subtype can only be saved as a raw image.
+    subtype_flags = (
+        ("chart", save_charts),
+        ("table", save_tables),
+        ("infographic", save_infographics),
+        ("page_image", save_page_images),
+    )
+    flagged_subtypes = [name for name, _ in subtype_flags]
+    os.makedirs(output_directory, exist_ok=True)
+    for doc_idx, document in enumerate(response_data):
+        try:
+            doc_metadata = document.get("metadata", {})
+            document_type = document.get("document_type", "unknown")
+            # Nothing to write when the document carries no content.
+            image_content = doc_metadata.get("content")
+            if not image_content:
+                continue
+            # Gather the pieces used for naming the output file.
+            source_info = doc_metadata.get("source_metadata", {})
+            source_id = source_info.get("source_id", f"document_{doc_idx}")
+            clean_source_name = get_valid_filename(os.path.basename(source_id))
+            content_info = doc_metadata.get("content_metadata", {})
+            subtype = content_info.get("subtype", "image")
+            page_number = content_info.get("page_number", 0)
+            # A flagged subtype is saved only if its own flag is set.
+            wanted = any(subtype == name and enabled for name, enabled in subtype_flags)
+            if not wanted:
+                is_raw_image = document_type == "image" and subtype not in flagged_subtypes
+                if is_raw_image and save_raw_images:
+                    wanted = True
+                    subtype = "image"  # Normalize subtype for consistent counting
+            if not wanted:
+                continue
+            # The format was validated above, so only the three cases below remain.
+            if requested_format == "auto":
+                # Detect once per image; the helper yields "png" or "jpg", and "jpg" is saved as JPEG.
+                detected_ext = _detect_extension_from_content(image_content)
+                target_format = "png" if detected_ext == "png" else "jpeg"
+            elif requested_format == "png":
+                target_format = "png"
+            else:  # "jpeg" or "jpg"
+                target_format = "jpeg"
+            file_ext = target_format
+            if organize_by_type:
+                # One subdirectory per subtype; the subtype is left out of the filename.
+                destination_dir = os.path.join(output_directory, subtype)
+                os.makedirs(destination_dir, exist_ok=True)
+                image_filename = f"{clean_source_name}_p{page_number}_{doc_idx}.{file_ext}"
+            else:
+                # Flat layout; the subtype is carried in the filename instead.
+                destination_dir = output_directory
+                image_filename = f"{clean_source_name}_{subtype}_p{page_number}_{doc_idx}.{file_ext}"
+            image_path = os.path.join(destination_dir, image_filename)
+            # A failed write is logged and does not stop the remaining documents.
+            try:
+                if save_image_to_disk(image_content, image_path, target_format):
+                    image_counts[subtype] += 1
+                    image_counts["total"] += 1
+                    logger.debug(f"Saved {subtype} image: {image_path}")
+                else:
+                    logger.error(f"Failed to save {subtype} image for {clean_source_name}")
+            except Exception as e:
+                logger.error(f"Failed to save {subtype} image for {clean_source_name}: {e}")
+        except Exception as e:
+            logger.error(f"Failed to process document {doc_idx}: {e}")
+            continue
+    # Summarize what was written.
+    if image_counts["total"] <= 0:
+        logger.info("No images were saved (none met filter criteria)")
+        return image_counts
+    logger.info(f"Successfully saved {image_counts['total']} images to {output_directory}")
+    for img_type, count in image_counts.items():
+        if img_type == "total" or count <= 0:
+            continue
+        logger.info(f"  - {img_type}: {count}")
+    return image_counts
+def save_images_from_response(response: Dict[str, Any], output_directory: str, **kwargs) -> Dict[str, int]:
+    """
+    Save the images contained in a full API response.
+    Thin wrapper that forwards ``response["data"]`` to save_images_to_disk().
+    Parameters
+    ----------
+    response : Dict[str, Any]
+        Full API response; its "data" entry holds the document results.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Forwarded unchanged to save_images_to_disk(), e.g. output_format and the
+        ``save_*`` filtering options.
+    Returns
+    -------
+    Dict[str, int]
+        The counts returned by save_images_to_disk(). An empty dict is returned, after
+        logging a warning, when "data" is missing or empty.
+    """
+    has_documents = "data" in response and response["data"]
+    if has_documents:
+        return save_images_to_disk(response["data"], output_directory, **kwargs)
+    logger.warning("No data found in response")
+    return {}
+def save_images_from_ingestor_results(
+    results: List[List[Dict[str, Any]]], output_directory: str, **kwargs
+) -> Dict[str, int]:
+    """
+    Save images from Ingestor.ingest() results.
+    The per-source result lists are flattened into one list of documents, which is then
+    passed to save_images_to_disk().
+    Parameters
+    ----------
+    results : List[List[Dict[str, Any]]]
+        Results from Ingestor.ingest(), where each inner list contains document results
+        for one source file. Entries may also be other sequence-like objects exposing
+        ``__iter__`` and ``__len__`` (e.g. LazyLoadedList when save_to_disk=True is used),
+        or a single document dict.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Forwarded unchanged to save_images_to_disk(), e.g. output_format and the
+        ``save_*`` filtering options.
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type, as returned by save_images_to_disk().
+    """
+    # Flatten results from multiple documents into single list
+    all_documents = []
+    for doc_results in results:
+        if isinstance(doc_results, list):
+            # Standard list of document results
+            all_documents.extend(doc_results)
+        elif isinstance(doc_results, dict):
+            # A bare document dict also exposes __iter__/__len__; it must be caught before
+            # the duck-typed branch, which would otherwise expand it into its keys.
+            all_documents.append(doc_results)
+        elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
+            # Handle LazyLoadedList or other sequence-like objects
+            try:
+                all_documents.extend(list(doc_results))
+            except Exception as e:
+                # Best effort: skip a source whose results cannot be materialized.
+                logger.warning(f"Failed to process document results: {e}")
+                continue
+        else:
+            # Any other single object is kept as one entry
+            all_documents.append(doc_results)
+    return save_images_to_disk(all_documents, output_directory, **kwargs)

{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/vdb/milvus.py RENAMED Viewed

@@ -42,6 +42,7 @@ from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
 from pymilvus.orm.types import CONSISTENCY_BOUNDED
 from scipy.sparse import csr_array
 logger = logging.getLogger(__name__)
 CONSISTENCY = CONSISTENCY_BOUNDED
@@ -881,7 +882,7 @@ def create_bm25_model(
     return bm25_ef
-def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
+def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
     """
     This function takes the input records and creates a corpus,
     factoring in filters (i.e. texts, charts, tables) and fits
@@ -899,12 +900,42 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
         Milvus Collection to search against
     """
     count = 0
-    for element in records:
-        client.insert(collection_name=collection_name, data=[element])
-        count += 1
+    for idx in range(0, len(records), batch_size):
+        client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
+        count += len(records[idx : idx + batch_size])
     logger.info(f"streamed {count} records")
+def wait_for_index(
+    collection_name: str,
+    num_elements: int,
+    client: MilvusClient,
+    poll_interval: float = 1.0,
+    max_stalled_polls: int = 10,
+):
+    """
+    Block until every index of the collection reports at least ``num_elements`` indexed rows.
+    For each index returned by ``utility.list_indexes`` the ``indexed_rows`` value from
+    ``client.describe_index`` is polled until it reaches ``num_elements``. This only works
+    for streaming inserts, bulk inserts are not supported by this function
+    (refer to MilvusClient.refresh_load for bulk inserts).
+    Parameters
+    ----------
+    collection_name : str
+        Name of the Milvus collection whose indexes are checked.
+    num_elements : int
+        Number of rows each index is expected to reach. No polling happens when this is 0
+        or negative.
+    client : MilvusClient
+        Client used to query the index state.
+    poll_interval : float, optional
+        Seconds to sleep between two polls of the same index. Default is 1.0.
+    max_stalled_polls : int, optional
+        Number of consecutive polls without any change in ``indexed_rows`` that is
+        tolerated before giving up. Default is 10.
+    Returns
+    -------
+    The ``indexed_rows`` value last observed for the final index in the list, or 0 when
+    the collection has no indexes.
+    Raises
+    ------
+    ValueError
+        If ``indexed_rows`` stays unchanged for ``max_stalled_polls`` consecutive polls
+        while still below ``num_elements``.
+    """
+    index_names = utility.list_indexes(collection_name)
+    indexed_rows = 0
+    for index_name in index_names:
+        indexed_rows = 0
+        stalled_polls = 0  # consecutive polls that showed no change in indexed_rows
+        while indexed_rows < num_elements:
+            new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+            # ">=" rather than "==": a count above num_elements also means the wait is over
+            # and must not be mistaken for a stalled index.
+            if new_indexed_rows >= num_elements:
+                indexed_rows = new_indexed_rows
+                break
+            if new_indexed_rows == indexed_rows:
+                # Decrementing the budget (instead of overwriting it) is what lets the
+                # stall check below actually fire.
+                stalled_polls += 1
+                if stalled_polls >= max_stalled_polls:
+                    raise ValueError("Rows are not getting indexed as expected")
+            else:
+                # Progress was made, so the stall budget starts over.
+                stalled_polls = 0
+            indexed_rows = new_indexed_rows
+            # Sleep only when another poll is needed.
+            time.sleep(poll_interval)
+    return indexed_rows
 def write_to_nvingest_collection(
     records,
     collection_name: str,
@@ -1026,6 +1057,9 @@ def write_to_nvingest_collection(
             client,
             collection_name,
         )
+        # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
+        # know how long this should take, it is num_elements dependent.
+        wait_for_index(collection_name, num_elements, client)
     else:
         minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
         if not minio_client.bucket_exists(bucket_name):

{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.9.8.dev20250908
+Version: 2025.9.9.dev20250909
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/SOURCES.txt RENAMED Viewed

@@ -42,6 +42,7 @@ src/nv_ingest_client/primitives/tasks/udf.py
 src/nv_ingest_client/primitives/tasks/vdb_upload.py
 src/nv_ingest_client/util/__init__.py
 src/nv_ingest_client/util/dataset.py
+src/nv_ingest_client/util/image_disk_utils.py
 src/nv_ingest_client/util/milvus.py
 src/nv_ingest_client/util/process_json_files.py
 src/nv_ingest_client/util/processing.py