PyPI - nv-ingest-api - Versions diffs - 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl - Mend

nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (13) hide show

nv_ingest_api/internal/enums/common.py CHANGED Viewed

@@ -52,6 +52,8 @@ class ContentDescriptionEnum(str, Enum):
         Description for image extracted from PDF document.
     PDF_INFOGRAPHIC : str
         Description for structured infographic extracted from PDF document.
+    PDF_PAGE_IMAGE : str
+        Description for a full-page image rendered from a PDF document.
     PDF_TABLE : str
         Description for structured table extracted from PDF document.
     PDF_TEXT : str
@@ -70,6 +72,7 @@ class ContentDescriptionEnum(str, Enum):
     PDF_CHART: str = "Structured chart extracted from PDF document."
     PDF_IMAGE: str = "Image extracted from PDF document."
     PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
+    PDF_PAGE_IMAGE: str = "Full-page image rendered from a PDF document."
     PDF_TABLE: str = "Structured table extracted from PDF document."
     PDF_TEXT: str = "Unstructured text from PDF document."
     PPTX_IMAGE: str = "Image extracted from PPTX presentation."
@@ -94,6 +97,8 @@ class ContentTypeEnum(str, Enum):
         Represents image content.
     INFO_MSG : str
         Represents an informational message.
+    PAGE_IMAGE : str
+        Represents a full-page image rendered from a document.
     STRUCTURED : str
         Represents structured content.
     TEXT : str
@@ -111,6 +116,7 @@ class ContentTypeEnum(str, Enum):
     INFOGRAPHIC: str = "infographic"
     INFO_MSG: str = "info_message"
     NONE: str = "none"
+    PAGE_IMAGE: str = "page_image"
     STRUCTURED: str = "structured"
     TABLE: str = "table"
     TEXT: str = "text"

nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py CHANGED Viewed

@@ -4,20 +4,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import base64
+import inspect
 import io
-import pandas as pd
-from typing import Any, Dict, List, Optional
 import logging
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
-from nv_ingest_api.internal.extract.pdf.engines import (
-    adobe_extractor,
-    llama_parse_extractor,
-    nemoretriever_parse_extractor,
-    pdfium_extractor,
-    tika_extractor,
-    unstructured_io_extractor,
-)
+import pandas as pd
+from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
+from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
+from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
+from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
+from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
+from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 # Import extraction functions for different engines.
@@ -43,6 +44,7 @@ def _work_extract_pdf(
     extract_infographics: bool,
     extract_tables: bool,
     extract_charts: bool,
+    extract_page_as_image: bool,
     extractor_config: dict,
     execution_trace_log=None,
 ) -> Any:
@@ -52,17 +54,25 @@ def _work_extract_pdf(
     extract_method = extractor_config["extract_method"]
     extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
-    return extractor_fn(
-        pdf_stream,
-        extract_text,
-        extract_images,
-        extract_infographics,
-        extract_tables,
-        extract_charts,
-        extractor_config,
-        execution_trace_log,
+    extractor_fn_args = dict(
+        pdf_stream=pdf_stream,
+        extract_text=extract_text,
+        extract_images=extract_images,
+        extract_infographics=extract_infographics,
+        extract_tables=extract_tables,
+        extract_charts=extract_charts,
+        extractor_config=extractor_config,
+        execution_trace_log=execution_trace_log,
     )
+    if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
+        extractor_fn_args["extract_page_as_image"] = extract_page_as_image
+    elif extract_page_as_image:
+        logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
+    return extractor_fn(**extractor_fn_args)
 @unified_exception_handler
 def _orchestrate_row_extraction(
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
         extract_tables = params.pop("extract_tables", False)
         extract_charts = params.pop("extract_charts", False)
         extract_infographics = params.pop("extract_infographics", False)
+        extract_page_as_image = params.pop("extract_page_as_image", False)
         extract_method = params.get("extract_method", "pdfium")
     except KeyError as e:
         raise ValueError(f"Missing required extraction flag: {e}")
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
         extract_text=extract_text,
         extract_images=extract_images,
         extract_infographics=extract_infographics,
+        extract_page_as_image=extract_page_as_image,
         extract_tables=extract_tables,
         extract_charts=extract_charts,
         extractor_config=extractor_config,

nv_ingest_api/internal/extract/pdf/engines/pdfium.py CHANGED Viewed

@@ -24,6 +24,7 @@ import numpy as np
 import pandas as pd
 import pypdfium2 as libpdfium
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
 from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
@@ -35,6 +36,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
 from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
 from nv_ingest_api.util.metadata.aggregators import (
+    construct_image_metadata_from_base64,
     construct_image_metadata_from_pdf_image,
     extract_pdf_metadata,
     construct_text_metadata,
@@ -47,6 +49,7 @@ from nv_ingest_api.util.pdf.pdfium import (
     extract_image_like_objects_from_pdfium_page,
 )
 from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
+from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
 from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
 logger = logging.getLogger(__name__)
@@ -385,6 +388,7 @@ def pdfium_extractor(
     extract_infographics: bool,
     extract_tables: bool,
     extract_charts: bool,
+    extract_page_as_image: bool,
     extractor_config: dict,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> pd.DataFrame:
@@ -525,6 +529,24 @@ def pdfium_extractor(
                 )
                 extracted_data.extend(image_data)
+            # Full page image extraction
+            if extract_page_as_image:
+                page_text = _extract_page_text(page)
+                image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
+                base64_image = numpy_to_base64(image[0])
+                if len(base64_image) > 2**24 - 1:
+                    base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
+                image_meta = construct_image_metadata_from_base64(
+                    base64_image,
+                    page_idx,
+                    page_count,
+                    source_metadata,
+                    base_unified_metadata,
+                    subtype=ContentTypeEnum.PAGE_IMAGE,
+                    text=page_text,
+                )
+                extracted_data.append(image_meta)
             # If we want tables or charts, rasterize the page and store it
             if extract_tables or extract_charts or extract_infographics:
                 image, padding_offsets = pdfium_pages_to_numpy(
@@ -575,6 +597,7 @@ def pdfium_extractor(
                 execution_trace_log=execution_trace_log,
             )
             futures.append(future)
             pages_for_tables.clear()
         # Wait for all asynchronous jobs to complete.

nv_ingest_api/internal/schemas/meta/ingest_job_schema.py CHANGED Viewed

@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
     model_name: Optional[str] = None
     api_key: Optional[str] = None
     filter_errors: bool = False
+    text_elements_modality: Optional[str] = None
+    image_elements_modality: Optional[str] = None
+    structured_elements_modality: Optional[str] = None
+    audio_elements_modality: Optional[str] = None
 class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
         validated_task_properties = expected_schema_cls(**task_properties)
         values["type"] = task_type  # ensure type is now always the enum
         values["task_properties"] = validated_task_properties
         return values
     @field_validator("type", mode="before")

nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py CHANGED Viewed

@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
     input_type: str = Field(default="passage")
     raise_on_failure: bool = Field(default=False)
     truncate: str = Field(default="END")
+    text_elements_modality: str = Field(default="text")
+    image_elements_modality: str = Field(default="text")
+    structured_elements_modality: str = Field(default="text")
+    audio_elements_modality: str = Field(default="text")
     model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/transform/embed_text.py CHANGED Viewed

@@ -4,6 +4,7 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
 logger = logging.getLogger(__name__)
+MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
 # ------------------------------------------------------------------------------
 # Asynchronous Embedding Requests
 # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> list:
     """
     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
             base_url=embedding_nim_endpoint,
         )
+        extra_body = {
+            "input_type": input_type,
+            "truncate": truncate,
+        }
+        if modalities:
+            extra_body["modality"] = modalities
         resp = client.embeddings.create(
             input=prompts,
             model=embedding_model,
             encoding_format=encoding_format,
-            extra_body={"input_type": input_type, "truncate": truncate},
+            extra_body=extra_body,
         )
         response["embedding"] = resp.data
@@ -110,6 +122,7 @@ def _async_request_handler(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> List[dict]:
     """
     Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@ def _async_request_handler(
     List[dict]
         A list of response dictionaries from the embedding service.
     """
+    if modalities is None:
+        modalities = [None] * len(prompts)
     with ThreadPoolExecutor() as executor:
         futures = [
             executor.submit(
@@ -150,8 +166,9 @@ def _async_request_handler(
                 input_type=input_type,
                 truncate=truncate,
                 filter_errors=filter_errors,
+                modalities=modality_batch,
             )
-            for prompt_batch in prompts
+            for prompt_batch, modality_batch in zip(prompts, modalities)
         ]
         results = [future.result() for future in futures]
@@ -167,6 +184,7 @@ def _async_runner(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> dict:
     """
     Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@ def _async_runner(
         input_type,
         truncate,
         filter_errors,
+        modalities=modalities,
     )
     flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
-def _get_pandas_text_content(row):
+def _format_image_input_string(image_b64: Optional[str]) -> str:
+    if not image_b64:
+        return
+    return f"data:image/png;base64,{image_b64}"
+def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+    if (not text) or (not text.strip()) or (not image_b64):
+        return
+    return f"{text.strip()} {_format_image_input_string(image_b64)}"
+def _get_pandas_text_content(row, modality="text"):
     """
     Extracts text content from a DataFrame row.
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
     return row["content"]
-def _get_pandas_table_content(row):
+def _get_pandas_table_content(row, modality="text"):
     """
     Extracts table/chart content from a DataFrame row.
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-    return row.get("table_metadata", {}).get("table_content")
+    if modality == "text":
+        content = row.get("table_metadata", {}).get("table_content")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        text = row.get("table_metadata", {}).get("table_content")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+    return content
-def _get_pandas_image_content(row):
+def _get_pandas_image_content(row, modality="text"):
     """
     Extracts image caption content from a DataFrame row.
@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-    return row.get("image_metadata", {}).get("caption")
+    subtype = row.get("content_metadata", {}).get("subtype")
+    if modality == "text":
+        if subtype == "page_image":
+            content = row.get("image_metadata", {}).get("text")
+        else:
+            content = row.get("image_metadata", {}).get("caption")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        if subtype == "page_image":
+            text = row.get("image_metadata", {}).get("text")
+        else:
+            text = row.get("image_metadata", {}).get("caption")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+    # A workaround to save memory.
+    row["content"] = ""
+    return content
-def _get_pandas_audio_content(row):
+def _get_pandas_audio_content(row, modality="text"):
     """
     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
     """
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
 # ------------------------------------------------------------------------------
+def does_model_support_multimodal_embeddings(model: str) -> bool:
+    """
+    Checks if a given model supports multi-modal embeddings.
+    Parameters
+    ----------
+    model : str
+        The name of the model.
+    Returns
+    -------
+    bool
+        True if the model supports multi-modal embeddings, False otherwise.
+    """
+    return model in MULTI_MODAL_MODELS
 def transform_create_text_embeddings_internal(
     df_transform_ledger: pd.DataFrame,
     task_config: Dict[str, Any],
@@ -460,6 +535,15 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
+    task_type_to_modality = {
+        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+        ContentTypeEnum.STRUCTURED: (
+            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+        ),
+        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
+    }
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
@@ -480,7 +564,7 @@ def transform_create_text_embeddings_internal(
         # Extract content and normalize empty or non-str to None
         extracted_content = (
             df_content["metadata"]
-            .apply(content_getter)
+            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
             .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
         df_content["_content"] = extracted_content
@@ -488,9 +572,15 @@ def transform_create_text_embeddings_internal(
         # Prepare batches for only valid (non-None) content
         valid_content_mask = df_content["_content"].notna()
         if valid_content_mask.any():
-            filtered_content_batches = _generate_batches(
-                df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
-            )
+            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+            if model_name in MULTI_MODAL_MODELS:
+                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+            else:
+                modality_batches = None
             content_embeddings = _async_runner(
                 filtered_content_batches,
                 api_key,
@@ -500,6 +590,7 @@ def transform_create_text_embeddings_internal(
                 transform_config.input_type,
                 transform_config.truncate,
                 False,
+                modalities=modality_batches,
             )
             # Build a simple row index -> embedding map
             embeddings_dict = dict(

nv_ingest_api/util/image_processing/transforms.py CHANGED Viewed

@@ -20,6 +20,9 @@ cv2.setNumThreads(1)
 DEFAULT_MAX_WIDTH = 1024
 DEFAULT_MAX_HEIGHT = 1280
+# Workaround for PIL.Image.DecompressionBombError
+Image.MAX_IMAGE_PIXELS = None
 logger = logging.getLogger(__name__)

nv_ingest_api/util/metadata/aggregators.py CHANGED Viewed

@@ -201,6 +201,8 @@ def construct_image_metadata_from_base64(
     page_count: int,
     source_metadata: Dict[str, Any],
     base_unified_metadata: Dict[str, Any],
+    subtype: None | ContentTypeEnum | str = "",
+    text: str = "",
 ) -> List[Any]:
     """
     Extracts image data from a base64-encoded image string, decodes the image to get
@@ -252,6 +254,7 @@ def construct_image_metadata_from_base64(
             "line": -1,
             "span": -1,
         },
+        "subtype": subtype or "",
     }
     # Construct image metadata
@@ -259,7 +262,7 @@ def construct_image_metadata_from_base64(
         "image_type": DocumentTypeEnum.PNG,
         "structured_image_type": ContentTypeEnum.UNKNOWN,
         "caption": "",
-        "text": "",
+        "text": text,
         "image_location": bbox,
         "image_location_max_dimensions": (width, height),
         "height": height,

{nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.7.15.dev20250715
+Version: 2025.7.16.dev20250716
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/RECORD RENAMED Viewed

@@ -7,7 +7,7 @@ nv_ingest_api/interface/transform.py,sha256=g6YnFR7TpEU0xNtzCvv6kqnFbuCwQ6vRMjjB
 nv_ingest_api/interface/utility.py,sha256=AL4l0cJNvTjG1MAe1YNTk1jbbPED3g4HCewzx6Ffcio,7296
 nv_ingest_api/internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/internal/enums/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest_api/internal/enums/common.py,sha256=HSj7qqNr6KXu_FIyK_Wvel24R-r8lV7dLA173z5XFBc,12321
+nv_ingest_api/internal/enums/common.py,sha256=lzDJ35VWfIwlL_Lx_q0dfHUuwEB7CXudHIQAilpjoRw,12611
 nv_ingest_api/internal/extract/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
 nv_ingest_api/internal/extract/audio/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=_jf_UC_FTqZr-xEpwG8edwBzdDjM01gGhqm9ulOsDcY,6973
@@ -32,10 +32,10 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
 nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
 nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
 nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=XNYz4S2tMFBv0KFzXNERrVs-1raxJ_iIIXpBGlJFcD0,22987
-nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=vtdBue1EEQJsHcBuX3NdPutbLfyKPIzily6JOK6yV0w,22421
+nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=8hUJUdpx6FhOBgabFmGhJiAQdl12kR8YoSbUfN-geOk,23506
 nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
 nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
-nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ2CZs167juvEZ-uV6qXWQjR08hhIu8otk2MWj4,4931
+nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=4bvN6LsPksLicI6jM0JqbJFiOZNHEcuc8MVVW4XfgV8,5875
 nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
 nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
 nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -82,7 +82,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
 nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
 nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
-nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=szDvgc2A_JetD2Jyewyl4ac4lwpy3NiLxD9dOYz42sM,8116
+nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=ceYQjRjhBSDbbZ6q-Db7Y6GHVOvWPdGAMb3TX1vMWfY,8321
 nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=VnAzkSFat_ckI19mlwQTlFrvP6EZVCwyNl9bt51b8oU,7193
 nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
@@ -92,14 +92,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
 nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=OtM1iPw26uioC3mghbOJQurKGg641uQfhASH462VqOY,578
 nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
-nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=ongmHkJA2953f9_RI7ZYzf5BUnFzVL6Al5E8WKyfgw4,885
+nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=RZCISA8CUqKiY8eJuk4uWxzo4PZ-fuYdzMO7_LYFkoM,1117
 nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
 nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
 nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
 nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
-nv_ingest_api/internal/transform/embed_text.py,sha256=A8JMotTkC8KQ0pmz4AIJhaKebza6JzhQ0aEnHX2oHY8,16539
+nv_ingest_api/internal/transform/embed_text.py,sha256=kvVGlNH1S91UENXWLD31uh3KzlfJYOlYitpIFMsyowU,20033
 nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
 nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +123,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
 nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
 nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
 nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
-nv_ingest_api/util/image_processing/transforms.py,sha256=CJVGQgUvHk_mzihR8ZZrvwJUBgUYcgFAKzXyRTmKdCE,23371
+nv_ingest_api/util/image_processing/transforms.py,sha256=3-xeUerc2AaXJTYuR23EjwdtjRQ8F85pS5D9zxR4cLA,23452
 nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/util/imports/callable_signatures.py,sha256=e2bJB1pmkN4Ee-Bf-VggOSBaQ4RXofWF5eKkWXgIj2U,1855
 nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=7GByV_-8z2X0tnVoabCxVioxOP3sYMros3ZllVAW-wY,4343
@@ -135,7 +135,7 @@ nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=h9Q4q_
 nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
 nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=fh7Q0wO5H_FtrHV1VdT6V66aZNqglOh_2XdkfLt8hgg,15722
 nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
-nv_ingest_api/util/metadata/aggregators.py,sha256=Y5JSKuLhhk_ldpzT3eRIcVg7QM7cTNhfQZn4g5bcbq4,15884
+nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
 nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
 nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
 nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
@@ -153,8 +153,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
 nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
-nv_ingest_api-2025.7.15.dev20250715.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest_api-2025.7.15.dev20250715.dist-info/METADATA,sha256=OWZyeCR9DZ23SdT0RcMdodCkxR508CZZaVczdM3qXPE,13947
-nv_ingest_api-2025.7.15.dev20250715.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_api-2025.7.15.dev20250715.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
-nv_ingest_api-2025.7.15.dev20250715.dist-info/RECORD,,
+nv_ingest_api-2025.7.16.dev20250716.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.7.16.dev20250716.dist-info/METADATA,sha256=RaPAkQ4Dtkkrn6hi9Va1t2XDpDgRbe-bFqmCVL3IlEA,13947
+nv_ingest_api-2025.7.16.dev20250716.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_api-2025.7.16.dev20250716.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+nv_ingest_api-2025.7.16.dev20250716.dist-info/RECORD,,

{nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/WHEEL RENAMED Viewed

File without changes

{nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/top_level.txt RENAMED Viewed

File without changes

nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl