nv-ingest-api 2025.10.8.dev20251008-py3-none-any.whl → 2025.10.9.dev20251009-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.

nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -332,6 +332,7 @@ def _extract_page_elements(

     # Process each extracted element based on extraction flags
     for page_idx, page_element in page_element_results:
+        page_reading_index = page_idx + 1
         # Skip elements that shouldn't be extracted based on flags
         if (not extract_tables) and (page_element.type_string == "table"):
             continue
@@ -347,7 +348,7 @@ def _extract_page_elements(
         # Construct metadata for the page element
         page_element_meta = construct_page_element_metadata(
             page_element,
-            page_idx,
+            page_reading_index,
             page_count,
             source_metadata,
             base_unified_metadata,
@@ -473,6 +474,7 @@ def pdfium_extractor(
     for page_idx in range(page_count):
         page = doc.get_page(page_idx)
         page_width, page_height = page.get_size()
+        page_reading_index = page_idx + 1

         # Text extraction
         if extract_text:
@@ -481,7 +483,7 @@ def pdfium_extractor(
             text_meta = construct_text_metadata(
                 [page_text],
                 pdf_metadata.keywords,
-                page_idx,
+                page_reading_index,
                 -1,
                 -1,
                 -1,
@@ -499,7 +501,7 @@ def pdfium_extractor(
             image_data = _extract_page_images(
                 extract_images_method,
                 page,
-                page_idx,
+                page_reading_index,
                 page_width,
                 page_height,
                 page_count,
@@ -518,7 +520,7 @@ def pdfium_extractor(
                 base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
                 image_meta = construct_image_metadata_from_base64(
                     base64_image,
-                    page_idx,
+                    page_reading_index,
                     page_count,
                     source_metadata,
                     base_unified_metadata,
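Net effect of the pdfium.py hunks: page numbers recorded in element, text, and image metadata shift from pdfium's 0-based page_idx to a 1-based reading index. A minimal sketch of the convention (page_metadata here is an illustrative stand-in, not the package's constructor):

# Sketch: pdfium iterates pages 0-based; metadata now records a 1-based
# "reading index" so page numbers match what a human sees in a viewer.
def page_metadata(page_reading_index: int, page_count: int) -> dict:
    return {"page_number": page_reading_index, "page_count": page_count}

for page_idx in range(3):                 # engine order: 0, 1, 2
    page_reading_index = page_idx + 1     # reading order: 1, 2, 3
    print(page_metadata(page_reading_index, 3))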
nv_ingest_api/internal/primitives/nim/nim_client.py

@@ -326,16 +326,52 @@ class NimClient:

         outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]

-        response = self.client.infer(
-            model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
-        )
+        base_delay = 0.5
+        attempt = 0
+        retries_429 = 0
+        max_grpc_retries = self.max_429_retries

-        logger.debug(f"gRPC inference response: {response}")
+        while attempt < self.max_retries:
+            try:
+                response = self.client.infer(
+                    model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
+                )

-        if len(outputs) == 1:
-            return response.as_numpy(outputs[0].name())
-        else:
-            return [response.as_numpy(output.name()) for output in outputs]
+                logger.debug(f"gRPC inference response: {response}")
+
+                if len(outputs) == 1:
+                    return response.as_numpy(outputs[0].name())
+                else:
+                    return [response.as_numpy(output.name()) for output in outputs]
+
+            except grpcclient.InferenceServerException as e:
+                status = e.status()
+                if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
+                    retries_429 += 1
+                    logger.warning(
+                        f"Received gRPC {status} for model '{model_name}'. "
+                        f"Attempt {retries_429} of {max_grpc_retries}."
+                    )
+                    if retries_429 >= max_grpc_retries:
+                        logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
+                        raise
+
+                    backoff_time = base_delay * (2**retries_429)
+                    time.sleep(backoff_time)
+                    continue
+
+                else:
+                    # For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
+                    # retrying will not help. We should fail fast.
+                    logger.error(
+                        f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
+                    )
+                    raise
+
+            except Exception as e:
+                # Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
+                logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
+                raise

     def _http_infer(self, formatted_input: dict) -> dict:
         """
nv_ingest_api-2025.10.9.dev20251009.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.8.dev20251008
+Version: 2025.10.9.dev20251009
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
nv_ingest_api-2025.10.9.dev20251009.dist-info/RECORD

@@ -32,7 +32,7 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
 nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
 nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=MwzM-n2tu0FHM0wDe_0mONLlzHrPte7EOTuPtzCh7Zs,8384
 nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=IVbNcH_phMiRSxnkZ04pGfQrPJ-x1zVR3hXyhxv7juc,22977
-nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=CCfxcHAS3mED8zD6GKTGNUi02CzBMs7FsSopevhsiyk,22720
+nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=yAndWwh_k00nP0spYGxlewP3RBPxE4QR-b3U3VgXnBo,22852
 nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
 nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
 nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=uTPTUTWQsGM1oeTUo49_hzwC5Yy9iEokrnS3z3WvtIo,5988
@@ -50,7 +50,7 @@ nv_ingest_api/internal/primitives/control_message_task.py,sha256=nWVB3QsP6p8BKwH
 nv_ingest_api/internal/primitives/ingest_control_message.py,sha256=8rA0UbPDSB3avReAKNxiUa_FCy7fIQpqk6tfmcYUibA,9879
 nv_ingest_api/internal/primitives/nim/__init__.py,sha256=-dFBTHQnMKV0yc5tfSqIT-rkJXKtpcmyUfTPs8TJAi8,339
 nv_ingest_api/internal/primitives/nim/default_values.py,sha256=W92XjfyeC6uuVxut6J7p00x1kpNsnXIDb97gSVytZJk,380
-nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kQAHWwZ6kjTVYZSfa0qRyIOFcqrhMe8LUygGtgzAly0,26321
+nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=AwOyED1Kt6F-pxJPi4kpb15ioeWHV5z5zTtJ9GliyYQ,28007
 nv_ingest_api/internal/primitives/nim/nim_model_interface.py,sha256=gWhyR33mIgEOYirq53WOk1bRl1SL0C_SVrM4w1-JmKU,4166
 nv_ingest_api/internal/primitives/nim/model_interface/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1ExW5V6pXC1ZiHdobeG_BmbPr3rBbVJef13s,11003
@@ -164,10 +164,10 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
 nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
-nv_ingest_api-2025.10.8.dev20251008.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.10.9.dev20251009.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
-udfs/llm_summarizer_udf.py,sha256=sIMfcH4GRyciTKUtq4dmhd6fZmAp07X32irIC4k7nEI,7316
-nv_ingest_api-2025.10.8.dev20251008.dist-info/METADATA,sha256=lHvP6DR5gEfSPzyevDfnSrkPZl-5TB9S35V3GzJY7L4,14085
-nv_ingest_api-2025.10.8.dev20251008.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_api-2025.10.8.dev20251008.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
-nv_ingest_api-2025.10.8.dev20251008.dist-info/RECORD,,
+udfs/llm_summarizer_udf.py,sha256=t_ZFoz0e03uECYcRw4IabRj0GBlwAoJkJn13NL2wbsI,7217
+nv_ingest_api-2025.10.9.dev20251009.dist-info/METADATA,sha256=KB8EkNNQMTlk9Q7aDa09O4Q6DBQBCbBxJl0vtRoVbJY,14085
+nv_ingest_api-2025.10.9.dev20251009.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_api-2025.10.9.dev20251009.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
+nv_ingest_api-2025.10.9.dev20251009.dist-info/RECORD,,
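RECORD entries pair each installed file with an unpadded URL-safe base64 SHA-256 digest (per the wheel spec, PEP 427/PEP 376), so the changed hashes above can be re-verified against an unpacked wheel. A small sketch:

import base64
import hashlib

def record_hash(path: str) -> str:
    # RECORD stores sha256=<unpadded URL-safe base64 of the SHA-256 digest>
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For the new wheel's nim_client.py this should return
# "sha256=AwOyED1Kt6F-pxJPi4kpb15ioeWHV5z5zTtJ9GliyYQ", matching the RECORD entry above.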
udfs/llm_summarizer_udf.py

@@ -2,22 +2,40 @@
 """
 LLM Content Summarizer UDF for NV-Ingest Pipeline

-This UDF uses an LLM API to generate concise summaries
-of text content chunks, adding AI-generated summaries to the metadata for
-enhanced downstream processing and search capabilities.
+This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
+for enhanced downstream processing and search capabilities.

-Environment Variables:
+These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
 - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
 - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
-- LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
-- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
-- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
-- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
+- LLM_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
+- TIMEOUT: API timeout in seconds (default: 60)
+- MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
+- MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
+TODO: Implement this
+- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
 """

-import os
 import logging
-from typing import Optional
+import os
+import time
+
+# REMOVE BEFORE MERGING
+# import yaml
+# from pathlib import Path
+
+
+logger = logging.getLogger(__name__)
+
+PROMPT = """
+Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
+and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
+This summary will be used for document search and understanding.
+
+[CONTENT]
+{content}
+[END CONTENT]
+"""


 def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
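Note that the rewritten docstring lists shortened variable names (LLM_BASE_URL, TIMEOUT, MIN_CONTENT_LENGTH, MAX_CONTENT_LENGTH), while the function bodies in the hunks below still read the LLM_SUMMARIZATION_*/LLM_* names. A hedged configuration sketch using the names the code actually reads (values illustrative):

import os

# Values are illustrative; NVIDIA_API_KEY is the only required setting.
os.environ["NVIDIA_API_KEY"] = "nvapi-..."
os.environ["LLM_SUMMARIZATION_MODEL"] = "nvidia/llama-3.1-nemotron-70b-instruct"
os.environ["LLM_SUMMARIZATION_BASE_URL"] = "https://integrate.api.nvidia.com/v1"
os.environ["LLM_SUMMARIZATION_TIMEOUT"] = "60"
os.environ["LLM_MIN_CONTENT_LENGTH"] = "50"
os.environ["LLM_MAX_CONTENT_LENGTH"] = "12000"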
@@ -27,13 +45,6 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
     This function processes text primitives and generates concise summaries using
     an LLM API, storing the results in the metadata's custom_content field.

-    Features:
-    - Flexible content detection across multiple metadata locations
-    - Robust error handling with graceful fallbacks
-    - Comprehensive logging for monitoring and debugging
-    - Configurable content length thresholds
-    - Safe metadata manipulation preserving existing data
-
     Parameters
     ----------
     control_message : IngestControlMessage
@@ -44,167 +55,150 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
44
55
  IngestControlMessage
45
56
  The modified control message with LLM summaries added to metadata
46
57
  """
47
- from openai import OpenAI
48
-
49
- logger = logging.getLogger(__name__)
50
58
  logger.info("UDF: Starting LLM content summarization")
51
59
 
52
- # Get configuration from environment
53
- api_key = os.getenv("NVIDIA_API_KEY", "")
60
+ api_key = os.getenv("NVIDIA_API_KEY")
54
61
  model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
55
62
  base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
56
- timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
57
- min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
58
- max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))
63
+ min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
64
+ max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
65
+ timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))
66
+
67
+ stats = {
68
+ "skipped": False,
69
+ "failed": False,
70
+ "tokens": 0,
71
+ "duration": 0.0,
72
+ }
59
73
 
60
74
  if not api_key:
61
- logger.warning("NVIDIA_API_KEY not found, skipping summarization")
75
+ logger.error("NVIDIA_API_KEY not set. Skipping...")
62
76
  return control_message
63
77
 
64
- # Get the DataFrame payload
65
78
  df = control_message.payload()
66
- if df is None or len(df) == 0:
67
- logger.warning("No payload found in control message")
68
- return control_message
69
-
70
- logger.info(f"Processing {len(df)} rows for LLM summarization")
71
79
 
72
- # Initialize OpenAI client with error handling
73
- try:
74
- client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
75
- except Exception as e:
76
- logger.error(f"Failed to initialize OpenAI client: {e}")
80
+ if df is None or df.empty:
81
+ logger.warning("No payload found. Nothing to summarize.")
77
82
  return control_message
78
83
 
79
- # Stats for reporting
80
- stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}
81
-
82
- # Process each row
83
- for idx, row in df.iterrows():
84
- stats["processed"] += 1
85
-
86
- try:
87
- # Extract content - be more flexible about where it comes from
88
- content = _extract_content(row, logger)
89
-
90
- if not content:
91
- stats["skipped"] += 1
92
- continue
93
-
94
- content = content.strip()
95
- if len(content) < min_content_length:
96
- stats["skipped"] += 1
97
- continue
98
-
99
- # Truncate if needed
100
- if len(content) > max_content_length:
101
- content = content[:max_content_length]
102
-
103
- # Generate summary
104
- summary = _generate_summary(client, content, model_name, logger)
105
-
106
- if summary:
107
- # Add to metadata
108
- _add_summary(df, idx, row, summary, model_name, logger)
109
- stats["summarized"] += 1
110
- else:
111
- stats["failed"] += 1
112
-
113
- except Exception as e:
114
- stats["failed"] += 1
115
- logger.error(f"Row {idx}: Error processing content: {e}")
116
-
117
- # Update the control message with modified DataFrame
118
- control_message.payload(df)
119
-
120
- logger.info(
121
- f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
122
- f"{stats['skipped']} skipped, {stats['failed']} failed"
84
+ # Select first and last chunk for summarization
85
+ # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
86
+ # the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
87
+ # pages, it must require parsing the payload to see which chunks correspond to which pages
88
+ if len(df) > 1:
89
+ # TODO: add feature to select N first and last chunks
90
+ df = df.iloc[[0, -1]]
91
+ else:
92
+ logger.info("Document has only one chunk")
93
+
94
+ # Combine all content into a single string
95
+ content_list = df.apply(
96
+ _extract_content,
97
+ axis=1,
98
+ min_content_length=min_content_length,
99
+ max_content_length=max_content_length,
100
+ stats=stats,
123
101
  )
102
+ content = " ".join(content_list)
124
103
 
125
- return control_message
104
+ # Nicely ask LLM to summarize content
105
+ summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
126
106
 
107
+ stats["failed"] = summary is None
108
+ if not stats["failed"]:
109
+ stats["tokens"] = _estimate_tokens(content)
110
+ logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
111
+ _store_summary(df, summary, model_name)
127
112
 
128
- def _extract_content(row, logger) -> Optional[str]:
129
- """Extract text content from row, trying multiple locations."""
130
- content = ""
113
+ # Update the control message with modified DataFrame
114
+ control_message.payload(df)
115
+ else:
116
+ logger.warning("%s failed to summarize content", model_name)
131
117
 
132
- # Try different locations for content
133
- if isinstance(row.get("metadata"), dict):
134
- metadata = row["metadata"]
118
+ return control_message
135
119
 
136
- # Primary location: metadata.content
137
- content = metadata.get("content", "")
138
120
 
139
- # If no content, try other locations
140
- if not content:
141
- # Try in text_metadata
142
- text_metadata = metadata.get("text_metadata", {})
143
- content = text_metadata.get("text", "") or text_metadata.get("content", "")
121
+ def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
122
+ """Extract text content from row"""
123
+ metadata = row.get("metadata")
144
124
 
145
- # Try top-level content field
146
- if not content:
147
- content = row.get("content", "")
125
+ if isinstance(metadata, dict):
126
+ content = metadata.get("content")
127
+ if content is not None:
128
+ content = content.strip()
129
+ if len(content) < min_content_length:
130
+ stats["skipped"] = True
131
+ logger.warning(f"Content less than min={min_content_length}. Skipping...")
132
+ content = ""
133
+ elif len(content) > max_content_length:
134
+ logger.warning(f"Truncating content to {max_content_length} characters")
135
+ content = content[:max_content_length]
136
+ else:
137
+ stats["skipped"] = True
138
+ content = ""
148
139
 
149
- if not content:
150
- return None
140
+ else:
141
+ stats["skipped"] = True
142
+ logger.warning("No metadata found. Skipping...")
143
+ content = ""
151
144
 
152
145
  return content
153
146
 
154
147
 
155
- def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
156
- """Generate summary with robust error handling."""
157
- prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
158
-
159
- {content}
160
-
161
- Focus on the main purpose, key topics, and important details.
162
- This summary will be used for document search and understanding.
163
-
164
- Summary:"""
148
+ def _generate_llm_summary(
149
+ content: str,
150
+ model_name: str,
151
+ base_url: str,
152
+ api_key: str,
153
+ timeout: int,
154
+ ) -> tuple[str | None, float]:
155
+ """Ask an LLM to summarize content extracted from doc."""
165
156
 
157
+ start_time = time.time()
166
158
  try:
159
+ from openai import OpenAI
160
+
161
+ client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
162
+ start_time = time.time()
167
163
  completion = client.chat.completions.create(
168
164
  model=model_name,
169
- messages=[{"role": "user", "content": prompt}],
165
+ messages=[{"role": "user", "content": PROMPT.format(content=content)}],
170
166
  max_tokens=400, # Increased for more comprehensive summaries
171
167
  temperature=0.7,
172
168
  )
169
+ duration = time.time() - start_time
173
170
 
174
- if completion.choices and len(completion.choices) > 0:
171
+ if completion.choices:
175
172
  summary = completion.choices[0].message.content.strip()
176
- return summary
177
- else:
178
- return None
173
+ return summary, duration
174
+ return None, duration
179
175
 
180
176
  except Exception as e:
181
177
  logger.error(f"API call failed: {e}")
182
- return None
178
+ # TODO: GitHub Thread
179
+ # Reviewers, tell me if this is a bad idea.
180
+ # I think the convention is to return timestamp for time even if it fails
181
+ return None, time.time() - start_time
183
182
 
184
183
 
185
- def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
186
- """Add summary to metadata with safe handling."""
187
- try:
188
- # Get current metadata or create new dict - handle None case properly
189
- existing_metadata = row.get("metadata")
190
- if existing_metadata is not None and isinstance(existing_metadata, dict):
191
- metadata = dict(existing_metadata) # Create a copy
192
- else:
193
- metadata = {}
184
+ def _store_summary(df, summary: str, model_name: str):
185
+ """Add summary to metadata and store in df"""
186
+ # hardcoded heuristic to store everything on chunk 0's metadata
187
+ row_0 = df.iloc[0]
194
188
 
195
- # Ensure custom_content exists
196
- if "custom_content" not in metadata or metadata["custom_content"] is None:
197
- metadata["custom_content"] = {}
189
+ # this is a reference to a dictionary that is stored in the dataframe
190
+ # and is modified in place
191
+ metadata = row_0.get("metadata")
198
192
 
199
- # Add LLM summary
200
- metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
193
+ if metadata.get("custom_content") is None:
194
+ metadata["custom_content"] = {}
195
+ metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
201
196
 
202
- # Update the DataFrame at the specific index
203
- try:
204
- df.at[idx, "metadata"] = metadata
205
- except Exception:
206
- # Alternative approach: update the original row reference
207
- df.iloc[idx]["metadata"] = metadata
208
197
 
209
- except Exception as e:
210
- logger.error(f"Failed to add summary to row {idx}: {e}")
198
+ def _estimate_tokens(text: str) -> int:
199
+ """Rough estimate (~4 characters per token)"""
200
+ return len(text) // 4
201
+
202
+
203
+ def _safe_model_name(name: str) -> str:
204
+ return name.replace("/", "__").replace("-", "_")
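The two new helpers at the bottom of the file are pure functions, so their behavior is easy to check directly (note that _safe_model_name is not referenced in any hunk shown):

# _estimate_tokens: integer division by 4 characters per token
_estimate_tokens("hello world")   # 11 // 4 -> 2

# _safe_model_name: "/" -> "__", then "-" -> "_"
_safe_model_name("nvidia/llama-3.1-nemotron-70b-instruct")
# -> "nvidia__llama_3.1_nemotron_70b_instruct"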