PyPI - nv-ingest-api - Versions diffs - 2025.10.7.dev20251007__tar.gz → 2025.10.9.dev20251009__tar.gz - Mend

nv-ingest-api 2025.10.7.dev20251007 (tar.gz) → 2025.10.9.dev20251009 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (181)

{nv_ingest_api-2025.10.7.dev20251007/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.9.dev20251009}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.7.dev20251007
+Version: 2025.10.9.dev20251009
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py RENAMED Viewed

@@ -332,6 +332,7 @@ def _extract_page_elements(
         # Process each extracted element based on extraction flags
         for page_idx, page_element in page_element_results:
+            page_reading_index = page_idx + 1
             # Skip elements that shouldn't be extracted based on flags
             if (not extract_tables) and (page_element.type_string == "table"):
                 continue
@@ -347,7 +348,7 @@ def _extract_page_elements(
             # Construct metadata for the page element
             page_element_meta = construct_page_element_metadata(
                 page_element,
-                page_idx,
+                page_reading_index,
                 page_count,
                 source_metadata,
                 base_unified_metadata,
@@ -473,6 +474,7 @@ def pdfium_extractor(
         for page_idx in range(page_count):
             page = doc.get_page(page_idx)
             page_width, page_height = page.get_size()
+            page_reading_index = page_idx + 1
             # Text extraction
             if extract_text:
@@ -481,7 +483,7 @@ def pdfium_extractor(
                     text_meta = construct_text_metadata(
                         [page_text],
                         pdf_metadata.keywords,
-                        page_idx,
+                        page_reading_index,
                         -1,
                         -1,
                         -1,
@@ -499,7 +501,7 @@ def pdfium_extractor(
                 image_data = _extract_page_images(
                     extract_images_method,
                     page,
-                    page_idx,
+                    page_reading_index,
                     page_width,
                     page_height,
                     page_count,
@@ -518,7 +520,7 @@ def pdfium_extractor(
                     base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
                 image_meta = construct_image_metadata_from_base64(
                     base64_image,
-                    page_idx,
+                    page_reading_index,
                     page_count,
                     source_metadata,
                     base_unified_metadata,

{nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/nim_client.py RENAMED Viewed

@@ -326,16 +326,52 @@ class NimClient:
         outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
-        response = self.client.infer(
-            model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
-        )
+        base_delay = 0.5
+        attempt = 0
+        retries_429 = 0
+        max_grpc_retries = self.max_429_retries
-        logger.debug(f"gRPC inference response: {response}")
+        while attempt < self.max_retries:
+            try:
+                response = self.client.infer(
+                    model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
+                )
-        if len(outputs) == 1:
-            return response.as_numpy(outputs[0].name())
-        else:
-            return [response.as_numpy(output.name()) for output in outputs]
+                logger.debug(f"gRPC inference response: {response}")
+                if len(outputs) == 1:
+                    return response.as_numpy(outputs[0].name())
+                else:
+                    return [response.as_numpy(output.name()) for output in outputs]
+            except grpcclient.InferenceServerException as e:
+                status = e.status()
+                if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
+                    retries_429 += 1
+                    logger.warning(
+                        f"Received gRPC {status} for model '{model_name}'. "
+                        f"Attempt {retries_429} of {max_grpc_retries}."
+                    )
+                    if retries_429 >= max_grpc_retries:
+                        logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
+                        raise
+                    backoff_time = base_delay * (2**retries_429)
+                    time.sleep(backoff_time)
+                    continue
+                else:
+                    # For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
+                    # retrying will not help. We should fail fast.
+                    logger.error(
+                        f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
+                    )
+                    raise
+            except Exception as e:
+                # Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
+                logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
+                raise
     def _http_infer(self, formatted_input: dict) -> dict:
         """

{nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/rest/rest_client.py RENAMED Viewed

@@ -308,7 +308,18 @@ class RestClient(MessageBrokerClientBase):
         retries: int = 0
         url: str = f"{self._base_url}{self._fetch_endpoint}/{job_id}"
-        req_timeout: Tuple[float, Optional[float]] = self._timeout
+        # Derive per-call timeout if provided; otherwise use default
+        if timeout is None:
+            req_timeout: Tuple[float, Optional[float]] = self._timeout
+        else:
+            if isinstance(timeout, tuple):
+                # Expect (connect, read)
+                connect_t = float(timeout[0])
+                read_t = None if (len(timeout) < 2 or timeout[1] is None) else float(timeout[1])
+                req_timeout = (connect_t, read_t)
+            else:
+                # Single float means override read timeout, keep a small connect timeout
+                req_timeout = (min(self._default_connect_timeout, 5.0), float(timeout))
         while True:
             result: Optional[Any] = None

{nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009/src/nv_ingest_api.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.7.dev20251007
+Version: 2025.10.9.dev20251009
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

nv_ingest_api-2025.10.9.dev20251009/src/udfs/llm_summarizer_udf.py ADDED Viewed

@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+LLM Content Summarizer UDF for NV-Ingest Pipeline
+This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
+for enhanced downstream processing and search capabilities.
+These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
+- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
+- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
+- LLM_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
+- TIMEOUT: API timeout in seconds (default: 60)
+- MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
+- MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
+TODO: Implement this
+- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
+"""
+import logging
+import os
+import time
+# REMOVE BEFORE MERGING
+# import yaml
+# from pathlib import Path
+logger = logging.getLogger(__name__)
+PROMPT = """
+Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
+and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
+This summary will be used for document search and understanding.
+[CONTENT]
+{content}
+[END CONTENT]
+"""
+def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
+    """
+    UDF function that adds LLM-generated summaries to text content chunks.
+    This function processes text primitives and generates concise summaries using
+    an LLM API, storing the results in the metadata's custom_content field.
+    Parameters
+    ----------
+    control_message : IngestControlMessage
+        The control message containing the DataFrame payload with text content
+    Returns
+    -------
+    IngestControlMessage
+        The modified control message with LLM summaries added to metadata
+    """
+    logger.info("UDF: Starting LLM content summarization")
+    api_key = os.getenv("NVIDIA_API_KEY")
+    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
+    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
+    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
+    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
+    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))
+    stats = {
+        "skipped": False,
+        "failed": False,
+        "tokens": 0,
+        "duration": 0.0,
+    }
+    if not api_key:
+        logger.error("NVIDIA_API_KEY not set. Skipping...")
+        return control_message
+    df = control_message.payload()
+    if df is None or df.empty:
+        logger.warning("No payload found. Nothing to summarize.")
+        return control_message
+    # Select first and last chunk for summarization
+    # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
+    # the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
+    # pages, it must require parsing the payload to see which chunks correspond to which pages
+    if len(df) > 1:
+        # TODO: add feature to select N first and last chunks
+        df = df.iloc[[0, -1]]
+    else:
+        logger.info("Document has only one chunk")
+    # Combine all content into a single string
+    content_list = df.apply(
+        _extract_content,
+        axis=1,
+        min_content_length=min_content_length,
+        max_content_length=max_content_length,
+        stats=stats,
+    )
+    content = " ".join(content_list)
+    # Nicely ask LLM to summarize content
+    summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
+    stats["failed"] = summary is None
+    if not stats["failed"]:
+        stats["tokens"] = _estimate_tokens(content)
+        logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
+        _store_summary(df, summary, model_name)
+        # Update the control message with modified DataFrame
+        control_message.payload(df)
+    else:
+        logger.warning("%s failed to summarize content", model_name)
+    return control_message
+def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
+    """Extract text content from row"""
+    metadata = row.get("metadata")
+    if isinstance(metadata, dict):
+        content = metadata.get("content")
+        if content is not None:
+            content = content.strip()
+            if len(content) < min_content_length:
+                stats["skipped"] = True
+                logger.warning(f"Content less than min={min_content_length}. Skipping...")
+                content = ""
+            elif len(content) > max_content_length:
+                logger.warning(f"Truncating content to {max_content_length} characters")
+                content = content[:max_content_length]
+        else:
+            stats["skipped"] = True
+            content = ""
+    else:
+        stats["skipped"] = True
+        logger.warning("No metadata found. Skipping...")
+        content = ""
+    return content
+def _generate_llm_summary(
+    content: str,
+    model_name: str,
+    base_url: str,
+    api_key: str,
+    timeout: int,
+) -> tuple[str | None, float]:
+    """Ask an LLM to summarize content extracted from doc."""
+    start_time = time.time()
+    try:
+        from openai import OpenAI
+        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
+        start_time = time.time()
+        completion = client.chat.completions.create(
+            model=model_name,
+            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
+            max_tokens=400,  # Increased for more comprehensive summaries
+            temperature=0.7,
+        )
+        duration = time.time() - start_time
+        if completion.choices:
+            summary = completion.choices[0].message.content.strip()
+            return summary, duration
+        return None, duration
+    except Exception as e:
+        logger.error(f"API call failed: {e}")
+        # TODO: GitHub Thread
+        # Reviewers, tell me if this is a bad idea.
+        # I think the convention is to return timestamp for time even if it fails
+        return None, time.time() - start_time
+def _store_summary(df, summary: str, model_name: str):
+    """Add summary to metadata and store in df"""
+    # hardcoded heuristic to store everything on chunk 0's metadata
+    row_0 = df.iloc[0]
+    # this is a reference to a dictionary that is stored in the dataframe
+    # and is modified in place
+    metadata = row_0.get("metadata")
+    if metadata.get("custom_content") is None:
+        metadata["custom_content"] = {}
+    metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
+def _estimate_tokens(text: str) -> int:
+    """Rough estimate (~4 characters per token)"""
+    return len(text) // 4
+def _safe_model_name(name: str) -> str:
+    return name.replace("/", "__").replace("-", "_")

nv_ingest_api-2025.10.7.dev20251007/src/udfs/llm_summarizer_udf.py DELETED Viewed

@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-"""
-LLM Content Summarizer UDF for NV-Ingest Pipeline
-This UDF uses an LLM API to generate concise summaries
-of text content chunks, adding AI-generated summaries to the metadata for
-enhanced downstream processing and search capabilities.
-Environment Variables:
-- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
-- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
-- LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
-- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
-- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
-- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
-"""
-import os
-import logging
-from typing import Optional
-def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
-    """
-    UDF function that adds LLM-generated summaries to text content chunks.
-    This function processes text primitives and generates concise summaries using
-    an LLM API, storing the results in the metadata's custom_content field.
-    Features:
-    - Flexible content detection across multiple metadata locations
-    - Robust error handling with graceful fallbacks
-    - Comprehensive logging for monitoring and debugging
-    - Configurable content length thresholds
-    - Safe metadata manipulation preserving existing data
-    Parameters
-    ----------
-    control_message : IngestControlMessage
-        The control message containing the DataFrame payload with text content
-    Returns
-    -------
-    IngestControlMessage
-        The modified control message with LLM summaries added to metadata
-    """
-    from openai import OpenAI
-    logger = logging.getLogger(__name__)
-    logger.info("UDF: Starting LLM content summarization")
-    # Get configuration from environment
-    api_key = os.getenv("NVIDIA_API_KEY", "")
-    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
-    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
-    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
-    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
-    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))
-    if not api_key:
-        logger.warning("NVIDIA_API_KEY not found, skipping summarization")
-        return control_message
-    # Get the DataFrame payload
-    df = control_message.payload()
-    if df is None or len(df) == 0:
-        logger.warning("No payload found in control message")
-        return control_message
-    logger.info(f"Processing {len(df)} rows for LLM summarization")
-    # Initialize OpenAI client with error handling
-    try:
-        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
-    except Exception as e:
-        logger.error(f"Failed to initialize OpenAI client: {e}")
-        return control_message
-    # Stats for reporting
-    stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}
-    # Process each row
-    for idx, row in df.iterrows():
-        stats["processed"] += 1
-        try:
-            # Extract content - be more flexible about where it comes from
-            content = _extract_content(row, logger)
-            if not content:
-                stats["skipped"] += 1
-                continue
-            content = content.strip()
-            if len(content) < min_content_length:
-                stats["skipped"] += 1
-                continue
-            # Truncate if needed
-            if len(content) > max_content_length:
-                content = content[:max_content_length]
-            # Generate summary
-            summary = _generate_summary(client, content, model_name, logger)
-            if summary:
-                # Add to metadata
-                _add_summary(df, idx, row, summary, model_name, logger)
-                stats["summarized"] += 1
-            else:
-                stats["failed"] += 1
-        except Exception as e:
-            stats["failed"] += 1
-            logger.error(f"Row {idx}: Error processing content: {e}")
-    # Update the control message with modified DataFrame
-    control_message.payload(df)
-    logger.info(
-        f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
-        f"{stats['skipped']} skipped, {stats['failed']} failed"
-    )
-    return control_message
-def _extract_content(row, logger) -> Optional[str]:
-    """Extract text content from row, trying multiple locations."""
-    content = ""
-    # Try different locations for content
-    if isinstance(row.get("metadata"), dict):
-        metadata = row["metadata"]
-        # Primary location: metadata.content
-        content = metadata.get("content", "")
-        # If no content, try other locations
-        if not content:
-            # Try in text_metadata
-            text_metadata = metadata.get("text_metadata", {})
-            content = text_metadata.get("text", "") or text_metadata.get("content", "")
-    # Try top-level content field
-    if not content:
-        content = row.get("content", "")
-    if not content:
-        return None
-    return content
-def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
-    """Generate summary with robust error handling."""
-    prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
-{content}
-Focus on the main purpose, key topics, and important details.
-This summary will be used for document search and understanding.
-Summary:"""
-    try:
-        completion = client.chat.completions.create(
-            model=model_name,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=400,  # Increased for more comprehensive summaries
-            temperature=0.7,
-        )
-        if completion.choices and len(completion.choices) > 0:
-            summary = completion.choices[0].message.content.strip()
-            return summary
-        else:
-            return None
-    except Exception as e:
-        logger.error(f"API call failed: {e}")
-        return None
-def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
-    """Add summary to metadata with safe handling."""
-    try:
-        # Get current metadata or create new dict - handle None case properly
-        existing_metadata = row.get("metadata")
-        if existing_metadata is not None and isinstance(existing_metadata, dict):
-            metadata = dict(existing_metadata)  # Create a copy
-        else:
-            metadata = {}
-        # Ensure custom_content exists
-        if "custom_content" not in metadata or metadata["custom_content"] is None:
-            metadata["custom_content"] = {}
-        # Add LLM summary
-        metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
-        # Update the DataFrame at the specific index
-        try:
-            df.at[idx, "metadata"] = metadata
-        except Exception:
-            # Alternative approach: update the original row reference
-            df.iloc[idx]["metadata"] = metadata
-    except Exception as e:
-        logger.error(f"Failed to add summary to row {idx}: {e}")