PyPI - nv-ingest-api - Versions diffs - 2025.5.12.dev20250512__tar.gz → 2025.5.14.dev20250514__tar.gz - Mend

nv-ingest-api 2025.5.12.dev20250512__tar.gz → 2025.5.14.dev20250514__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (163) hide show

{nv_ingest_api-2025.5.12.dev20250512/src/nv_ingest_api.egg-info → nv_ingest_api-2025.5.14.dev20250514}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.5.12.dev20250512
+Version: 2025.5.14.dev20250514
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/transform.py RENAMED Viewed

@@ -207,7 +207,7 @@ def transform_image_create_vlm_caption(
         "api_key": api_key,
         "prompt": prompt,
         "endpoint_url": endpoint_url,
-        "model_name": model_name,
+        "image_caption_model_name": model_name,
     }
     filtered_task_config: Dict[str, str] = {k: v for k, v in task_config.items() if v is not None}

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py RENAMED Viewed

@@ -7,7 +7,7 @@ import base64
 import functools
 import io
 import logging
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, Tuple
 import pandas as pd
 from pydantic import BaseModel
@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
     task_config: Union[Dict[str, Any], BaseModel],
     extraction_config: DocxExtractorSchema,
     execution_trace_log: Optional[Dict[str, Any]] = None,
-) -> pd.DataFrame:
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
     """
     Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
     each document and replacing the original content with the extracted text.
@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
     else:
         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
-    return extracted_df
+    return extracted_df, {}

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/chart_extractor.py RENAMED Viewed

@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
 PADDLE_MIN_WIDTH = 32
 PADDLE_MIN_HEIGHT = 32
-logger = logging.getLogger(f"morpheus.{__name__}")
+logger = logging.getLogger(f"ray.{__name__}")
 def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
             yolox_client.infer,
             data=data_yolox,
             model_name="yolox",
-            stage_name="chart_data_extraction",
+            stage_name="chart_extraction",
             max_batch_size=8,
             trace_info=trace_info,
         )
@@ -88,7 +88,7 @@ def _run_chart_inference(
             paddle_client.infer,
             data=data_paddle,
             model_name="paddle",
-            stage_name="chart_data_extraction",
+            stage_name="chart_extraction",
             max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
             trace_info=trace_info,
         )

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/image_extractor.py RENAMED Viewed

@@ -16,7 +16,7 @@ import pandas as pd
 from pydantic import BaseModel
 from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
-from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
+from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def _decode_and_extract_from_image(
     base64_row: pd.Series,
     task_config: Dict[str, Any],
-    validated_extraction_config: ImageExtractorSchema,
+    validated_extraction_config: ImageConfigSchema,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> Any:
     """
@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(
         logger.debug(
             f"decode_and_extract: Extracting image content using image_extraction_config: "
-            f"{validated_extraction_config.image_extraction_config}"
+            f"{validated_extraction_config}"
         )
-        if validated_extraction_config.image_extraction_config is not None:
-            extract_params["image_extraction_config"] = validated_extraction_config.image_extraction_config
+        if validated_extraction_config is not None:
+            extract_params["image_extraction_config"] = validated_extraction_config
         if execution_trace_log is not None:
             extract_params["trace_info"] = execution_trace_log

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py RENAMED Viewed

@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
             model_name="yolox",
             max_batch_size=YOLOX_MAX_BATCH_SIZE,
             trace_info=trace_info,
-            stage_name="pdf_content_extractor",
+            stage_name="pdf_extraction",
         )
         # Process each result along with its corresponding image.

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py RENAMED Viewed

@@ -100,7 +100,7 @@ def _update_infographic_metadata(
         paddle_results = paddle_client.infer(
             data=data_paddle,
             model_name="paddle",
-            stage_name="infographic_data_extraction",
+            stage_name="infographic_extraction",
             max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
             trace_info=trace_info,
         )

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/table_extractor.py RENAMED Viewed

@@ -81,7 +81,7 @@ def _run_inference(
                 yolox_client.infer,
                 data=data_yolox,
                 model_name="yolox",
-                stage_name="table_data_extraction",
+                stage_name="table_extraction",
                 max_batch_size=8,
                 trace_info=trace_info,
             )
@@ -89,7 +89,7 @@ def _run_inference(
             paddle_client.infer,
             data=data_paddle,
             model_name="paddle",
-            stage_name="table_data_extraction",
+            stage_name="table_extraction",
             max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
             trace_info=trace_info,
         )

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py RENAMED Viewed

@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
     inference_results = nemoretriever_parse_client.infer(
         data=data,
         model_name="nemoretriever_parse",
-        stage_name="pdf_content_extractor",
+        stage_name="pdf_extraction",
         max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
         execution_trace_log=execution_trace_log,
     )
@@ -476,7 +476,7 @@ def _extract_text_and_bounding_boxes(
 def _create_clients(nemoretriever_parse_config):
     model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
-        model_name=nemoretriever_parse_config.model_name,
+        model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
     )
     nemoretriever_parse_client = create_inference_client(
         nemoretriever_parse_config.nemoretriever_parse_endpoints,

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py RENAMED Viewed

@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
             model_name="yolox",
             max_batch_size=YOLOX_MAX_BATCH_SIZE,
             trace_info=execution_trace_log,
-            stage_name="pdf_content_extractor",
+            stage_name="pdf_extraction",
         )
         # Process results: iterate over each image's inference output.

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py RENAMED Viewed

@@ -17,7 +17,6 @@
 import logging
 import io
-import operator
 import re
 import uuid
 from collections import defaultdict
@@ -155,6 +154,12 @@ def _finalize_images(
             extracted_data.append(image_entry)
+def _safe_position(shape):
+    top = shape.top if shape.top is not None else float("inf")
+    left = shape.left if shape.left is not None else float("inf")
+    return (top, left)
 # -----------------------------------------------------------------------------
 # Helper Function: Recursive Image Extraction
 # -----------------------------------------------------------------------------
@@ -283,7 +288,7 @@ def python_pptx(
     for slide_idx, slide in enumerate(presentation.slides):
         # Obtain a flat list of shapes (ungrouped) sorted by top then left.
-        shapes = sorted(ungroup_shapes(slide.shapes), key=operator.attrgetter("top", "left"))
+        shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
         page_nearby_blocks = {
             "text": {"content": [], "bbox": []},
@@ -656,21 +661,43 @@ def get_bbox(
     shape_object: Optional[Slide] = None,
     text_depth: Optional[TextTypeEnum] = None,
 ):
-    bbox = (-1, -1, -1, -1)
-    if text_depth == TextTypeEnum.DOCUMENT:
-        bbox = (-1, -1, -1, -1)
-    elif text_depth == TextTypeEnum.PAGE:
-        top = left = 0
-        width = presentation_object.slide_width
-        height = presentation_object.slide_height
-        bbox = (top, left, top + height, left + width)
-    elif shape_object:
-        top = shape_object.top
-        left = shape_object.left
-        width = shape_object.width
-        height = shape_object.height
-        bbox = (top, left, top + height, left + width)
-    return bbox
+    """
+    Safely computes bounding box for a slide, shape, or document.
+    Ensures that missing or None values are gracefully handled.
+    Returns
+    -------
+    Tuple[int, int, int, int]
+        Bounding box as (top, left, bottom, right).
+        Defaults to (-1, -1, -1, -1) if invalid or unsupported.
+    """
+    try:
+        if text_depth == TextTypeEnum.DOCUMENT:
+            return (-1, -1, -1, -1)
+        elif text_depth == TextTypeEnum.PAGE and presentation_object:
+            top = left = 0
+            width = presentation_object.slide_width
+            height = presentation_object.slide_height
+            return (top, left, top + height, left + width)
+        elif shape_object:
+            top = shape_object.top if shape_object.top is not None else -1
+            left = shape_object.left if shape_object.left is not None else -1
+            width = shape_object.width if shape_object.width is not None else -1
+            height = shape_object.height if shape_object.height is not None else -1
+            # If all are valid, return normally, else return placeholder
+            if -1 in [top, left, width, height]:
+                return (-1, -1, -1, -1)
+            return (top, left, top + height, left + width)
+    except Exception as e:
+        logger.warning(f"get_bbox: Failed to compute bbox due to {e}")
+        return (-1, -1, -1, -1)
+    return (-1, -1, -1, -1)
 def ungroup_shapes(shapes):

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py RENAMED Viewed

@@ -184,4 +184,4 @@ def extract_primitives_from_pptx_internal(
     else:
         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
-    return extracted_df
+    return extracted_df, {}

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py RENAMED Viewed

@@ -8,7 +8,6 @@ from nv_ingest_api.internal.primitives.nim import ModelInterface
 import numpy as np
-# Assume ModelInterface is defined elsewhere in the project.
 class EmbeddingModelInterface(ModelInterface):
     """
     An interface for handling inference with an embedding model endpoint.

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py RENAMED Viewed

@@ -709,7 +709,13 @@ def postprocess_results(
             raise ValueError(f"Error in postprocessing {result.shape} and {original_image_shape}: {e}")
         for box, score, label in zip(bboxes, scores, labels):
-            class_name = class_labels[int(label)]
+            # TODO(Devin): Sometimes we get back unexpected class labels?
+            if (label < 0) or (label >= len(class_labels)):
+                logger.warning(f"Invalid class label {label} found in postprocessing")
+                continue
+            else:
+                class_name = class_labels[int(label)]
             annotation_dict[class_name].append([round(float(x), 4) for x in np.concatenate((box, [score]))])
         out.append(annotation_dict)

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/nim_client.py RENAMED Viewed

@@ -251,7 +251,7 @@ class NimClient:
             model_name=model_name, parameters=parameters, inputs=[input_tensors], outputs=outputs
         )
         logger.debug(f"gRPC inference response: {response}")
-        # TODO(self.client.has_error(response)) => raise error
         if len(outputs) == 1:
             return response.as_numpy(outputs[0].name())
         else:

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/tracing/tagging.py RENAMED Viewed

@@ -31,13 +31,15 @@ def traceable(trace_name=None):
     Notes
     -----
-    The decorated function must accept a IngestControlMessage object as its first argument. The
-    IngestControlMessage object must implement `has_metadata`, `get_metadata`, and `set_metadata`
-    methods used by the decorator to check for the trace tagging flag and to add trace metadata.
+    The decorated function must accept a IngestControlMessage object as one of its arguments.
+    For a regular function, this is expected to be the first argument; for a class method,
+    this is expected to be the second argument (after 'self'). The IngestControlMessage object
+    must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator
+    to check for the trace tagging flag and to add trace metadata.
     The trace metadata added by the decorator includes two entries:
-    - 'trace::entry::<trace_name>': The monotonic timestamp marking the function's entry.
-    - 'trace::exit::<trace_name>': The monotonic timestamp marking the function's exit.
+    - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
+    - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
     Example
     -------
@@ -47,23 +49,25 @@ def traceable(trace_name=None):
     ... def process_message(message):
     ...     pass
-    Applying the decorator with a custom trace name:
-    >>> @traceable(custom_trace_name="CustomTraceName")
-    ... def process_message(message):
-    ...     pass
-    In both examples, `process_message` will have entry and exit timestamps added to the
-    IngestControlMessage's metadata if 'config::add_trace_tagging' is True.
+    Applying the decorator with a custom trace name on a class method:
+    >>> class Processor:
+    ...     @traceable(trace_name="CustomTrace")
+    ...     def process(self, message):
+    ...         pass
     """
     def decorator_trace_tagging(func):
         @functools.wraps(func)
         def wrapper_trace_tagging(*args, **kwargs):
-            # Assuming the first argument is always the message
             ts_fetched = datetime.now()
-            message = args[0]
+            # Determine which argument is the message.
+            if hasattr(args[0], "has_metadata"):
+                message = args[0]
+            elif len(args) > 1 and hasattr(args[1], "has_metadata"):
+                message = args[1]
+            else:
+                raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")
             do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                 message.get_metadata("config::add_trace_tagging") is True
@@ -79,7 +83,7 @@ def traceable(trace_name=None):
                     message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
                     message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)
-            # Call the decorated function
+            # Call the decorated function.
             result = func(*args, **kwargs)
             if do_trace_tagging:

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py RENAMED Viewed

@@ -131,7 +131,7 @@ class NemoRetrieverParseConfigSchema(BaseModel):
     nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
     nemoretriever_parse_infer_protocol: str = ""
-    model_name: str = "nvidia/nemoretriever-parse"
+    nemoretriever_parse_model_name: str = "nvidia/nemoretriever-parse"
     timeout: float = 300.0

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py RENAMED Viewed

@@ -76,7 +76,7 @@ class IngestTaskCaptionSchema(BaseModelNoExt):
     api_key: Optional[str] = None
     endpoint_url: Optional[str] = None
     prompt: Optional[str] = None
-    model_name: Optional[str] = None
+    caption_model_name: Optional[str] = None
 class IngestTaskFilterParamsSchema(BaseModelNoExt):
@@ -104,7 +104,7 @@ class IngestTaskDedupSchema(BaseModelNoExt):
 class IngestTaskEmbedSchema(BaseModelNoExt):
     endpoint_url: Optional[str] = None
-    model_name: Optional[str] = None
+    embedding_model_name: Optional[str] = None
     api_key: Optional[str] = None
     filter_errors: bool = False

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py RENAMED Viewed

@@ -10,6 +10,6 @@ class ImageCaptionExtractionSchema(BaseModel):
     api_key: str = "api_key"
     endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
     prompt: str = "Caption the content of this image:"
-    model_name: str = "meta/llama-3.2-11b-vision-instruct"
+    image_caption_model_name: str = "meta/llama-3.2-11b-vision-instruct"
     raise_on_failure: bool = False
     model_config = ConfigDict(extra="forbid")

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/caption_image.py RENAMED Viewed

@@ -173,7 +173,7 @@ def transform_image_create_vlm_caption_internal(
     api_key: str = task_config.get("api_key") or transform_config.api_key
     prompt: str = task_config.get("prompt") or transform_config.prompt
     endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
-    model_name: str = task_config.get("model_name") or transform_config.model_name
+    model_name: str = task_config.get("image_caption_model_name") or transform_config.image_caption_model_name
     # Create a mask for rows where the content type is "image".
     df_mask: pd.Series = df_transform_ledger["metadata"].apply(

{nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/embed_text.py RENAMED Viewed

@@ -230,28 +230,35 @@ def _async_runner(
 def _add_embeddings(row, embeddings, info_msgs):
     """
     Updates a DataFrame row with embedding data and associated error info.
+    Ensures the 'embedding' field is always present, even if None.
     Parameters
     ----------
     row : pandas.Series
         A row of the DataFrame.
-    embeddings : list
-        List of embeddings corresponding to DataFrame rows.
-    info_msgs : list
-        List of info message dictionaries corresponding to DataFrame rows.
+    embeddings : dict
+        Dictionary mapping row indices to embeddings.
+    info_msgs : dict
+        Dictionary mapping row indices to info message dicts.
     Returns
     -------
     pandas.Series
-        The updated row with embedding and info message metadata added.
+        The updated row with 'embedding', 'info_message_metadata', and
+        '_contains_embeddings' appropriately set.
     """
-    row["metadata"]["embedding"] = embeddings[row.name]
-    if info_msgs[row.name] is not None:
-        row["metadata"]["info_message_metadata"] = info_msgs[row.name]
+    embedding = embeddings.get(row.name, None)
+    info_msg = info_msgs.get(row.name, None)
+    # Always set embedding, even if None
+    row["metadata"]["embedding"] = embedding
+    if info_msg:
+        row["metadata"]["info_message_metadata"] = info_msg
         row["document_type"] = ContentTypeEnum.INFO_MSG
         row["_contains_embeddings"] = False
     else:
-        row["_contains_embeddings"] = True
+        row["_contains_embeddings"] = embedding is not None
     return row
@@ -287,7 +294,7 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-    return row["table_metadata"]["table_content"]
+    return row.get("table_metadata", {}).get("table_content")
 def _get_pandas_image_content(row):
@@ -304,7 +311,14 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-    return row["image_metadata"]["caption"]
+    return row.get("image_metadata", {}).get("caption")
+def _get_pandas_audio_content(row):
+    """
+    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
+    """
+    return row.get("audio_metadata", {}).get("audio_transcript")
 # ------------------------------------------------------------------------------
@@ -352,13 +366,6 @@ def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
     return [batch for batch in _batch_generator(prompts, batch_size)]
-def _get_pandas_audio_content(row):
-    """
-    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
-    """
-    return row["audio_metadata"]["audio_transcript"]
 # ------------------------------------------------------------------------------
 # DataFrame Concatenation Utility
 # ------------------------------------------------------------------------------
@@ -408,17 +415,20 @@ def transform_create_text_embeddings_internal(
     execution_trace_log: Optional[Dict] = None,
 ) -> Tuple[pd.DataFrame, Dict]:
     """
-    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
+    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
     from a pandas DataFrame using asynchronous requests.
+    This function ensures that even if the extracted content is empty or None,
+    the embedding field is explicitly created and set to None.
     Parameters
     ----------
     df_transform_ledger : pd.DataFrame
         The DataFrame containing content for embedding extraction.
     task_config : Dict[str, Any]
         Dictionary containing task properties (e.g., filter error flag).
-    transform_config : Any
-        Validated configuration for text embedding extraction (EmbedExtractionsSchema).
+    transform_config : TextEmbeddingSchema, optional
+        Validated configuration for text embedding extraction.
     execution_trace_log : Optional[Dict], optional
         Optional trace information for debugging or logging (default is None).
@@ -429,24 +439,20 @@ def transform_create_text_embeddings_internal(
             - The updated DataFrame with embeddings applied.
             - A dictionary with trace information.
     """
-    # Retrieve configuration values with fallback to transform_config defaults.
-    api_key: str = task_config.get("api_key") or transform_config.api_key
-    endpoint_url: str = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
-    model_name: str = task_config.get("model_name") or transform_config.embedding_model
+    api_key = task_config.get("api_key") or transform_config.api_key
+    endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
+    model_name = task_config.get("model_name") or transform_config.embedding_model
     if execution_trace_log is None:
         execution_trace_log = {}
         logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
-    # TODO(Devin)
     if df_transform_ledger.empty:
         return df_transform_ledger, {"trace_info": execution_trace_log}
     embedding_dataframes = []
-    content_masks = []  # List of pandas boolean Series
+    content_masks = []
-    # Define pandas content extractors for supported content types.
     pandas_content_extractor = {
         ContentTypeEnum.TEXT: _get_pandas_text_content,
         ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
@@ -455,49 +461,62 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
-    logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
-    # Process each supported content type.
     for content_type, content_getter in pandas_content_extractor.items():
         if not content_getter:
             logger.debug(f"Skipping unsupported content type: {content_type}")
             continue
+        # Get rows matching the content type
         content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value
         if not content_mask.any():
             continue
-        # Extract content from metadata and filter out rows with empty content.
-        extracted_content = df_transform_ledger.loc[content_mask, "metadata"].apply(content_getter)
-        non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
-        final_mask = content_mask & non_empty_mask
-        if not final_mask.any():
-            continue
+        # Always include all content_mask rows and prepare them
+        df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)
-        df_content = df_transform_ledger.loc[final_mask].copy().reset_index(drop=True)
-        filtered_content = df_content["metadata"].apply(content_getter)
-        filtered_content_batches = _generate_batches(filtered_content.tolist(), batch_size=transform_config.batch_size)
-        content_embeddings = _async_runner(
-            filtered_content_batches,
-            api_key,
-            endpoint_url,
-            model_name,
-            transform_config.encoding_format,
-            transform_config.input_type,
-            transform_config.truncate,
-            False,
+        # Extract content and normalize empty or non-str to None
+        extracted_content = (
+            df_content["metadata"]
+            .apply(content_getter)
+            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
-        # Apply the embeddings (and any error info) to each row.
-        df_content[["metadata", "document_type", "_contains_embeddings"]] = df_content.apply(
-            _add_embeddings, **content_embeddings, axis=1
-        )[["metadata", "document_type", "_contains_embeddings"]]
-        df_content["_content"] = filtered_content
+        df_content["_content"] = extracted_content
+        # Prepare batches for only valid (non-None) content
+        valid_content_mask = df_content["_content"].notna()
+        if valid_content_mask.any():
+            filtered_content_batches = _generate_batches(
+                df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
+            )
+            content_embeddings = _async_runner(
+                filtered_content_batches,
+                api_key,
+                endpoint_url,
+                model_name,
+                transform_config.encoding_format,
+                transform_config.input_type,
+                transform_config.truncate,
+                False,
+            )
+            # Build a simple row index -> embedding map
+            embeddings_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("embeddings", []))
+            )
+            info_msgs_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("info_msgs", []))
+            )
+        else:
+            embeddings_dict = {}
+            info_msgs_dict = {}
+        # Apply embeddings or None to all rows
+        df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)
         embedding_dataframes.append(df_content)
-        content_masks.append(final_mask)
+        content_masks.append(content_mask)
     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
     return combined_df, {"trace_info": execution_trace_log}

nv-ingest-api 2025.5.12.dev20250512__tar.gz → 2025.5.14.dev20250514__tar.gz

Potentially problematic release.

nv-ingest-api 2025.5.12.dev20250512__tar.gz → 2025.5.14.dev20250514__tar.gz