nv-ingest-api 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/enums/common.py +6 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +9 -8
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +40 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +1 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +44 -236
- nv_ingest_api/internal/primitives/nim/nim_client.py +61 -18
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +105 -12
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +351 -87
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- nv_ingest_api/util/metadata/aggregators.py +4 -1
- nv_ingest_api/util/pdf/pdfium.py +6 -14
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/RECORD +33 -33
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/WHEEL +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/top_level.txt +0 -0

nv_ingest_api/internal/schemas/extract/extract_table_schema.py

@@ -22,8 +22,8 @@ class TableExtractorConfigSchema(BaseModel):
     auth_token : Optional[str], default=None
         Authentication token required for secure services.
 
-
-        A tuple containing the gRPC and HTTP services for the
+    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+        A tuple containing the gRPC and HTTP services for the ocr endpoint.
         Either the gRPC or HTTP service can be empty, but not both.
 
     Methods
@@ -47,8 +47,8 @@ class TableExtractorConfigSchema(BaseModel):
     yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
     yolox_infer_protocol: str = ""
 
-
-
+    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    ocr_infer_protocol: str = ""
 
     nim_batch_size: int = 2
     workers_per_progress_engine: int = 5
@@ -81,7 +81,7 @@ class TableExtractorConfigSchema(BaseModel):
                 return None
             return service
 
-        for endpoint_name in ["yolox_endpoints", "
+        for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
            grpc_service, http_service = values.get(endpoint_name, (None, None))
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)
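
The three hunks above rename the table extractor's OCR endpoint configuration. A minimal sketch of constructing the updated config follows; the import path and endpoint values are assumptions for illustration, while the field names come from the hunks above.

# Hypothetical usage of the renamed fields; endpoint URLs are placeholders.
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorConfigSchema

config = TableExtractorConfigSchema(
    auth_token=None,
    yolox_endpoints=("yolox:8001", "http://yolox:8000/v1/infer"),
    yolox_infer_protocol="grpc",
    # Either the gRPC or HTTP entry may be None, but not both (checked by the validator above).
    ocr_endpoints=(None, "http://ocr:8000/v1/infer"),
    ocr_infer_protocol="http",
)
print(config.ocr_endpoints)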

nv_ingest_api/internal/schemas/meta/ingest_job_schema.py

@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
     model_name: Optional[str] = None
     api_key: Optional[str] = None
     filter_errors: bool = False
+    text_elements_modality: Optional[str] = None
+    image_elements_modality: Optional[str] = None
+    structured_elements_modality: Optional[str] = None
+    audio_elements_modality: Optional[str] = None
 
 
 class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
         validated_task_properties = expected_schema_cls(**task_properties)
         values["type"] = task_type  # ensure type is now always the enum
         values["task_properties"] = validated_task_properties
+
         return values
 
     @field_validator("type", mode="before")

nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py

@@ -8,7 +8,7 @@ from pydantic import ConfigDict, BaseModel
 
 class ImageCaptionExtractionSchema(BaseModel):
     api_key: str = "api_key"
-    endpoint_url: str = "https://
+    endpoint_url: str = "https://integrate.api.nvidia.com/v1/chat/completions"
     prompt: str = "Caption the content of this image:"
     model_name: str = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
     raise_on_failure: bool = False

nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py

@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
     input_type: str = Field(default="passage")
     raise_on_failure: bool = Field(default=False)
     truncate: str = Field(default="END")
+    text_elements_modality: str = Field(default="text")
+    image_elements_modality: str = Field(default="text")
+    structured_elements_modality: str = Field(default="text")
+    audio_elements_modality: str = Field(default="text")
 
     model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/transform/embed_text.py

@@ -4,6 +4,7 @@
 
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
 import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
 logger = logging.getLogger(__name__)
 
 
+MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
+
+
 # ------------------------------------------------------------------------------
 # Asynchronous Embedding Requests
 # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> list:
     """
     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
             base_url=embedding_nim_endpoint,
         )
 
+        extra_body = {
+            "input_type": input_type,
+            "truncate": truncate,
+        }
+        if modalities:
+            extra_body["modality"] = modalities
+
         resp = client.embeddings.create(
             input=prompts,
             model=embedding_model,
             encoding_format=encoding_format,
-            extra_body=
+            extra_body=extra_body,
         )
 
         response["embedding"] = resp.data
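
For reference, here is a standalone sketch of the request pattern in the hunk above: NIM-specific parameters travel in extra_body on an OpenAI-compatible embeddings call, with an optional per-item modality list. The endpoint URL, model name, and prompt values are placeholders, not taken from the package.

# Minimal sketch of the call pattern above (values are placeholders).
from openai import OpenAI

client = OpenAI(api_key="<api-key>", base_url="http://embedding-nim:8000/v1")

prompts = ["passage: first text chunk", "data:image/png;base64,<...>"]
modalities = ["text", "image"]  # one entry per prompt, matching the batching in this module

extra_body = {"input_type": "passage", "truncate": "END"}
if modalities:
    extra_body["modality"] = modalities

resp = client.embeddings.create(
    input=prompts,
    model="llama-3.2-nemoretriever-1b-vlm-embed-v1",
    encoding_format="float",
    extra_body=extra_body,
)
embeddings = [d.embedding for d in resp.data]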
@@ -110,6 +122,7 @@ def _async_request_handler(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> List[dict]:
     """
     Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@ def _async_request_handler(
     List[dict]
         A list of response dictionaries from the embedding service.
     """
+    if modalities is None:
+        modalities = [None] * len(prompts)
+
     with ThreadPoolExecutor() as executor:
         futures = [
             executor.submit(
@@ -150,8 +166,9 @@ def _async_request_handler(
                 input_type=input_type,
                 truncate=truncate,
                 filter_errors=filter_errors,
+                modalities=modality_batch,
             )
-            for prompt_batch in prompts
+            for prompt_batch, modality_batch in zip(prompts, modalities)
         ]
         results = [future.result() for future in futures]
 
@@ -167,6 +184,7 @@ def _async_runner(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> dict:
     """
     Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@ def _async_runner(
         input_type,
         truncate,
         filter_errors,
+        modalities=modalities,
     )
 
     flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
-def
+def _format_image_input_string(image_b64: Optional[str]) -> str:
+    if not image_b64:
+        return
+    return f"data:image/png;base64,{image_b64}"
+
+
+def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+    if (not text) or (not text.strip()) or (not image_b64):
+        return
+    return f"{text.strip()} {_format_image_input_string(image_b64)}"
+
+
+def _get_pandas_text_content(row, modality="text"):
     """
     Extracts text content from a DataFrame row.
 
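
The two formatting helpers added above build the multimodal prompt strings. A self-contained restatement (for illustration only, not imported from the package) shows the expected shapes:

# Self-contained restatement of the helpers above, for illustration only.
import base64
from typing import Optional

def format_image_input_string(image_b64: Optional[str]) -> Optional[str]:
    # Wrap a base64-encoded PNG as a data URI, the format sent for image inputs.
    if not image_b64:
        return None
    return f"data:image/png;base64,{image_b64}"

def format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> Optional[str]:
    # Join stripped text and the image data URI into a single "text_image" prompt.
    if (not text) or (not text.strip()) or (not image_b64):
        return None
    return f"{text.strip()} {format_image_input_string(image_b64)}"

image_b64 = base64.b64encode(b"<png bytes>").decode()
print(format_image_input_string(image_b64))
print(format_text_image_pair_input_string("Quarterly revenue table", image_b64))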
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
     return row["content"]
 
 
-def _get_pandas_table_content(row):
+def _get_pandas_table_content(row, modality="text"):
     """
     Extracts table/chart content from a DataFrame row.
 
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-
+    if modality == "text":
+        content = row.get("table_metadata", {}).get("table_content")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        text = row.get("table_metadata", {}).get("table_content")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+
+    return content
 
 
-def _get_pandas_image_content(row):
+def _get_pandas_image_content(row, modality="text"):
     """
     Extracts image caption content from a DataFrame row.
 
@@ -311,10 +351,30 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-
+    subtype = row.get("content_metadata", {}).get("subtype")
+    if modality == "text":
+        if subtype == "page_image":
+            content = row.get("image_metadata", {}).get("text")
+        else:
+            content = row.get("image_metadata", {}).get("caption")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        if subtype == "page_image":
+            text = row.get("image_metadata", {}).get("text")
+        else:
+            text = row.get("image_metadata", {}).get("caption")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
 
+    if subtype == "page_image":
+        # A workaround to save memory for full page images.
+        row["content"] = ""
 
-
+    return content
+
+
+def _get_pandas_audio_content(row, modality="text"):
     """
     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
     """
@@ -408,6 +468,23 @@ def _concatenate_extractions_pandas(
 # ------------------------------------------------------------------------------
 
 
+def does_model_support_multimodal_embeddings(model: str) -> bool:
+    """
+    Checks if a given model supports multi-modal embeddings.
+
+    Parameters
+    ----------
+    model : str
+        The name of the model.
+
+    Returns
+    -------
+    bool
+        True if the model supports multi-modal embeddings, False otherwise.
+    """
+    return model in MULTI_MODAL_MODELS
+
+
 def transform_create_text_embeddings_internal(
     df_transform_ledger: pd.DataFrame,
     task_config: Dict[str, Any],
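
A small sketch of how the new helper can gate modality selection; the import path follows this diff, and the surrounding configuration is illustrative:

# Illustrative only: choose richer modalities when the embedder is multi-modal.
from nv_ingest_api.internal.transform.embed_text import does_model_support_multimodal_embeddings

model_name = "llama-3.2-nemoretriever-1b-vlm-embed-v1"
if does_model_support_multimodal_embeddings(model_name):
    image_elements_modality = "image"  # or "text_image"
else:
    image_elements_modality = "text"   # captions only for text-only embedders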
@@ -460,6 +537,15 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
+    task_type_to_modality = {
+        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+        ContentTypeEnum.STRUCTURED: (
+            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+        ),
+        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
+    }
 
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
@@ -480,7 +566,7 @@ transform_create_text_embeddings_internal(
         # Extract content and normalize empty or non-str to None
         extracted_content = (
             df_content["metadata"]
-            .apply(content_getter)
+            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
             .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
         df_content["_content"] = extracted_content
@@ -488,9 +574,15 @@
         # Prepare batches for only valid (non-None) content
         valid_content_mask = df_content["_content"].notna()
         if valid_content_mask.any():
-
-
-
+            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+
+            if model_name in MULTI_MODAL_MODELS:
+                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+            else:
+                modality_batches = None
+
             content_embeddings = _async_runner(
                 filtered_content_batches,
                 api_key,
@@ -500,6 +592,7 @@
                 transform_config.input_type,
                 transform_config.truncate,
                 False,
+                modalities=modality_batches,
             )
             # Build a simple row index -> embedding map
             embeddings_dict = dict(

nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(
 
     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
 
-    if
-
-
-
-
-
-
-
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"
 
     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
 
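
The tokenizer-resolution order added above can be summarized by the sketch below; the helper name and layout are illustrative, while the paths and identifiers come from the hunk:

# Illustrative helper: prefer a predownloaded tokenizer under MODEL_PREDOWNLOAD_PATH,
# otherwise fall back to intfloat/e5-large-unsupervised.
import os

def resolve_tokenizer(tokenizer_identifier=None):
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    candidates = {
        "meta-llama/Llama-3.2-1B": "llama-3.2-1b/tokenizer/",
        "intfloat/e5-large-unsupervised": "e5-large-unsupervised/tokenizer/",
    }
    if predownload is not None:
        for hub_name, local_dir in candidates.items():
            local_path = os.path.join(predownload, local_dir)
            if os.path.exists(os.path.join(local_path, "tokenizer.json")) and (
                tokenizer_identifier is None or tokenizer_identifier == hub_name
            ):
                return local_path
    return tokenizer_identifier or "intfloat/e5-large-unsupervised"

print(resolve_tokenizer())  # "intfloat/e5-large-unsupervised" when nothing is predownloaded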

nv_ingest_api/util/image_processing/table_and_chart.py

@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict):
     return chart_content.strip()
 
 
-def match_bboxes(yolox_box,
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-
-
-
-
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )
 
     # Intersection
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     return matches
 
 
-def
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]
 
-
-
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)
 
-    if (
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}
 
-
+    ocr_boxes = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
@@ -139,10 +139,10 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         for yolox_box in yolox_output[k]:
             # if there's a score at the end, drop the score.
             yolox_box = yolox_box[:4]
-
+            ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)
 
-            if len(
-                text = " ".join(
+            if len(ocr_ids) > 0:
+                text = " ".join(ocr_txts[ocr_ids].tolist())
                 texts.append(text)
 
     processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results
 
 
-def
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""
 
@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results
 
 
-def
-    if (not
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""
 
-
-
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
     assignments = []
-    for i, (b, t) in enumerate(zip(
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         mat = build_markdown(df_table)
         markdown_table = display_markdown(mat, use_header=False)
 
-        all_boxes = np.stack(df_table.
+        all_boxes = np.stack(df_table.ocr_box.values)
         table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])
 
         df_table_to_text = pd.DataFrame(
             [
                 {
-                    "
+                    "ocr_box": table_box,
                     "text": markdown_table,
                     "is_table": True,
                 }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)
 
-    df_text = df_text.rename(columns={"
+    df_text = df_text.rename(columns={"ocr_box": "box"})
 
     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result
 
 
-def assign_boxes(
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.
 
     Args:
-
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
 
     boxes = np.array(boxes)
 
-    x0_1, y0_1, x1_1, y1_1 =
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)
 
-    # Normalize by
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)
 
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)
 
-    df_cell["x"] = (
-    df_cell["y"] = (
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])
 
     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)
 
     return df_cell
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)
 
     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
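
A usage sketch for the new reorder_boxes helper follows; the import path matches this diff, the sample boxes are illustrative, and the DBSCAN branch requires scikit-learn. Quadrilateral boxes are given as [top-left, top-right, bottom-right, bottom-left] corner points.

from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes

boxes = [
    [[200, 12], [260, 12], [260, 30], [200, 30]],  # right-hand word, first line
    [[10, 10], [80, 10], [80, 30], [10, 30]],      # left-hand word, first line
    [[10, 60], [90, 60], [90, 80], [10, 80]],      # word on the second line
]
texts = ["world", "hello", "again"]
confs = [0.97, 0.99, 0.95]

ordered_boxes, ordered_texts, ordered_confs = reorder_boxes(boxes, texts, confs)
print(ordered_texts)  # expected reading order: ['hello', 'world', 'again']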