nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
- nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
- nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
- nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +214 -188
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
- nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
- nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
- nv_ingest_api/internal/store/image_upload.py +1 -0
- nv_ingest_api/internal/transform/embed_text.py +75 -52
- nv_ingest_api/internal/transform/split_text.py +9 -3
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/exception_handlers/converters.py +1 -1
- nv_ingest_api/util/exception_handlers/decorators.py +309 -51
- nv_ingest_api/util/image_processing/processing.py +1 -1
- nv_ingest_api/util/logging/configuration.py +15 -8
- nv_ingest_api/util/pdf/pdfium.py +2 -2
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
- nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +430 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/RECORD +46 -41
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/WHEEL +1 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py (+3 -3):

@@ -7,7 +7,7 @@ import base64
 import functools
 import io
 import logging
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, Tuple
 
 import pandas as pd
 from pydantic import BaseModel
@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
     task_config: Union[Dict[str, Any], BaseModel],
     extraction_config: DocxExtractorSchema,
     execution_trace_log: Optional[Dict[str, Any]] = None,
-) -> pd.DataFrame:
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
     """
     Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
     each document and replacing the original content with the extracted text.
@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
     else:
         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
 
-    return extracted_df
+    return extracted_df, {}
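The entry point now returns a (DataFrame, trace-dict) pair instead of a bare DataFrame, so callers unpack two values. A minimal sketch of the updated calling convention; `df_ledger`, `task_config`, and `extraction_config` are placeholders for values a caller already has, not objects defined in this diff:

```python
from nv_ingest_api.internal.extract.docx.docx_extractor import (
    extract_primitives_from_docx_internal,
)

# 25.4.2 returned a bare DataFrame; 25.6.1 returns a (DataFrame, dict) tuple,
# where the second element is an empty dict in the code shown above.
extracted_df, execution_trace = extract_primitives_from_docx_internal(
    df_ledger, task_config, extraction_config
)
```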
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py (+142 -86):

@@ -274,59 +274,70 @@ class DocxReader:
         - A list of extracted images from the paragraph.
         """
 
-        [5 removed lines not captured in the source view]
+        try:
+            paragraph_images = []
+            if self.paragraph_format == "text":
+                return paragraph.text.strip(), paragraph_images
+
             font = paragraph.style.font
             default_style = (font.bold, font.italic, font.underline)
 
-            # Iterate over the runs of the paragraph and group them by style, excluding empty runs
             paragraph_text = ""
             group_text = ""
             previous_style = None
 
             for c in paragraph.iter_inner_content():
-                [25 removed lines not captured in the source view]
+                try:
+                    if isinstance(c, Hyperlink):
+                        text = f"[{c.text}]({c.address})"
+                        style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
+                    elif isinstance(c, Run):
+                        text = c.text
+                        style = (c.bold, c.italic, c.underline)
+
+                        # 1. Locate the inline shape which is stored in the <w:drawing> element.
+                        # 2. r:embed in <a.blip> has the relationship id for extracting the file where
+                        # the image is stored as bytes.
+                        # Reference:
+                        # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
+                        inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
+                        for r_id in inline_shapes:
+                            text += self.image_tag.format(self.image_tag_index)
+                            self.image_tag_index += 1
+                            try:
+                                image = paragraph.part.related_parts[r_id].image
+                                paragraph_images.append(image)
+                            except Exception as img_e:
+                                logger.warning(
+                                    "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
+                                    r_id,
+                                    img_e,
+                                )
+                    else:
+                        continue
+
+                    style = tuple(s if s is not None else d for s, d in zip(style, default_style))
+
+                    if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
                         paragraph_text += self.format_text(group_text, *previous_style)
                         group_text = ""
 
-                    [3 removed lines not captured in the source view]
+                    group_text += text
+                    if not self.is_text_empty(text):
+                        previous_style = style
 
-                [3 removed lines not captured in the source view]
+                except Exception as e:
+                    logger.error("format_paragraph: failed to process run: %s", e)
+                    continue
+
+            if group_text and previous_style:
+                paragraph_text += self.format_text(group_text, *previous_style)
+
+            return paragraph_text.strip(), paragraph_images
 
-        [3 removed lines not captured in the source view]
+        except Exception as e:
+            logger.error("format_paragraph: failed for paragraph: %s", e)
+            return "", []
 
     def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
         """
@@ -344,12 +355,23 @@
         - A list of images extracted from the cell.
         """
 
-        newline = "<br>"
-        [5 more removed lines not captured in the source view]
+        try:
+            newline = "<br>" if self.paragraph_format == "markdown" else "\n"
+            texts, images = [], []
+
+            for p in cell.paragraphs:
+                try:
+                    t, imgs = self.format_paragraph(p)
+                    texts.append(t)
+                    images.extend(imgs)
+                except Exception as e:
+                    logger.error("format_cell: failed to format paragraph in cell: %s", e)
+
+            return newline.join(texts), images
+
+        except Exception as e:
+            logger.error("format_cell: failed entirely: %s", e)
+            return "", []
 
     def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
         """
@@ -368,25 +390,50 @@
         - A DataFrame representation of the table's content.
         """
 
-        [17 removed lines not captured in the source view]
+        try:
+            rows_data = []
+            all_images = []
+
+            for row in table.rows:
+                row_texts = []
+                row_images = []
+                for cell in row.cells:
+                    try:
+                        cell_text, cell_imgs = self.format_cell(cell)
+                        row_texts.append(cell_text)
+                        row_images.extend(cell_imgs)
+                    except Exception as e:
+                        logger.error("format_table: failed to process cell: %s", e)
+                        row_texts.append("")  # pad for column alignment
+
+                rows_data.append(row_texts)
+                all_images.extend(row_images)
+
+            if not rows_data or not rows_data[0]:
+                return None, [], pd.DataFrame()
+
+            header = rows_data[0]
+            body = rows_data[1:]
+            df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
+
+            if "markdown" in self.table_format:
+                table_text = df.to_markdown(index=False)
+                if self.table_format == "markdown_light":
+                    table_text = re.sub(r"\s{2,}", " ", table_text)
+                    table_text = re.sub(r"-{2,}", "-", table_text)
+            elif self.table_format == "csv":
+                table_text = df.to_csv(index=False)
+            elif self.table_format == "tag":
+                table_text = self.table_tag.format(self.table_tag_index)
+                self.table_tag_index += 1
+            else:
+                raise ValueError(f"Unknown table format {self.table_format}")
+
+            return table_text, all_images, df
 
-        [1 removed line not captured in the source view]
+        except Exception as e:
+            logger.error("format_table: failed to format table: %s", e)
+            return None, [], pd.DataFrame()
 
     @staticmethod
     def apply_text_style(style: str, text: str, level: int = 0) -> str:
@@ -841,30 +888,39 @@ class DocxReader:
         self._prev_para_image_idx = 0
 
         para_idx = 0
-        [1 removed line not captured in the source view]
         for child in self.document.element.body.iterchildren():
-            [18 removed lines not captured in the source view]
+            try:
+                if isinstance(child, CT_P):
+                    paragraph = Paragraph(child, self.document)
+                    paragraph_text, paragraph_images = self.format_paragraph(paragraph)
+
+                    if extract_text:
+                        try:
+                            self._extract_para_text(
+                                paragraph,
+                                paragraph_text,
+                                base_unified_metadata,
+                                text_depth,
+                                para_idx,
+                            )
+                        except Exception as e:
+                            logger.error("extract_data: _extract_para_text failed: %s", e)
+
+                    if (extract_images or extract_charts or extract_tables) and paragraph_images:
+                        self._pending_images += [
+                            (image, para_idx, "", base_unified_metadata) for image in paragraph_images
+                        ]
+                        self.images.extend(paragraph_images)
+
+                elif isinstance(child, CT_Tbl):
+                    if extract_tables or extract_charts:
+                        try:
+                            self._extract_table_data(child, base_unified_metadata)
+                        except Exception as e:
+                            logger.error("extract_data: _extract_table_data failed: %s", e)
 
-            [2 removed lines not captured in the source view]
-            self._extract_table_data(child, base_unified_metadata)
+            except Exception as e:
+                logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
 
             para_idx += 1
 
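A note on the markdown_light branch above: it only collapses the alignment padding that DataFrame.to_markdown emits. A standalone sketch of that post-processing step, with invented sample data:

```python
import re

import pandas as pd

df = pd.DataFrame({"name": ["alpha", "b"], "value": [1, 22]})
table_text = df.to_markdown(index=False)  # uses the tabulate package under the hood

# Collapse the runs of spaces and dashes that to_markdown inserts for column alignment.
table_text = re.sub(r"\s{2,}", " ", table_text)
table_text = re.sub(r"-{2,}", "-", table_text)
print(table_text)
# | name | value |
# |:-|-:|
# | alpha | 1 |
# | b | 22 |
```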
nv_ingest_api/internal/extract/html/html_extractor.py (new file, +84):

@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+import uuid
+from typing import Optional, Dict, Any, Union, Tuple, List
+
+import pandas as pd
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
+from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+from nv_ingest_api.util.schema.schema_validator import validate_schema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+from markitdown.converters import HtmlConverter
+
+logger = logging.getLogger(__name__)
+
+
+@unified_exception_handler
+def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
+    metadata = row.get("metadata")
+    html_content = row.get("content")
+
+    if html_content:
+        html_converter = HtmlConverter()
+        md_content = html_converter.convert_string(html_content=html_content).text_content
+        metadata["content"] = md_content
+
+    return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
+
+
+def extract_markdown_from_html_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    extraction_config: HtmlExtractorSchema,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
+    """
+    Processes a pandas DataFrame containing HTML file content, extracting html as text from
+    each document and converting it to markdown.
+
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        The input DataFrame containing html files as raw text. Expected columns include
+        'source_id' and 'content'.
+    task_config : Union[Dict[str, Any], BaseModel]
+        Configuration instructions for the document processing task. This can be provided as a
+        dictionary or a Pydantic model.
+    extraction_config : Any
+        A configuration object for document extraction that guides the extraction process.
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        An optional dictionary containing trace information for debugging or logging.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the original html content converted to markdown. The resulting
+        DataFrame contains the columns "document_type", "metadata", and "uuid".
+
+    Raises
+    ------
+    Exception
+        If an error occurs during the document extraction process, the exception is logged and
+        re-raised.
+    """
+
+    # Apply the decode_and_extract function to each row in the DataFrame.
+    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
+
+    # Explode any list results and drop missing values.
+    sr_extraction = sr_extraction.explode().dropna()
+
+    # Convert the extraction results to a DataFrame if available.
+    if not sr_extraction.empty:
+        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
+    else:
+        extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+    return extracted_df, {}
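A minimal usage sketch of the new entry point. The ledger below is invented for illustration: in the real pipeline each row carries a full nv-ingest metadata payload, the bare metadata dict here may not pass MetadataSchema validation, and HtmlExtractorSchema() is assumed to be constructible with defaults:

```python
import pandas as pd

from nv_ingest_api.internal.extract.html.html_extractor import (
    extract_markdown_from_html_internal,
)
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema

# One toy HTML document with a minimal metadata dict.
df_ledger = pd.DataFrame(
    {
        "source_id": ["doc-0"],
        "content": ["<h1>Title</h1><p>Hello <b>world</b>.</p>"],
        "metadata": [{"content": ""}],
    }
)

# Same (DataFrame, dict) return shape as the other extractors in this release.
extracted_df, trace = extract_markdown_from_html_internal(
    df_ledger,
    task_config={},
    extraction_config=HtmlExtractorSchema(),
)
```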
nv_ingest_api/internal/extract/image/chart_extractor.py (+3 -3):

@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
 PADDLE_MIN_WIDTH = 32
 PADDLE_MIN_HEIGHT = 32
 
-logger = logging.getLogger(f"
+logger = logging.getLogger(f"ray.{__name__}")
 
 
 def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )
@@ -88,7 +88,7 @@ def _run_chart_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
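The stage_name labels introduced above all feed the same client call shape; a sketch of the pattern, where paddle_client, data_paddle, and trace_info stand for objects created elsewhere in this module (the removed stage_name values are truncated in the source view, so their 25.4.2 form is unknown):

```python
paddle_results = paddle_client.infer(
    data=data_paddle,
    model_name="paddle",
    stage_name="chart_extraction",  # 25.6.1 tags each call with its pipeline stage
    max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
    trace_info=trace_info,
)
```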
nv_ingest_api/internal/extract/image/image_extractor.py (+5 -5):

@@ -16,7 +16,7 @@ import pandas as pd
 from pydantic import BaseModel
 
 from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
-from nv_ingest_api.internal.schemas.extract.extract_image_schema import
+from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 
 logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def _decode_and_extract_from_image(
     base64_row: pd.Series,
     task_config: Dict[str, Any],
-    validated_extraction_config:
+    validated_extraction_config: ImageConfigSchema,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> Any:
     """
@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(
 
     logger.debug(
         f"decode_and_extract: Extracting image content using image_extraction_config: "
-        f"{validated_extraction_config
+        f"{validated_extraction_config}"
     )
-    if validated_extraction_config
-        extract_params["image_extraction_config"] = validated_extraction_config
+    if validated_extraction_config is not None:
+        extract_params["image_extraction_config"] = validated_extraction_config
 
     if execution_trace_log is not None:
         extract_params["trace_info"] = execution_trace_log
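The guard above now compares against None explicitly; the removed condition is truncated in the source view, so its exact 25.4.2 form is unknown. A small sketch of the pattern:

```python
extract_params = {}

# validated_extraction_config may legitimately be None when no image
# extraction config was supplied; an explicit None check attaches any
# non-None config object, even one whose fields all hold falsy defaults.
if validated_extraction_config is not None:
    extract_params["image_extraction_config"] = validated_extraction_config
```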
nv_ingest_api/internal/extract/image/image_helpers/common.py (+1 -1):

@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=trace_info,
-        stage_name="
+        stage_name="pdf_extraction",
     )
 
     # Process each result along with its corresponding image.
nv_ingest_api/internal/extract/image/infographic_extractor.py (+1 -1):

@@ -100,7 +100,7 @@ def _update_infographic_metadata(
     paddle_results = paddle_client.infer(
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="infographic_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/image/table_extractor.py (+2 -2):

@@ -81,7 +81,7 @@ def _run_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )
@@ -89,7 +89,7 @@ def _run_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py (+2 -2):

@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
     inference_results = nemoretriever_parse_client.infer(
         data=data,
         model_name="nemoretriever_parse",
-        stage_name="
+        stage_name="pdf_extraction",
         max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
         execution_trace_log=execution_trace_log,
     )
@@ -476,7 +476,7 @@ def _extract_text_and_bounding_boxes(
 
 def _create_clients(nemoretriever_parse_config):
     model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
-        model_name=nemoretriever_parse_config.
+        model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
     )
     nemoretriever_parse_client = create_inference_client(
         nemoretriever_parse_config.nemoretriever_parse_endpoints,
nv_ingest_api/internal/extract/pdf/engines/pdfium.py (+1 -1):

@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=execution_trace_log,
-        stage_name="
+        stage_name="pdf_extraction",
     )
 
     # Process results: iterate over each image's inference output.