PyPI - nv-ingest-api - Versions diffs - 2025.5.23.dev20250523__py3-none-any.whl → 2025.5.25.dev20250525__py3-none-any.whl - Mend

nv-ingest-api 2025.5.23.dev20250523__py3-none-any.whl → 2025.5.25.dev20250525__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (8) hide show

nv_ingest_api/internal/extract/html/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/extract/html/html_extractor.py ADDED Viewed

@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import uuid
+from typing import Optional, Dict, Any, Union, Tuple, List
+import pandas as pd
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
+from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+from nv_ingest_api.util.schema.schema_validator import validate_schema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+from markitdown.converters import HtmlConverter
+logger = logging.getLogger(__name__)
+@unified_exception_handler
+def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
+    """
+    Build a single text extraction record from one ledger row, converting its HTML to markdown.
+    Parameters
+    ----------
+    row : pd.Series
+        Ledger row; its "metadata" and "content" entries are read.
+    execution_trace_log : Optional[List[Any]], default=None
+        Not used by this function.
+    Returns
+    -------
+    list
+        A one-element list whose item is [ContentTypeEnum.TEXT, metadata, uuid string], where
+        metadata is the `model_dump()` of the row metadata validated against MetadataSchema.
+    """
+    row_metadata = row.get("metadata")
+    raw_html = row.get("content")
+    # Only convert when there is HTML to convert; the markdown text is written into the
+    # row's metadata object in place under the "content" key.
+    if raw_html:
+        conversion = HtmlConverter().convert_string(html_content=raw_html)
+        row_metadata["content"] = conversion.text_content
+    validated_metadata = validate_schema(row_metadata, MetadataSchema).model_dump()
+    record_id = str(uuid.uuid4())
+    return [[ContentTypeEnum.TEXT, validated_metadata, record_id]]
+def extract_markdown_from_html_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    extraction_config: HtmlExtractorSchema,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
+    """
+    Processes a pandas DataFrame containing HTML file content, converting the HTML of each
+    row to markdown and returning one text extraction record per row.
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        The input DataFrame containing html files as raw text. Each row is passed to
+        `_convert_html`, which reads its "metadata" and "content" entries.
+    task_config : Dict[str, Any]
+        Configuration instructions for the document processing task. Not read by this function.
+    extraction_config : HtmlExtractorSchema
+        Configuration object for the HTML extractor. Not read by this function.
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        Optional trace information; forwarded unchanged to `_convert_html`.
+    Returns
+    -------
+    Tuple[pd.DataFrame, Dict]
+        A DataFrame with the columns "document_type", "metadata", and "uuid" holding the
+        converted content, and an empty dictionary.
+    Raises
+    ------
+    Exception
+        Errors raised while converting a row are not handled here; they surface however
+        `_convert_html` (wrapped by `unified_exception_handler`) reports them.
+    """
+    # An empty ledger has nothing to convert. Return early: row-wise apply() on an empty
+    # frame may hand back a DataFrame instead of a Series, and DataFrame.explode() cannot
+    # be called without a column argument.
+    if df_extraction_ledger.empty:
+        return pd.DataFrame({"document_type": [], "metadata": [], "uuid": []}), {}
+    # Apply the _convert_html function to each row in the DataFrame.
+    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
+    # Explode any list results and drop missing values.
+    sr_extraction = sr_extraction.explode().dropna()
+    # Convert the extraction results to a DataFrame if available.
+    if not sr_extraction.empty:
+        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
+    else:
+        extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+    return extracted_df, {}

nv_ingest_api/internal/schemas/extract/extract_html_schema.py ADDED Viewed

@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from pydantic import ConfigDict, BaseModel
+logger = logging.getLogger(__name__)
+class HtmlExtractorSchema(BaseModel):
+    """
+    Settings model for the Html extractor.
+    Attributes
+    ----------
+    max_queue_size : int
+        The maximum number of items allowed in the processing queue. Defaults to 1.
+    n_workers : int
+        The number of workers to use for processing. Defaults to 16.
+    raise_on_failure : bool
+        Whether a processing failure should raise an exception. Defaults to False.
+    """
+    # Unknown fields are rejected rather than silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False

{nv_ingest_api-2025.5.23.dev20250523.dist-info → nv_ingest_api-2025.5.25.dev20250525.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.5.23.dev20250523
+Version: 2025.5.25.dev20250525
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_api-2025.5.23.dev20250523.dist-info → nv_ingest_api-2025.5.25.dev20250525.dist-info}/RECORD RENAMED Viewed

@@ -17,6 +17,8 @@ nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TIm
 nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
 nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
 nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=FOZZBD9gRRAr93qgK_L6o9xVBYD-6EE5-xI2-cWKvzo,33713
+nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
+nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
 nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
 nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
@@ -68,6 +70,7 @@ nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEcz
 nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=VVppZgV1lnyJCTfADexzoj3V0lOSq3t6Dw_6VhIxZ7k,3771
 nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=iu8lHQC0zbBB9VRK7PZisAVzpeSpFqjcXRAnwZ9OzoM,4301
 nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=M2N7WjMNvSemHcJHWeNUD_kFG0wC5VE2W3K6SVrJqvA,3761
+nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
 nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=GC4xV8Z9TPLOuxlEtf2fbklSSp8ETGMrDpZgMQ02UwA,3766
 nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=rl_hFDoJaJLTKbtnEpDSBj-73KQL9aUEVKGiW0IdXiU,3991
 nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=G9g1lEORmryUWTzDyZ0vHAuPnVMK7VaRx0E4xzmAw3Q,6589
@@ -147,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
 nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
-nv_ingest_api-2025.5.23.dev20250523.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest_api-2025.5.23.dev20250523.dist-info/METADATA,sha256=x227_7zGVySv3eRjuzNvvbrdSGpzJkOVBhXDzaDJqos,13919
-nv_ingest_api-2025.5.23.dev20250523.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-nv_ingest_api-2025.5.23.dev20250523.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
-nv_ingest_api-2025.5.23.dev20250523.dist-info/RECORD,,
+nv_ingest_api-2025.5.25.dev20250525.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.5.25.dev20250525.dist-info/METADATA,sha256=TAgrQPoouk_SAT4TQLxv2vopf5YK5wZPMseFjFMBjTU,13919
+nv_ingest_api-2025.5.25.dev20250525.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+nv_ingest_api-2025.5.25.dev20250525.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+nv_ingest_api-2025.5.25.dev20250525.dist-info/RECORD,,

{nv_ingest_api-2025.5.23.dev20250523.dist-info → nv_ingest_api-2025.5.25.dev20250525.dist-info}/WHEEL RENAMED Viewed

File without changes

{nv_ingest_api-2025.5.23.dev20250523.dist-info → nv_ingest_api-2025.5.25.dev20250525.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nv_ingest_api-2025.5.23.dev20250523.dist-info → nv_ingest_api-2025.5.25.dev20250525.dist-info}/top_level.txt RENAMED Viewed

File without changes

nv-ingest-api 2025.5.23.dev20250523__py3-none-any.whl → 2025.5.25.dev20250525__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 2025.5.23.dev20250523__py3-none-any.whl → 2025.5.25.dev20250525__py3-none-any.whl