nv-ingest-api 2025.5.23.dev20250523__py3-none-any.whl → 2025.5.25.dev20250525__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import uuid
8
+ from typing import Optional, Dict, Any, Union, Tuple, List
9
+
10
+ import pandas as pd
11
+
12
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
13
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
14
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
15
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
16
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
17
+
18
+ from markitdown.converters import HtmlConverter
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @unified_exception_handler
24
+ def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
25
+ metadata = row.get("metadata")
26
+ html_content = row.get("content")
27
+
28
+ if html_content:
29
+ html_converter = HtmlConverter()
30
+ md_content = html_converter.convert_string(html_content=html_content).text_content
31
+ metadata["content"] = md_content
32
+
33
+ return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
34
+
35
+
36
+ def extract_markdown_from_html_internal(
37
+ df_extraction_ledger: pd.DataFrame,
38
+ task_config: Dict[str, Any],
39
+ extraction_config: HtmlExtractorSchema,
40
+ execution_trace_log: Optional[Dict[str, Any]] = None,
41
+ ) -> Tuple[pd.DataFrame, Union[Dict, None]]:
42
+ """
43
+ Processes a pandas DataFrame containing HTML file content, extracting html as text from
44
+ each document and converting it to markdown.
45
+
46
+ Parameters
47
+ ----------
48
+ df_extraction_ledger : pd.DataFrame
49
+ The input DataFrame containing html files as raw text. Expected columns include
50
+ 'source_id', 'content', and 'metadata'.
51
+ task_config : Union[Dict[str, Any], BaseModel]
52
+ Configuration instructions for the document processing task. This can be provided as a
53
+ dictionary or a Pydantic model.
54
+ extraction_config : HtmlExtractorSchema
55
+ A configuration object for document extraction that guides the extraction process.
56
+ execution_trace_log : Optional[Dict[str, Any]], default=None
57
+ An optional dictionary containing trace information for debugging or logging.
58
+
59
+ Returns
60
+ -------
61
+ Tuple[pd.DataFrame, Union[Dict, None]]
62
+ A DataFrame with the original html content converted to markdown (columns
63
+ "document_type", "metadata", and "uuid"), followed by an empty dictionary.
64
+
65
+ Raises
66
+ ------
67
+ Exception
68
+ If an error occurs during the document extraction process, the exception is logged and
69
+ re-raised.
70
+ """
71
+
72
+ # Apply the _convert_html function to each row in the DataFrame.
73
+ sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
74
+
75
+ # Explode any list results and drop missing values.
76
+ sr_extraction = sr_extraction.explode().dropna()
77
+
78
+ # Convert the extraction results to a DataFrame if available.
79
+ if not sr_extraction.empty:
80
+ extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
81
+ else:
82
+ extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
83
+
84
+ return extracted_df, {}
@@ -0,0 +1,34 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class HtmlExtractorSchema(BaseModel):
14
+ """
15
+ Configuration schema for the Html extractor settings.
16
+
17
+ Parameters
18
+ ----------
19
+ max_queue_size : int, default=1
20
+ The maximum number of items allowed in the processing queue.
21
+
22
+ n_workers : int, default=16
23
+ The number of worker threads to use for processing.
24
+
25
+ raise_on_failure : bool, default=False
26
+ A flag indicating whether to raise an exception on processing failure.
27
+
28
+ """
29
+
30
+ max_queue_size: int = 1
31
+ n_workers: int = 16
32
+ raise_on_failure: bool = False
33
+
34
+ model_config = ConfigDict(extra="forbid")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.23.dev20250523
3
+ Version: 2025.5.25.dev20250525
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -17,6 +17,8 @@ nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TIm
17
17
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
18
18
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
19
19
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=FOZZBD9gRRAr93qgK_L6o9xVBYD-6EE5-xI2-cWKvzo,33713
20
+ nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
+ nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
20
22
  nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
23
  nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
22
24
  nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
@@ -68,6 +70,7 @@ nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEcz
68
70
  nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=VVppZgV1lnyJCTfADexzoj3V0lOSq3t6Dw_6VhIxZ7k,3771
69
71
  nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=iu8lHQC0zbBB9VRK7PZisAVzpeSpFqjcXRAnwZ9OzoM,4301
70
72
  nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=M2N7WjMNvSemHcJHWeNUD_kFG0wC5VE2W3K6SVrJqvA,3761
73
+ nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
71
74
  nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=GC4xV8Z9TPLOuxlEtf2fbklSSp8ETGMrDpZgMQ02UwA,3766
72
75
  nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=rl_hFDoJaJLTKbtnEpDSBj-73KQL9aUEVKGiW0IdXiU,3991
73
76
  nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=G9g1lEORmryUWTzDyZ0vHAuPnVMK7VaRx0E4xzmAw3Q,6589
@@ -147,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
147
150
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
148
151
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
152
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
150
- nv_ingest_api-2025.5.23.dev20250523.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
151
- nv_ingest_api-2025.5.23.dev20250523.dist-info/METADATA,sha256=x227_7zGVySv3eRjuzNvvbrdSGpzJkOVBhXDzaDJqos,13919
152
- nv_ingest_api-2025.5.23.dev20250523.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
153
- nv_ingest_api-2025.5.23.dev20250523.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
154
- nv_ingest_api-2025.5.23.dev20250523.dist-info/RECORD,,
153
+ nv_ingest_api-2025.5.25.dev20250525.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
154
+ nv_ingest_api-2025.5.25.dev20250525.dist-info/METADATA,sha256=TAgrQPoouk_SAT4TQLxv2vopf5YK5wZPMseFjFMBjTU,13919
155
+ nv_ingest_api-2025.5.25.dev20250525.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
156
+ nv_ingest_api-2025.5.25.dev20250525.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
157
+ nv_ingest_api-2025.5.25.dev20250525.dist-info/RECORD,,