nv-ingest-api 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.24.dev20250524__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- {nv_ingest_api-2025.5.22.dev20250522.dist-info → nv_ingest_api-2025.5.24.dev20250524.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.5.22.dev20250522.dist-info → nv_ingest_api-2025.5.24.dev20250524.dist-info}/RECORD +8 -5
- {nv_ingest_api-2025.5.22.dev20250522.dist-info → nv_ingest_api-2025.5.24.dev20250524.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.5.22.dev20250522.dist-info → nv_ingest_api-2025.5.24.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.5.22.dev20250522.dist-info → nv_ingest_api-2025.5.24.dev20250524.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import uuid
|
|
8
|
+
from typing import Optional, Dict, Any, Union, Tuple, List
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
13
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
|
|
14
|
+
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
|
|
15
|
+
from nv_ingest_api.util.schema.schema_validator import validate_schema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
17
|
+
|
|
18
|
+
from markitdown.converters import HtmlConverter
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@unified_exception_handler
def _convert_html(row: pd.Series, execution_trace_log: Optional[Dict[str, Any]] = None) -> List[List[Any]]:
    """
    Convert the HTML held in one ledger row to markdown and package it as a text extraction.

    Parameters
    ----------
    row : pd.Series
        A single row of the extraction ledger. The "content" entry is read as the raw HTML
        string and the "metadata" entry as the metadata mapping to validate.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Trace container forwarded by the caller. It is accepted for signature compatibility and
        is neither read nor written in this function.

    Returns
    -------
    List[List[Any]]
        A one-element list holding ``[ContentTypeEnum.TEXT, validated_metadata, uuid_string]``,
        where ``validated_metadata`` is the ``model_dump()`` of the metadata validated against
        ``MetadataSchema`` and ``uuid_string`` is a freshly generated UUID4.
    """
    metadata = row.get("metadata")
    html_content = row.get("content")

    # Rows without HTML content skip conversion; their metadata is validated unchanged.
    if html_content:
        md_content = HtmlConverter().convert_string(html_content=html_content).text_content
        # Build a new mapping instead of assigning into ``metadata``: the row hands back the very
        # object stored in the caller's DataFrame, so an in-place write would silently alter the
        # input ledger.
        metadata = {**metadata, "content": md_content}

    return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_markdown_from_html_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: HtmlExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Convert the HTML content of every row in the extraction ledger to markdown.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing html files as raw text. Each row is expected to provide
        a 'content' entry (the HTML string) and a 'metadata' entry.
    task_config : Dict[str, Any]
        Configuration instructions for the document processing task. Not consulted by this
        function; accepted to keep the extractor signature.
    extraction_config : HtmlExtractorSchema
        Configuration object for the HTML extractor. Not consulted by this function; accepted
        to keep the extractor signature.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary of trace information, forwarded to the per-row converter.

    Returns
    -------
    Tuple[pd.DataFrame, Union[Dict, None]]
        A DataFrame with the columns "document_type", "metadata", and "uuid" holding one entry
        per converted document, paired with an empty dictionary.

    Raises
    ------
    Exception
        Errors raised while converting a row propagate in whatever form the decorated
        ``_convert_html`` helper delivers them.
    """
    result_columns = ["document_type", "metadata", "uuid"]

    # A ledger without rows has nothing to convert. Return early: a row-wise ``apply`` on an
    # empty frame may hand back a DataFrame instead of a Series, and ``DataFrame.explode()``
    # cannot be called without a column argument. ``len`` is used because DataFrame truthiness
    # is ambiguous.
    if len(df_extraction_ledger) == 0:
        return pd.DataFrame({name: [] for name in result_columns}), {}

    # Convert each row's HTML to markdown.
    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)

    # Explode any list results and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Convert the extraction results to a DataFrame if available.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=result_columns)
    else:
        extracted_df = pd.DataFrame({name: [] for name in result_columns})

    return extracted_df, {}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HtmlExtractorSchema(BaseModel):
    """
    Settings model for the HTML extractor.

    Keys that are not declared on the model are rejected (``extra="forbid"``).

    Parameters
    ----------
    max_queue_size : int, default=1
        Upper bound on the number of items held in the processing queue.

    n_workers : int, default=16
        How many worker threads are used for processing.

    raise_on_failure : bool, default=False
        Whether a processing failure should raise an exception.
    """

    # Reject any key that is not one of the fields declared below.
    model_config = ConfigDict(extra="forbid")

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False
|
|
@@ -17,6 +17,8 @@ nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TIm
|
|
|
17
17
|
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
|
|
18
18
|
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
|
|
19
19
|
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=FOZZBD9gRRAr93qgK_L6o9xVBYD-6EE5-xI2-cWKvzo,33713
|
|
20
|
+
nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
21
|
+
nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
|
|
20
22
|
nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
21
23
|
nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
|
|
22
24
|
nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
|
|
@@ -68,6 +70,7 @@ nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEcz
|
|
|
68
70
|
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=VVppZgV1lnyJCTfADexzoj3V0lOSq3t6Dw_6VhIxZ7k,3771
|
|
69
71
|
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=iu8lHQC0zbBB9VRK7PZisAVzpeSpFqjcXRAnwZ9OzoM,4301
|
|
70
72
|
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=M2N7WjMNvSemHcJHWeNUD_kFG0wC5VE2W3K6SVrJqvA,3761
|
|
73
|
+
nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
|
|
71
74
|
nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=GC4xV8Z9TPLOuxlEtf2fbklSSp8ETGMrDpZgMQ02UwA,3766
|
|
72
75
|
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=rl_hFDoJaJLTKbtnEpDSBj-73KQL9aUEVKGiW0IdXiU,3991
|
|
73
76
|
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=G9g1lEORmryUWTzDyZ0vHAuPnVMK7VaRx0E4xzmAw3Q,6589
|
|
@@ -147,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
|
|
|
147
150
|
nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
|
|
148
151
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
152
|
nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
|
|
150
|
-
nv_ingest_api-2025.5.
|
|
151
|
-
nv_ingest_api-2025.5.
|
|
152
|
-
nv_ingest_api-2025.5.
|
|
153
|
-
nv_ingest_api-2025.5.
|
|
154
|
-
nv_ingest_api-2025.5.
|
|
153
|
+
nv_ingest_api-2025.5.24.dev20250524.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
154
|
+
nv_ingest_api-2025.5.24.dev20250524.dist-info/METADATA,sha256=0CeeJghpDqCn7SmTB4o7Qr1DjaD7iVffLf7pyH5XXHw,13919
|
|
155
|
+
nv_ingest_api-2025.5.24.dev20250524.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
156
|
+
nv_ingest_api-2025.5.24.dev20250524.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
|
|
157
|
+
nv_ingest_api-2025.5.24.dev20250524.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|