nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import functools
|
|
9
|
+
import uuid
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import Dict
|
|
12
|
+
from typing import Optional
|
|
13
|
+
from typing import Tuple
|
|
14
|
+
import base64
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
18
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.parakeet import create_audio_inference_client
|
|
19
|
+
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
20
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema, AudioMetadataSchema
|
|
21
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
22
|
+
from nv_ingest_api.util.schema.schema_validator import validate_schema
|
|
23
|
+
from nv_ingest_api.interface.utility import read_file_as_base64
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@unified_exception_handler
def _extract_from_audio(row: pd.Series, audio_client: Any, trace_info: Dict, segment_audio: bool = False) -> list:
    """
    Extract an audio transcript for a single ledger row.

    The row's ``metadata["content"]`` is expected to hold either base64-encoded
    audio, or a base64-encoded *file path* pointing at the audio on disk. If the
    decoded content is an existing path, the file is read as base64 (and the
    file is deleted afterwards) before transcription.

    Parameters
    ----------
    row : pd.Series
        A row from the DataFrame containing metadata for the audio extraction.
    audio_client : Any
        The client used to call the audio inference model.
    trace_info : Dict
        Trace information used for logging or debugging.
    segment_audio : bool, optional
        When True, one output row is emitted per transcript segment (with
        start/end times); otherwise a single row holds the full transcript.

    Returns
    -------
    list
        A list of ``[document_type, metadata, uuid]`` triples. The unmodified
        row is returned (as a single-element list) when the content is empty or
        the content type is not audio.

    Raises
    ------
    ValueError
        If critical information (such as metadata) is missing from the row.
    """
    metadata = row.get("metadata")

    if metadata is None:
        logger.error("Row does not contain 'metadata'.")
        raise ValueError("Row does not contain 'metadata'.")

    base64_audio = metadata.pop("content")
    base64_file_path = base64_audio
    try:
        if not base64_file_path:
            return [row.to_list()]
        # The content may be a base64-encoded file path rather than raw audio.
        base64_file_path = base64.b64decode(base64_file_path).decode("utf-8")
        if not base64_file_path:
            return [row.to_list()]
        if Path(base64_file_path).exists():
            base64_audio = read_file_as_base64(base64_file_path)
    except (UnicodeDecodeError, base64.binascii.Error):
        # Content is raw base64 audio, not an encoded path; use it as-is.
        pass

    content_metadata = metadata.get("content_metadata", {})

    # Only extract transcript if content type is audio
    if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
        return [row.to_list()]

    logger.debug(f"Removing file {base64_file_path}")
    # FIX: base64_file_path may hold decoded (or raw base64) audio content
    # rather than a valid path; Path()/unlink() can then raise ValueError
    # (embedded NUL) or OSError (e.g. name too long). Cleanup is best-effort.
    try:
        Path(base64_file_path).unlink(missing_ok=True)
    except (OSError, ValueError):
        pass

    # Get the result from the inference model
    segments, transcript = audio_client.infer(
        base64_audio,
        model_name="parakeet",
        trace_info=trace_info,  # traceable_func arg
        stage_name="audio_extraction",
    )

    extracted_data = []
    if segment_audio:
        for segment in segments:
            segment_metadata = metadata.copy()
            audio_metadata = {"audio_transcript": segment["text"]}
            segment_metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
            # FIX: build a fresh content_metadata per segment. The previous
            # in-place mutation aliased one shared dict across all segments and
            # leaked the last segment's times into the caller's row metadata.
            segment_metadata["content_metadata"] = {
                **content_metadata,
                "start_time": segment["start"],
                "end_time": segment["end"],
            }

            extracted_data.append(
                [
                    ContentTypeEnum.AUDIO,
                    validate_schema(segment_metadata, MetadataSchema).model_dump(),
                    str(uuid.uuid4()),
                ]
            )
    else:
        audio_metadata = {"audio_transcript": transcript}
        metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
        extracted_data.append(
            [ContentTypeEnum.AUDIO, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]
        )

    return extracted_data
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def extract_text_from_audio_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: AudioExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts audio data from a DataFrame.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which audio data is to be extracted.
    task_config : Dict[str, Any]
        Dictionary containing task properties and configurations; audio-specific
        overrides are read from ``params.extract_audio_params``.
    extraction_config : AudioExtractorSchema
        The validated configuration object for audio extraction.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing the extracted-primitives DataFrame (columns
        "document_type", "metadata", "uuid") and the trace information.

    Raises
    ------
    Exception
        If any error occurs during the audio data extraction process.
    """
    logger.debug(f"Entering audio extraction stage with {len(df_extraction_ledger)} rows.")

    extract_params = task_config.get("params", {}).get("extract_audio_params", {})
    audio_extraction_config = extraction_config.audio_extraction_config

    grpc_endpoint = extract_params.get("grpc_endpoint") or audio_extraction_config.audio_endpoints[0]
    http_endpoint = extract_params.get("http_endpoint") or audio_extraction_config.audio_endpoints[1]
    infer_protocol = extract_params.get("infer_protocol") or audio_extraction_config.audio_infer_protocol
    auth_token = extract_params.get("auth_token") or audio_extraction_config.auth_token
    function_id = extract_params.get("function_id") or audio_extraction_config.function_id
    ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert

    # FIX: boolean flags previously used `params.get(x) or config.x`, which
    # silently ignored an explicit False override in the task params. Fall back
    # to the config value only when the param is absent (None).
    use_ssl = extract_params.get("use_ssl")
    if use_ssl is None:
        use_ssl = audio_extraction_config.use_ssl
    segment_audio = extract_params.get("segment_audio")
    if segment_audio is None:
        segment_audio = audio_extraction_config.segment_audio

    parakeet_client = create_audio_inference_client(
        (grpc_endpoint, http_endpoint),
        infer_protocol=infer_protocol,
        auth_token=auth_token,
        function_id=function_id,
        use_ssl=use_ssl,
        ssl_cert=ssl_cert,
    )

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    try:
        # Create a partial function to extract using the provided configurations.
        _extract_from_audio_partial = functools.partial(
            _extract_from_audio,
            audio_client=parakeet_client,
            trace_info=execution_trace_log,
            segment_audio=segment_audio,
        )

        # Apply the _extract_from_audio_partial function to each row in the DataFrame
        extraction_series = df_extraction_ledger.apply(_extract_from_audio_partial, axis=1)

        # Explode the results if the extraction returns lists.
        extraction_series = extraction_series.explode().dropna()

        # Convert the extracted results into a DataFrame.
        if not extraction_series.empty:
            extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
        else:
            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

        return extracted_df, execution_trace_log

    except Exception as e:
        # logger.exception already records the traceback; exc_info was redundant.
        logger.exception(f"Error occurred while extracting audio data: {e}")

        raise
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import functools
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Optional, Dict, Any, Union, Tuple
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docx_helper import python_docx
|
|
16
|
+
from nv_ingest_api.internal.extract.pdf.engines.pdfium import pdfium_extractor
|
|
17
|
+
from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import convert_stream_with_libreoffice
|
|
18
|
+
from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
|
|
19
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _prepare_task_props(
    task_config: Union[Dict[str, Any], BaseModel], base64_row: pd.Series
) -> Tuple[Dict[str, Any], Optional[str]]:
    """
    Prepares the task properties by converting a Pydantic model to a dictionary (if needed)
    and injecting row-specific data.

    Parameters
    ----------
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing instructions and parameters for extraction.
    base64_row : pd.Series
        A Series representing a row from the DataFrame that contains at least the "content"
        key and optionally "source_id".

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        A tuple where the first element is the prepared task properties dictionary with the key
        "row_data" added under its "params" key, and the second element is the source_id (if present),
        otherwise None.
    """
    # FIX: the return annotation previously used a tuple literal
    # `(Dict[str, Any], Optional[str])`, which is not a valid type expression;
    # it is now `Tuple[Dict[str, Any], Optional[str]]`.
    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()
    else:
        # Shallow-copy so the caller's dict is not mutated by the injection below.
        task_config = dict(task_config)

    # Extract all row data except the "content" field and stash it in params.
    row_data = base64_row.drop(labels=["content"], errors="ignore")
    task_config.setdefault("params", {})["row_data"] = row_data

    source_id = base64_row.get("source_id", None)

    return task_config, source_id
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@unified_exception_handler
def _decode_and_extract_from_docx(
    base64_row: pd.Series,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Decodes base64 content from a DataFrame row and extracts data using the specified extraction method.

    The function decodes the base64-encoded content from the "content" key in the row, prepares
    extraction parameters (including additional row data and configuration), and invokes the
    extraction routine for the configured method: "python_docx" extracts directly from the DOCX
    stream, while "render_as_pdf" first converts the document to PDF via LibreOffice and then
    delegates to the PDF extractor.

    Parameters
    ----------
    base64_row : pd.Series
        A Series containing the base64-encoded content under the key "content" and optionally a "source_id".
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions and parameters.
        Expected to have a "params" key for additional parameters and optionally a "method" key
        specifying the extraction method (defaults to "python_docx").
    extraction_config : Any
        A configuration object that contains extraction-specific settings, such as `docx_extraction_config`.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        A dictionary containing trace information for debugging or logging.

    Returns
    -------
    Any
        The extracted data.

    Raises
    ------
    ValueError
        If an unsupported extraction method is configured.
    """
    # Prepare task properties and extract source_id
    task_config, source_id = _prepare_task_props(task_config, base64_row)

    # Retrieve base64 content and decode it into a byte stream.
    base64_content: str = base64_row["content"]
    doc_bytes: bytes = base64.b64decode(base64_content)
    doc_stream: io.BytesIO = io.BytesIO(doc_bytes)

    extract_method = task_config.get("method", "python_docx")
    extract_params: Dict[str, Any] = task_config.get("params", {})

    # Extract the boolean content flags. FIX: the previous try/except KeyError
    # around these pops was dead code -- dict.pop() with a default never raises
    # KeyError.
    extract_text = extract_params.pop("extract_text", False)
    extract_images = extract_params.pop("extract_images", False)
    extract_tables = extract_params.pop("extract_tables", False)
    extract_charts = extract_params.pop("extract_charts", False)
    extract_infographics = extract_params.pop("extract_infographics", False)

    # Inject configuration and trace info into extraction parameters.
    if getattr(extraction_config, "docx_extraction_config", None) is not None:
        extract_params["docx_extraction_config"] = extraction_config.docx_extraction_config

    if execution_trace_log is not None:
        extract_params["trace_info"] = execution_trace_log

    if extract_method == "render_as_pdf":
        # Convert the DOCX stream to PDF and delegate to the PDF extractor.
        pdf_stream = convert_stream_with_libreoffice(doc_stream, "docx", "pdf")

        pdf_extract_method = extract_params.get("pdf_extract_method", "pdfium")
        pdf_extractor_config = extract_params.copy()
        pdf_extractor_config["extract_method"] = pdf_extract_method
        if getattr(extraction_config, "pdfium_config", None) is not None:
            pdf_extractor_config["pdfium_config"] = extraction_config.pdfium_config

        extracted_data: Any = pdfium_extractor(
            pdf_stream=pdf_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extract_page_as_image=False,
            extractor_config=pdf_extractor_config,
            execution_trace_log=None,
        )

    elif extract_method == "python_docx":
        extracted_data: Any = python_docx(
            docx_stream=doc_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extraction_config=extract_params,
            execution_trace_log=None,
        )
    else:
        raise ValueError(f"Unsupported DOCX extraction method: {extract_method}")

    return extracted_data
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@unified_exception_handler
def extract_primitives_from_docx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: DocxExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Extract primitives from base64-encoded DOCX documents in a ledger DataFrame.

    Each row of ``df_extraction_ledger`` is decoded and run through the DOCX
    extraction routine; per-row results (which may be lists) are exploded into
    individual primitive rows with columns "document_type", "metadata", and
    "uuid".

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Input DataFrame containing DOCX files in base64 encoding. Expected
        columns include 'source_id' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        Configuration instructions for the document processing task, as a
        dictionary or Pydantic model.
    extraction_config : DocxExtractorSchema
        A configuration object for document extraction that guides the
        extraction process.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary containing trace information for debugging or
        logging.

    Returns
    -------
    Tuple[pd.DataFrame, Union[Dict, None]]
        The extracted-primitives DataFrame ("document_type", "metadata",
        "uuid") and an (empty) trace-info dictionary.

    Raises
    ------
    Exception
        Propagated (via the unified exception handler) if extraction fails.
    """
    # Bind the static configuration so the per-row callable needs only the row.
    row_extractor = functools.partial(
        _decode_and_extract_from_docx,
        task_config=task_config,
        extraction_config=extraction_config,
        execution_trace_log=execution_trace_log,
    )

    # Run extraction per row, then flatten list results and discard empties.
    exploded = df_extraction_ledger.apply(row_extractor, axis=1).explode().dropna()

    result_columns = ["document_type", "metadata", "uuid"]
    if exploded.empty:
        extracted_df = pd.DataFrame({column: [] for column in result_columns})
    else:
        extracted_df = pd.DataFrame(exploded.to_list(), columns=result_columns)

    return extracted_df, {}
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
# pylint: disable=too-many-locals
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
from typing import IO, Optional, List
|
|
22
|
+
|
|
23
|
+
from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
|
|
24
|
+
from nv_ingest_api.internal.enums.common import TextTypeEnum
|
|
25
|
+
from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docxreader import DocxReader
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def python_docx(
    *,
    docx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Extract content from a DOCX bytestream using python-docx.

    A document has three levels - document, paragraphs and runs. To align with the
    pdf extraction, paragraphs are aliased as blocks. python-docx leaves the page
    number and line number to the renderer, so the entire document is treated as a
    single page.

    Run-level parsing has been skipped but can be added as needed.

    Parameters
    ----------
    docx_stream : IO
        Byte stream containing the DOCX document.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Specifies whether to extract infographics.
    extract_tables : bool
        Specifies whether to extract tables.
    extract_charts : bool
        Specifies whether to extract charts.
    extraction_config : dict
        Configuration parameters for the extraction process. Must contain
        "row_data" (a pandas Series with at least a "source_id" entry); may
        contain "text_depth" and "docx_extraction_config".
    execution_trace_log : list, optional
        A list for accumulating trace information during extraction. Currently
        unused in this helper. Defaults to None.

    Returns
    -------
    list
        The value returned by ``DocxReader.extract_data`` — presumably one
        (document_type, metadata, uuid) record per extracted primitive, matching
        how callers build their result DataFrame; confirm against DocxReader.

    Raises
    ------
    KeyError
        If "row_data" is missing from ``extraction_config`` or lacks "source_id".
    ValueError
        If "text_depth" is not a valid ``TextTypeEnum`` value.
    """
    # Trace log is accepted for signature parity with sibling extractors but is
    # not consumed here.
    _ = execution_trace_log

    row_data = extraction_config.get("row_data")
    # Source identifier is required; python-docx does not preserve filenames.
    source_id = row_data["source_id"]

    # Granularity of text extraction; defaults to whole-document text.
    text_depth = TextTypeEnum(extraction_config.get("text_depth", "document"))

    metadata_col = "metadata"
    docx_extractor_config = extraction_config.get("docx_extraction_config", {})

    # row_data is a pandas Series (membership is tested on .index); fall back to
    # an empty metadata dict when the column is absent.
    base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}

    # Pull source-level fields through, assuming they arrive via source_metadata.
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    # python-docx doesn't maintain filename; re-use source_id as the source name.
    source_metadata = {
        "source_name": source_id,
        "source_id": source_id,
        "source_location": source_location,
        "source_type": DocumentTypeEnum.DOCX,
        "collection_id": collection_id,
        "partition_id": partition_id,
        "access_level": access_level,
        "summary": "",
    }

    # Extract data from the document using python-docx.
    doc = DocxReader(docx_stream, source_metadata, extraction_config=docx_extractor_config)
    extracted_data = doc.extract_data(
        base_unified_metadata,
        text_depth=text_depth,
        extract_text=extract_text,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extract_infographics=extract_infographics,
        extract_images=extract_images,
    )

    return extracted_data
|