nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.23.dev20250423.dist-info/RECORD +152 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/WHEEL +1 -1
- nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import functools
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Optional, Dict, Any, Union
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docx_helper import python_docx
|
|
16
|
+
from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
|
|
17
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _prepare_task_props(
    task_config: Union[Dict[str, Any], BaseModel], base64_row: pd.Series
) -> (Dict[str, Any], Optional[str]):
    """
    Build a per-call copy of the task configuration with the row's data injected.

    A Pydantic model is converted with ``model_dump()``; any other mapping is copied with
    ``dict()``. The nested "params" mapping is copied as well, so neither the injection of
    "row_data" performed here nor later mutation of the returned "params" dictionary
    leaks back into the object supplied by the caller.

    Parameters
    ----------
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing instructions and parameters for extraction.
    base64_row : pd.Series
        A row from the extraction DataFrame. Its "content" entry (if any) is excluded from
        the injected row data; its "source_id" entry (if any) is returned separately.

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        The prepared task properties, with the row (minus "content") stored under
        ``["params"]["row_data"]``, and the row's "source_id" or None when it is absent.
    """

    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()
    else:
        task_config = dict(task_config)

    # dict() above is only a shallow copy: without copying "params" too, the assignment
    # below would write into the caller's nested dictionary. A missing or None "params"
    # is treated as an empty mapping.
    task_config["params"] = dict(task_config.get("params") or {})

    # Pass along every field of the row except the "content" field.
    task_config["params"]["row_data"] = base64_row.drop(labels=["content"], errors="ignore")

    source_id = base64_row.get("source_id", None)

    return task_config, source_id
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@unified_exception_handler
def _decode_and_extract_from_docx(
    base64_row: pd.Series,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Decode the base64 DOCX payload of one DataFrame row and run the python-docx extractor on it.

    The "content" entry of the row is base64-decoded into an in-memory stream. Extraction
    flags are read from the task's "params" (each defaults to False when absent), and the
    remaining params, together with the optional `docx_extraction_config` and trace
    information, are forwarded to `python_docx` as its extraction configuration.

    Parameters
    ----------
    base64_row : pd.Series
        A Series containing the base64-encoded content under the key "content" and optionally a "source_id".
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions. Flags are read from
        its "params" entry: "extract_text", "extract_images", "extract_tables",
        "extract_charts" and "extract_infographics".
    extraction_config : Any
        A configuration object; if it has a non-None `docx_extraction_config` attribute, that value
        is forwarded under the "docx_extraction_config" key.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Trace information; when not None it is forwarded under the "trace_info" key.

    Returns
    -------
    Any
        Whatever `python_docx` returns for this document.

    Notes
    -----
    Error handling is delegated to the `unified_exception_handler` decorator, which is defined
    outside this module; presumably it logs and tags/re-raises failures - confirm against
    its implementation.
    """
    # Only the prepared configuration is needed here; the returned source_id is unused.
    task_config, _ = _prepare_task_props(task_config, base64_row)

    # Retrieve base64 content and decode it into a byte stream.
    base64_content: str = base64_row["content"]
    doc_stream: io.BytesIO = io.BytesIO(base64.b64decode(base64_content))

    # Pop from a private copy: the same task_config object is reused for every row, so
    # removing flags from a shared "params" dict would make later rows see the defaults.
    extract_params: Dict[str, Any] = dict(task_config.get("params", {}))

    # Flags are optional; pop() with a default never raises KeyError.
    extract_text = extract_params.pop("extract_text", False)
    extract_images = extract_params.pop("extract_images", False)
    extract_tables = extract_params.pop("extract_tables", False)
    extract_charts = extract_params.pop("extract_charts", False)
    extract_infographics = extract_params.pop("extract_infographics", False)

    # Inject configuration and trace info into extraction parameters.
    if getattr(extraction_config, "docx_extraction_config", None) is not None:
        extract_params["docx_extraction_config"] = extraction_config.docx_extraction_config

    if execution_trace_log is not None:
        extract_params["trace_info"] = execution_trace_log

    extracted_data: Any = python_docx(
        docx_stream=doc_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extraction_config=extract_params,
        # Trace data travels inside extraction_config["trace_info"] instead.
        execution_trace_log=None,
    )

    return extracted_data
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@unified_exception_handler
def extract_primitives_from_docx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: DocxExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Run DOCX extraction over every row of a DataFrame of base64-encoded documents.

    `_decode_and_extract_from_docx` is partially applied with the task configuration,
    extraction configuration and optional trace information, then applied row by row.
    The per-row results are exploded, missing values are dropped, and the remaining items
    are assembled into a new DataFrame.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing DOCX files in base64 encoding. Expected columns include
        'source_id' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        Configuration instructions for the document processing task. This can be provided as a
        dictionary or a Pydantic model.
    extraction_config : DocxExtractorSchema
        A configuration object for document extraction that guides the extraction process.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary containing trace information for debugging or logging.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the columns "document_type", "metadata", and "uuid", one row per
        extracted item. It is empty (with those columns) when the input is empty or nothing
        was extracted.

    Notes
    -----
    Error handling is delegated to the `unified_exception_handler` decorator, which is defined
    outside this module - confirm its logging/re-raise behavior there.
    """
    # On an empty frame DataFrame.apply(axis=1) may hand back a DataFrame rather than a
    # Series, and the argument-less explode() below is only valid on a Series.
    if df_extraction_ledger.empty:
        return pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

    # Create a partial function to decode and extract using the provided configurations.
    _decode_and_extract = functools.partial(
        _decode_and_extract_from_docx,
        task_config=task_config,
        extraction_config=extraction_config,
        execution_trace_log=execution_trace_log,
    )

    # Apply the decode_and_extract function to each row in the DataFrame.
    sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)

    # Explode any list results and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Each remaining item is unpacked into the three output columns.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
    else:
        extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

    return extracted_df
|
|
File without changes
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
# pylint: disable=too-many-locals
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from typing import IO, Optional, List
|
|
23
|
+
|
|
24
|
+
from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
|
|
25
|
+
from nv_ingest_api.internal.enums.common import TextTypeEnum
|
|
26
|
+
from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docxreader import DocxReader
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def python_docx(
    *,
    docx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Extract content from a DOCX byte stream using the python-docx based `DocxReader`.

    A document has three levels - document, paragraphs and runs. To align with the
    pdf extraction, paragraphs are aliased as blocks. python-docx leaves the page number
    and line number to the renderer, so the entire document is treated as a single page.
    Run level parsing is skipped but can be added as needed.

    Parameters
    ----------
    docx_stream : IO
        Byte stream of the DOCX document.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Accepted for interface compatibility; not used by this function.
    extract_tables : bool
        Specifies whether to extract tables.
    extract_charts : bool
        Specifies whether to extract charts.
    extraction_config : dict
        Configuration for the extraction. Must provide "row_data" (a row containing at least
        "source_id" and optionally "metadata"); may provide "text_depth" (defaults to
        "document") and "docx_extraction_config" (defaults to an empty dict).
    execution_trace_log : list, optional
        Accepted for interface compatibility; not used by this function. Defaults to None.

    Returns
    -------
    The value returned by `DocxReader.extract_data` - presumably the extracted items and
    their metadata; confirm against `DocxReader`.
    """

    # Both arguments are part of the signature only; nothing below reads them.
    _ = (execution_trace_log, extract_infographics)

    row = extraction_config.get("row_data")
    doc_source_id = row["source_id"]

    # Granularity of extracted text; falls back to whole-document when unspecified.
    depth = TextTypeEnum(extraction_config.get("text_depth", "document"))

    reader_config = extraction_config.get("docx_extraction_config", {})

    # Metadata already attached to the row, if any, is the base for the output metadata.
    if "metadata" in row.index:
        unified_metadata = row["metadata"]
    else:
        unified_metadata = {}

    # Location, collection, partition and access level are inherited from the incoming
    # source metadata when it provides them.
    inherited = unified_metadata.get("source_metadata", {})
    location = inherited.get("source_location", "")
    collection = inherited.get("collection_id", "")
    partition = inherited.get("partition_id", -1)
    access = inherited.get("access_level", AccessLevelEnum.UNKNOWN)

    # python-docx doesn't maintain filename; re-use source_id as the source name.
    source_metadata = {
        "source_name": doc_source_id,
        "source_id": doc_source_id,
        "source_location": location,
        "source_type": DocumentTypeEnum.DOCX,
        "collection_id": collection,
        "partition_id": partition,
        "access_level": access,
        "summary": "",
    }

    reader = DocxReader(docx_stream, source_metadata, extraction_config=reader_config)

    return reader.extract_data(
        unified_metadata, depth, extract_text, extract_charts, extract_tables, extract_images
    )
|