nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic; see the package registry for more details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,202 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ import pandas as pd
8
+ import functools
9
+ import uuid
10
+ from typing import Any
11
+ from typing import Dict
12
+ from typing import Optional
13
+ from typing import Tuple
14
+ import base64
15
+ from pathlib import Path
16
+
17
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
18
+ from nv_ingest_api.internal.primitives.nim.model_interface.parakeet import create_audio_inference_client
19
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
20
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema, AudioMetadataSchema
21
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
22
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
23
+ from nv_ingest_api.interface.utility import read_file_as_base64
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
@unified_exception_handler
def _extract_from_audio(row: pd.Series, audio_client: Any, trace_info: Dict, segment_audio: bool = False) -> Dict:
    """
    Extract an audio transcript for a single ledger row via the audio inference client.

    The row's ``metadata["content"]`` is either raw base64-encoded audio or a
    base64-encoded *path* to an audio file on disk; in the latter case the file
    is read (re-encoded to base64) and then removed.

    Parameters
    ----------
    row : pd.Series
        A row from the DataFrame containing metadata for the audio extraction.

    audio_client : Any
        The client used to call the audio inference model.

    trace_info : Dict
        Trace information used for logging or debugging.

    segment_audio : bool, optional
        When True, emit one result row per transcript segment (with start/end
        times); otherwise emit a single row with the full transcript.

    Returns
    -------
    list
        A list of ``[document_type, metadata, uuid]`` triples for extracted
        transcripts, or ``[row.to_list()]`` unchanged when no extraction applies.

    Raises
    ------
    ValueError
        If critical information (such as metadata) is missing from the row.
    """

    metadata = row.get("metadata")

    if metadata is None:
        logger.error("Row does not contain 'metadata'.")
        raise ValueError("Row does not contain 'metadata'.")

    base64_audio = metadata.pop("content")

    # Detect whether the payload is a base64-encoded file path rather than raw
    # audio. Remember the *confirmed* on-disk path so that only a real temp
    # file is deleted later.
    audio_file_path = None
    try:
        if not base64_audio:
            return [row.to_list()]
        decoded_path = base64.b64decode(base64_audio).decode("utf-8")
        if not decoded_path:
            return [row.to_list()]
        if Path(decoded_path).exists():
            audio_file_path = decoded_path
            base64_audio = read_file_as_base64(decoded_path)
    except (UnicodeDecodeError, base64.binascii.Error):
        # Payload is raw base64 audio, not an encoded file path.
        pass

    content_metadata = metadata.get("content_metadata", {})

    # Only extract transcript if content type is audio
    if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
        return [row.to_list()]

    # BUGFIX: previously the (possibly raw-base64) payload string was passed to
    # Path.unlink() unconditionally; missing_ok=True only suppresses
    # FileNotFoundError, so a long raw-base64 "path" raised OSError
    # (ENAMETOOLONG) and an unrelated path could be deleted. Delete only a
    # confirmed temp file.
    if audio_file_path is not None:
        logger.debug(f"Removing file {audio_file_path}")
        Path(audio_file_path).unlink(missing_ok=True)

    # Get the result from the inference model
    segments, transcript = audio_client.infer(
        base64_audio,
        model_name="parakeet",
        trace_info=trace_info,  # traceable_func arg
        stage_name="audio_extraction",
    )

    extracted_data = []
    if segment_audio:
        for segment in segments:
            # NOTE: metadata.copy() is shallow, so "content_metadata" is shared
            # across segments; it is overwritten and dumped on every iteration,
            # which keeps each dumped result correct.
            segment_metadata = metadata.copy()
            audio_metadata = {"audio_transcript": segment["text"]}
            segment_metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
            segment_metadata["content_metadata"]["start_time"] = segment["start"]
            segment_metadata["content_metadata"]["end_time"] = segment["end"]

            extracted_data.append(
                [
                    ContentTypeEnum.AUDIO,
                    validate_schema(segment_metadata, MetadataSchema).model_dump(),
                    str(uuid.uuid4()),
                ]
            )
    else:
        audio_metadata = {"audio_transcript": transcript}
        metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
        extracted_data.append(
            [ContentTypeEnum.AUDIO, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]
        )

    return extracted_data
114
+
115
+
116
def extract_text_from_audio_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: AudioExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts audio data from a DataFrame.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which audio data is to be extracted.

    task_config : Dict[str, Any]
        Dictionary containing task properties and configurations.

    extraction_config : AudioExtractorSchema
        The validated configuration object for audio extraction.

    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing the updated DataFrame (columns "document_type",
        "metadata", "uuid") and the trace information.

    Raises
    ------
    Exception
        If any error occurs during the audio data extraction process.
    """
    logger.debug(f"Entering audio extraction stage with {len(df_extraction_ledger)} rows.")

    extract_params = task_config.get("params", {}).get("extract_audio_params", {})
    audio_extraction_config = extraction_config.audio_extraction_config

    # String-valued settings: `or` intentionally falls back on empty strings.
    grpc_endpoint = extract_params.get("grpc_endpoint") or audio_extraction_config.audio_endpoints[0]
    http_endpoint = extract_params.get("http_endpoint") or audio_extraction_config.audio_endpoints[1]
    infer_protocol = extract_params.get("infer_protocol") or audio_extraction_config.audio_infer_protocol
    auth_token = extract_params.get("auth_token") or audio_extraction_config.auth_token
    function_id = extract_params.get("function_id") or audio_extraction_config.function_id
    ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert

    # BUGFIX: boolean settings must honor an explicitly provided False.
    # The previous `get(...) or config.<attr>` silently replaced False with
    # the config default; only fall back when the parameter is absent (None).
    use_ssl = extract_params.get("use_ssl")
    if use_ssl is None:
        use_ssl = audio_extraction_config.use_ssl

    segment_audio = extract_params.get("segment_audio")
    if segment_audio is None:
        segment_audio = audio_extraction_config.segment_audio

    parakeet_client = create_audio_inference_client(
        (grpc_endpoint, http_endpoint),
        infer_protocol=infer_protocol,
        auth_token=auth_token,
        function_id=function_id,
        use_ssl=use_ssl,
        ssl_cert=ssl_cert,
    )

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    try:
        # Create a partial function to extract using the provided configurations.
        _extract_from_audio_partial = functools.partial(
            _extract_from_audio,
            audio_client=parakeet_client,
            trace_info=execution_trace_log,
            segment_audio=segment_audio,
        )

        # Apply the _extract_from_audio_partial function to each row in the DataFrame
        extraction_series = df_extraction_ledger.apply(_extract_from_audio_partial, axis=1)

        # Explode the results if the extraction returns lists.
        extraction_series = extraction_series.explode().dropna()

        # Convert the extracted results into a DataFrame.
        if not extraction_series.empty:
            extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
        else:
            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

        return extracted_df, execution_trace_log

    except Exception as e:
        logger.exception(f"Error occurred while extracting audio data: {e}", exc_info=True)

        raise
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.
@@ -0,0 +1,232 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import base64
7
+ import functools
8
+ import io
9
+ import logging
10
+ from typing import Optional, Dict, Any, Union, Tuple
11
+
12
+ import pandas as pd
13
+ from pydantic import BaseModel
14
+
15
+ from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docx_helper import python_docx
16
+ from nv_ingest_api.internal.extract.pdf.engines.pdfium import pdfium_extractor
17
+ from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import convert_stream_with_libreoffice
18
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
19
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def _prepare_task_props(
    task_config: Union[Dict[str, Any], BaseModel], base64_row: pd.Series
) -> Tuple[Dict[str, Any], Optional[str]]:
    """
    Prepares the task properties by converting a Pydantic model to a dictionary (if needed)
    and injecting row-specific data.

    Parameters
    ----------
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing instructions and parameters for extraction.
    base64_row : pd.Series
        A Series representing a row from the DataFrame that contains at least the "content"
        key and optionally "source_id".

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        A tuple where the first element is the prepared task properties dictionary with the key
        "row_data" added under its "params" key, and the second element is the source_id (if present),
        otherwise None.
    """
    # Normalize to a plain dict; copy so the caller's object is never mutated.
    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()
    else:
        task_config = dict(task_config)

    # Extract all row data except the "content" field.
    row_data = base64_row.drop(labels=["content"], errors="ignore")
    task_config.setdefault("params", {})
    task_config["params"]["row_data"] = row_data

    source_id = base64_row.get("source_id", None)

    return task_config, source_id
63
+
64
+
65
@unified_exception_handler
def _decode_and_extract_from_docx(
    base64_row: pd.Series,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Decodes base64 content from a DataFrame row and extracts data using the specified extraction method.

    The function decodes the base64-encoded content from the "content" key in the row, prepares
    extraction parameters (including additional row data and configuration), and invokes the extraction
    function from the docx module. Unhandled exceptions are tagged by the
    ``unified_exception_handler`` decorator.

    Parameters
    ----------
    base64_row : pd.Series
        A Series containing the base64-encoded content under the key "content" and optionally a "source_id".
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions and parameters.
        Expected to have a "params" key for additional parameters and optionally a "method" key
        ("python_docx" by default, or "render_as_pdf").
    extraction_config : Any
        A configuration object that contains extraction-specific settings, such as `docx_extraction_config`.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        A dictionary containing trace information for debugging or logging.

    Returns
    -------
    Any
        The extracted data, or an exception tag if extraction fails.

    Raises
    ------
    ValueError
        If the requested extraction method is not supported.
    """
    # Prepare task properties and extract source_id
    task_config, source_id = _prepare_task_props(task_config, base64_row)

    # Retrieve base64 content and decode it into a byte stream.
    base64_content: str = base64_row["content"]
    doc_bytes: bytes = base64.b64decode(base64_content)
    doc_stream: io.BytesIO = io.BytesIO(doc_bytes)

    extract_method = task_config.get("method", "python_docx")
    extract_params: Dict[str, Any] = task_config.get("params", {})

    # Consume the boolean extraction flags from params. (pop with a default
    # never raises, so no KeyError handling is needed here.)
    extract_text = extract_params.pop("extract_text", False)
    extract_images = extract_params.pop("extract_images", False)
    extract_tables = extract_params.pop("extract_tables", False)
    extract_charts = extract_params.pop("extract_charts", False)
    extract_infographics = extract_params.pop("extract_infographics", False)

    # Inject configuration and trace info into extraction parameters.
    if getattr(extraction_config, "docx_extraction_config", None) is not None:
        extract_params["docx_extraction_config"] = extraction_config.docx_extraction_config

    if execution_trace_log is not None:
        extract_params["trace_info"] = execution_trace_log

    if extract_method == "render_as_pdf":
        # Convert the DOCX stream to PDF and delegate to the PDF extractor.
        pdf_stream = convert_stream_with_libreoffice(doc_stream, "docx", "pdf")

        pdf_extract_method = extract_params.get("pdf_extract_method", "pdfium")
        pdf_extractor_config = extract_params.copy()
        pdf_extractor_config["extract_method"] = pdf_extract_method
        if getattr(extraction_config, "pdfium_config", None) is not None:
            pdf_extractor_config["pdfium_config"] = extraction_config.pdfium_config

        extracted_data: Any = pdfium_extractor(
            pdf_stream=pdf_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extract_page_as_image=False,
            extractor_config=pdf_extractor_config,
            execution_trace_log=None,
        )

    elif extract_method == "python_docx":
        extracted_data: Any = python_docx(
            docx_stream=doc_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extraction_config=extract_params,
            execution_trace_log=None,
        )
    else:
        raise ValueError(f"Unsupported DOCX extraction method: {extract_method}")

    return extracted_data
168
+
169
+
170
@unified_exception_handler
def extract_primitives_from_docx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: DocxExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Processes a pandas DataFrame containing DOCX files encoded in base64, extracting primitives
    from each document and replacing the original content with the extracted results.

    This function applies a decoding and extraction routine to each row of the input DataFrame.
    The routine is partially applied with task configuration, extraction configuration, and
    optional trace information. The results are exploded and any missing values are dropped,
    then compiled into a new DataFrame with columns for document type, metadata, and a UUID
    identifier.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing DOCX files in base64 encoding. Expected columns include
        'source_id' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        Configuration instructions for the document processing task. This can be provided as a
        dictionary or a Pydantic model.
    extraction_config : DocxExtractorSchema
        A configuration object for document extraction that guides the extraction process.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary containing trace information for debugging or logging.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple of the extraction-result DataFrame (columns "document_type", "metadata",
        "uuid") and a trace dictionary (currently always empty).

    Raises
    ------
    Exception
        If an error occurs during the document extraction process, the exception is logged and
        re-raised (via the ``unified_exception_handler`` decorator).
    """
    # Create a partial function to decode and extract using the provided configurations.
    _decode_and_extract = functools.partial(
        _decode_and_extract_from_docx,
        task_config=task_config,
        extraction_config=extraction_config,
        execution_trace_log=execution_trace_log,
    )

    # Apply the decode_and_extract function to each row in the DataFrame.
    sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)

    # Explode any list results and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Convert the extraction results to a DataFrame if available.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
    else:
        extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

    return extracted_df, {}
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,127 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ # pylint: disable=too-many-locals
19
+
20
+ import logging
21
+ from typing import IO, Optional, List
22
+
23
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
24
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
25
+ from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docxreader import DocxReader
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def python_docx(
    *,
    docx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Helper function that uses python-docx to extract content from a bytestream document.

    A document has three levels - document, paragraphs and runs. To align with the
    pdf extraction paragraphs are aliased as block. python-docx leaves the page number
    and line number to the renderer so we assume that the entire document is a single
    page.

    Run level parsing has been skipped but can be added as needed.

    Parameters
    ----------
    docx_stream : IO
        Bytestream of the DOCX document.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Specifies whether to extract infographics (currently unused here).
    extract_tables : bool
        Specifies whether to extract tables.
    extract_charts : bool
        Specifies whether to extract charts.
    extraction_config : dict
        A dictionary of configuration parameters for the extraction process.
        Must contain "row_data" (a pd.Series with at least "source_id").
    execution_trace_log : list, optional
        A list for accumulating trace information during extraction. Defaults to None
        (currently unused).

    Returns
    -------
    Any
        The extracted primitives as produced by ``DocxReader.extract_data``.
    """

    _ = execution_trace_log
    _ = extract_infographics

    row_data = extraction_config.get("row_data")
    # get source_id (KeyError here signals a malformed ledger row)
    source_id = row_data["source_id"]
    # get text_depth; defaults to document-level granularity
    text_depth = extraction_config.get("text_depth", "document")
    text_depth = TextTypeEnum(text_depth)
    # get base metadata
    metadata_col = "metadata"

    docx_extractor_config = extraction_config.get("docx_extraction_config", {})

    base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}

    # get base source_metadata
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    # get source_location
    source_location = base_source_metadata.get("source_location", "")
    # get collection_id (assuming coming in from source_metadata...)
    collection_id = base_source_metadata.get("collection_id", "")
    # get partition_id (assuming coming in from source_metadata...)
    partition_id = base_source_metadata.get("partition_id", -1)
    # get access_level (assuming coming in from source_metadata...)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    # python-docx doesn't maintain filename; re-use source_id
    source_metadata = {
        "source_name": source_id,
        "source_id": source_id,
        "source_location": source_location,
        "source_type": DocumentTypeEnum.DOCX,
        "collection_id": collection_id,
        "partition_id": partition_id,
        "access_level": access_level,
        "summary": "",
    }

    # Extract data from the document using python-docx
    doc = DocxReader(docx_stream, source_metadata, extraction_config=docx_extractor_config)
    extracted_data = doc.extract_data(
        base_unified_metadata,
        text_depth=text_depth,
        extract_text=extract_text,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extract_infographics=extract_infographics,
        extract_images=extract_images,
    )

    return extracted_data