nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,971 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # Copyright (c) 2024, NVIDIA CORPORATION.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ # pylint: disable=line-too-long
21
+ # pylint: disable=too-few-public-methods
22
+
23
+ import io
24
+ import logging
25
+ import re
26
+ import uuid
27
+ from typing import Dict, Optional, Union
28
+ from typing import List
29
+ from typing import Tuple
30
+
31
+ from collections import defaultdict
32
+
33
+ import pandas as pd
34
+ from docx import Document
35
+ from docx.image.constants import MIME_TYPE
36
+ from docx.image.image import Image
37
+ from docx.oxml.table import CT_Tbl
38
+ from docx.oxml.text.paragraph import CT_P
39
+ from docx.table import Table
40
+ from docx.table import _Cell
41
+ from docx.text.hyperlink import Hyperlink
42
+ from docx.text.paragraph import Paragraph
43
+ from docx.text.run import Run
44
+ from pandas import DataFrame
45
+
46
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
47
+ from nv_ingest_api.internal.extract.image.image_helpers.common import (
48
+ load_and_preprocess_image,
49
+ extract_page_elements_from_images,
50
+ )
51
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
52
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import (
53
+ ContentTypeEnum,
54
+ validate_metadata,
55
+ TextTypeEnum,
56
+ )
57
+ from nv_ingest_api.util.converters import bytetools
58
+ from nv_ingest_api.util.detectors.language import detect_language
59
+ from nv_ingest_api.util.metadata.aggregators import construct_table_and_chart_metadata, CroppedImageWithContent
60
+
61
+ PARAGRAPH_FORMATS = ["text", "markdown"]
62
+ TABLE_FORMATS = ["markdown", "markdown_light", "csv", "tag"]
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
+
67
class DocxProperties:
    """
    Parse document core properties and update metadata
    """

    def __init__(self, document: Document, source_metadata: Dict):
        """
        Copy over some of the docx core properties
        """
        self.document = document
        self.source_metadata = source_metadata

        props = self.document.core_properties

        self.title = props.title
        # Fall back to the last editor when no author is recorded.
        self.author = props.last_modified_by if props.author is None else props.author
        self.created = props.created
        self.modified = props.modified
        self.keywords = props.keywords

        self._update_source_meta_data()

    def __str__(self):
        """
        Render the captured core properties as a human-readable summary.
        """
        # Dates may be absent; format them only when present.
        created_repr = self.created.isoformat() if self.created is not None else None
        modified_repr = self.modified.isoformat() if self.modified is not None else None

        lines = [
            "Document Properties:",
            f"title: {self.title}",
            f"author: {self.author}",
            f"created: {created_repr}",
            f"modified: {modified_repr}",
            f"keywords: {self.keywords}",
            "",
        ]
        return "\n".join(lines)

    def _update_source_meta_data(self):
        """
        Update the source metadata with the document's core properties
        """
        # Only push timestamps that actually exist on the document.
        updates = {}
        if self.created is not None:
            updates["date_created"] = self.created.isoformat()
        if self.modified is not None:
            updates["last_modified"] = self.modified.isoformat()

        if updates:
            self.source_metadata.update(updates)
132
+
133
+
134
+ class DocxReader:
135
+ __doc__ = f"""
136
+ Read a docx file and extract its content as text, images and tables.
137
+
138
+ Parameters
139
+ ----------
140
+ docx :
141
+ Bytestream
142
+ paragraph_format : str
143
+ Format of the paragraphs. Supported formats are: {PARAGRAPH_FORMATS}
144
+ table_format : str
145
+ Format of the tables. Supported formats are: {TABLE_FORMATS}
146
+ handle_text_styles : bool
147
+ Whether to apply style on a paragraph (heading, list, title, subtitle).
148
+ Not recommended if the document has been converted from pdf.
149
+ image_tag : str
150
+ Tag to replace the images in the text. Must contain one placeholder for the image index.
151
+ table_tag : str
152
+ Tag to replace the tables in the text. Must contain one placeholder for the table index.
153
+ """
154
+
155
+ def __init__(
156
+ self,
157
+ docx,
158
+ source_metadata: Dict,
159
+ paragraph_format: str = "markdown",
160
+ table_format: str = "markdown",
161
+ handle_text_styles: bool = True,
162
+ image_tag="<image {}>",
163
+ table_tag="<table {}>",
164
+ extraction_config: Dict = None,
165
+ ):
166
+ if paragraph_format not in PARAGRAPH_FORMATS:
167
+ raise ValueError(f"Unknown paragraph format {paragraph_format}. Supported formats are: {PARAGRAPH_FORMATS}")
168
+
169
+ if table_format not in TABLE_FORMATS:
170
+ raise ValueError(f"Unknown table format {table_format}. Supported formats are: {TABLE_FORMATS}")
171
+
172
+ self.paragraph_format = paragraph_format
173
+ self.table_format = table_format
174
+ self.handle_text_styles = handle_text_styles
175
+ self.image_tag = image_tag
176
+ self.table_tag = table_tag
177
+
178
+ # Read docx
179
+ self.document = Document(docx)
180
+
181
+ # Get the core properties
182
+ self.properties = DocxProperties(self.document, source_metadata)
183
+ logger.debug("%s", str(self.properties))
184
+
185
+ self.trailing_space_pattern = re.compile(r"(^\s*)(.*?)(\s*$)", re.DOTALL)
186
+ self.empty_text_pattern = re.compile(r"^\s*$")
187
+ self.images = []
188
+ self.tables = []
189
+ self.image_tag_index = 1
190
+ self.table_tag_index = 1
191
+
192
+ # placeholders for metadata extraction
193
+ self._accumulated_text = []
194
+ self._extracted_data = []
195
+ self._extraction_config = extraction_config if extraction_config else {}
196
+ self._pending_images = []
197
+ self._prev_para_image_idx = 0
198
+ self._prev_para_images = []
199
+
200
+ def is_text_empty(self, text: str) -> bool:
201
+ """
202
+ Check if the given text is empty or matches the empty text pattern.
203
+
204
+ Parameters
205
+ ----------
206
+ text : str
207
+ The text to check.
208
+
209
+ Returns
210
+ -------
211
+ bool
212
+ True if the text is empty or matches the empty text pattern, False otherwise.
213
+ """
214
+
215
+ return self.empty_text_pattern.match(text) is not None
216
+
217
+ def format_text(self, text: str, bold: bool, italic: bool, underline: bool) -> str:
218
+ """
219
+ Apply markdown styling (bold, italic, underline) to the given text.
220
+
221
+ Parameters
222
+ ----------
223
+ text : str
224
+ The text to format.
225
+ bold : bool
226
+ Whether to apply bold styling.
227
+ italic : bool
228
+ Whether to apply italic styling.
229
+ underline : bool
230
+ Whether to apply underline styling.
231
+
232
+ Returns
233
+ -------
234
+ str
235
+ The formatted text with the applied styles.
236
+ """
237
+
238
+ if self.is_text_empty(text):
239
+ return text
240
+
241
+ # Exclude leading and trailing spaces from style
242
+ match = self.trailing_space_pattern.match(text)
243
+ if match:
244
+ prefix, text, suffix = match.groups()
245
+ else:
246
+ prefix, suffix = "", ""
247
+
248
+ # Apply style
249
+ if bold:
250
+ text = f"**{text}**"
251
+ if italic:
252
+ text = f"*{text}*"
253
+ if underline:
254
+ text = f"<u>{text}</u>"
255
+
256
+ # Add back leading and trailing spaces
257
+ text = prefix + text + suffix
258
+
259
+ return text
260
+
261
+ def format_paragraph(self, paragraph: "Paragraph") -> Tuple[str, List["Image"]]:
262
+ """
263
+ Format a paragraph into styled text and extract associated images.
264
+
265
+ Parameters
266
+ ----------
267
+ paragraph : Paragraph
268
+ The paragraph to format. This includes text and potentially embedded images.
269
+
270
+ Returns
271
+ -------
272
+ tuple of (str, list of Image)
273
+ - The formatted paragraph text with markdown styling applied.
274
+ - A list of extracted images from the paragraph.
275
+ """
276
+
277
+ try:
278
+ paragraph_images = []
279
+ if self.paragraph_format == "text":
280
+ return paragraph.text.strip(), paragraph_images
281
+
282
+ font = paragraph.style.font
283
+ default_style = (font.bold, font.italic, font.underline)
284
+
285
+ paragraph_text = ""
286
+ group_text = ""
287
+ previous_style = None
288
+
289
+ for c in paragraph.iter_inner_content():
290
+ try:
291
+ if isinstance(c, Hyperlink):
292
+ text = f"[{c.text}]({c.address})"
293
+ style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
294
+ elif isinstance(c, Run):
295
+ text = c.text
296
+ style = (c.bold, c.italic, c.underline)
297
+
298
+ # 1. Locate the inline shape which is stored in the <w:drawing> element.
299
+ # 2. r:embed in <a.blip> has the relationship id for extracting the file where
300
+ # the image is stored as bytes.
301
+ # Reference:
302
+ # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
303
+ inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
304
+ for r_id in inline_shapes:
305
+ text += self.image_tag.format(self.image_tag_index)
306
+ self.image_tag_index += 1
307
+ try:
308
+ image = paragraph.part.related_parts[r_id].image
309
+ paragraph_images.append(image)
310
+ except Exception as img_e:
311
+ logger.warning(
312
+ "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
313
+ r_id,
314
+ img_e,
315
+ )
316
+ else:
317
+ continue
318
+
319
+ style = tuple(s if s is not None else d for s, d in zip(style, default_style))
320
+
321
+ if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
322
+ paragraph_text += self.format_text(group_text, *previous_style)
323
+ group_text = ""
324
+
325
+ group_text += text
326
+ if not self.is_text_empty(text):
327
+ previous_style = style
328
+
329
+ except Exception as e:
330
+ logger.error("format_paragraph: failed to process run: %s", e)
331
+ continue
332
+
333
+ if group_text and previous_style:
334
+ paragraph_text += self.format_text(group_text, *previous_style)
335
+
336
+ return paragraph_text.strip(), paragraph_images
337
+
338
+ except Exception as e:
339
+ logger.error("format_paragraph: failed for paragraph: %s", e)
340
+ return "", []
341
+
342
+ def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
343
+ """
344
+ Format a table cell into Markdown text and extract associated images.
345
+
346
+ Parameters
347
+ ----------
348
+ cell : _Cell
349
+ The table cell to format.
350
+
351
+ Returns
352
+ -------
353
+ tuple of (str, list of Image)
354
+ - The formatted text of the cell with markdown styling applied.
355
+ - A list of images extracted from the cell.
356
+ """
357
+
358
+ try:
359
+ newline = "<br>" if self.paragraph_format == "markdown" else "\n"
360
+ texts, images = [], []
361
+
362
+ for p in cell.paragraphs:
363
+ try:
364
+ t, imgs = self.format_paragraph(p)
365
+ texts.append(t)
366
+ images.extend(imgs)
367
+ except Exception as e:
368
+ logger.error("format_cell: failed to format paragraph in cell: %s", e)
369
+
370
+ return newline.join(texts), images
371
+
372
+ except Exception as e:
373
+ logger.error("format_cell: failed entirely: %s", e)
374
+ return "", []
375
+
376
+ def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
377
+ """
378
+ Format a table into text, extract images, and represent it as a DataFrame.
379
+
380
+ Parameters
381
+ ----------
382
+ table : Table
383
+ The table to format.
384
+
385
+ Returns
386
+ -------
387
+ tuple of (str or None, list of Image, DataFrame)
388
+ - The formatted table as text, using the specified format (e.g., markdown, CSV).
389
+ - A list of images extracted from the table.
390
+ - A DataFrame representation of the table's content.
391
+ """
392
+
393
+ try:
394
+ rows_data = []
395
+ all_images = []
396
+
397
+ for row in table.rows:
398
+ row_texts = []
399
+ row_images = []
400
+ for cell in row.cells:
401
+ try:
402
+ cell_text, cell_imgs = self.format_cell(cell)
403
+ row_texts.append(cell_text)
404
+ row_images.extend(cell_imgs)
405
+ except Exception as e:
406
+ logger.error("format_table: failed to process cell: %s", e)
407
+ row_texts.append("") # pad for column alignment
408
+
409
+ rows_data.append(row_texts)
410
+ all_images.extend(row_images)
411
+
412
+ if not rows_data or not rows_data[0]:
413
+ return None, [], pd.DataFrame()
414
+
415
+ header = rows_data[0]
416
+ body = rows_data[1:]
417
+ df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
418
+
419
+ if "markdown" in self.table_format:
420
+ table_text = df.to_markdown(index=False)
421
+ if self.table_format == "markdown_light":
422
+ table_text = re.sub(r"\s{2,}", " ", table_text)
423
+ table_text = re.sub(r"-{2,}", "-", table_text)
424
+ elif self.table_format == "csv":
425
+ table_text = df.to_csv(index=False)
426
+ elif self.table_format == "tag":
427
+ table_text = self.table_tag.format(self.table_tag_index)
428
+ self.table_tag_index += 1
429
+ else:
430
+ raise ValueError(f"Unknown table format {self.table_format}")
431
+
432
+ return table_text, all_images, df
433
+
434
+ except Exception as e:
435
+ logger.error("format_table: failed to format table: %s", e)
436
+ return None, [], pd.DataFrame()
437
+
438
+ @staticmethod
439
+ def apply_text_style(style: str, text: str, level: int = 0) -> str:
440
+ """
441
+ Apply a specific text style (e.g., heading, list, title, subtitle) to the given text.
442
+
443
+ Parameters
444
+ ----------
445
+ style : str
446
+ The style to apply. Supported styles include headings ("Heading 1" to "Heading 9"),
447
+ list items ("List"), and document structures ("Title", "Subtitle").
448
+ text : str
449
+ The text to style.
450
+ level : int, optional
451
+ The indentation level for the styled text. Default is 0.
452
+
453
+ Returns
454
+ -------
455
+ str
456
+ The text with the specified style and indentation applied.
457
+ """
458
+
459
+ if re.match(r"^Heading [1-9]$", style):
460
+ n = int(style.split(" ")[-1])
461
+ text = f"{'#' * n} {text}"
462
+ elif style.startswith("List"):
463
+ text = f"- {text}"
464
+ elif style == "Title":
465
+ text = f"{text}\n{'=' * len(text)}"
466
+ elif style == "Subtitle":
467
+ text = f"{text}\n{'-' * len(text)}"
468
+
469
+ text = "\t" * level + text
470
+
471
+ return text
472
+
473
+ @staticmethod
474
+ def docx_content_type_to_image_type(content_type: "MIME_TYPE") -> str:
475
+ """
476
+ Convert a DOCX content type string to an image type.
477
+
478
+ Parameters
479
+ ----------
480
+ content_type : MIME_TYPE
481
+ The content type string from the image header, e.g., "image/jpeg".
482
+
483
+ Returns
484
+ -------
485
+ str
486
+ The image type extracted from the content type string.
487
+ """
488
+
489
+ return content_type.split("/")[1]
490
+
491
+ def _construct_image_metadata(
492
+ self, para_idx: int, caption: str, base_unified_metadata: Dict, base64_img: str
493
+ ) -> List[Union[str, dict]]:
494
+ """
495
+ Build metadata for an image in a DOCX file.
496
+
497
+ Parameters
498
+ ----------
499
+ para_idx : int
500
+ The paragraph index containing the image.
501
+ caption : str
502
+ The caption associated with the image.
503
+ base_unified_metadata : dict
504
+ The base metadata to build upon.
505
+ base64_img : str
506
+ The image content encoded as a base64 string.
507
+
508
+ Returns
509
+ -------
510
+ list
511
+ A list containing the content type, validated metadata, and a unique identifier.
512
+ """
513
+
514
+ bbox = (0, 0, 0, 0)
515
+ caption_len = len(caption.splitlines())
516
+
517
+ page_idx = 0 # docx => single page
518
+ page_count = 1
519
+
520
+ page_nearby_blocks = {
521
+ "text": {"content": [], "bbox": []},
522
+ "images": {"content": [], "bbox": []},
523
+ "structured": {"content": [], "bbox": []},
524
+ }
525
+
526
+ if caption_len:
527
+ page_nearby_blocks["text"]["content"].append(caption)
528
+ page_nearby_blocks["text"]["bbox"] = [[-1, -1, -1, -1]] * caption_len
529
+
530
+ content_metadata = {
531
+ "type": ContentTypeEnum.IMAGE,
532
+ "description": ContentDescriptionEnum.DOCX_IMAGE,
533
+ "page_number": page_idx,
534
+ "hierarchy": {
535
+ "page_count": page_count,
536
+ "page": page_idx,
537
+ "block": para_idx,
538
+ "line": -1,
539
+ "span": -1,
540
+ "nearby_objects": page_nearby_blocks,
541
+ },
542
+ }
543
+
544
+ image_metadata = {
545
+ "image_type": DocumentTypeEnum.PNG,
546
+ "structured_image_type": ContentTypeEnum.NONE,
547
+ "caption": caption,
548
+ "text": "",
549
+ "image_location": bbox,
550
+ }
551
+
552
+ unified_metadata = base_unified_metadata.copy()
553
+ unified_metadata.update(
554
+ {
555
+ "content": base64_img,
556
+ "source_metadata": self.properties.source_metadata,
557
+ "content_metadata": content_metadata,
558
+ "image_metadata": image_metadata,
559
+ }
560
+ )
561
+
562
+ validated_unified_metadata = validate_metadata(unified_metadata)
563
+
564
+ return [
565
+ ContentTypeEnum.IMAGE.value,
566
+ validated_unified_metadata.model_dump(),
567
+ str(uuid.uuid4()),
568
+ ]
569
+
570
+ def _extract_para_images(
571
+ self, images: List["Image"], para_idx: int, caption: str, base_unified_metadata: Dict
572
+ ) -> None:
573
+ """
574
+ Collect images from a paragraph and store them for metadata construction.
575
+
576
+ Parameters
577
+ ----------
578
+ images : list of Image
579
+ The images found in the paragraph.
580
+ para_idx : int
581
+ The index of the paragraph containing the images.
582
+ caption : str
583
+ The caption associated with the images.
584
+ base_unified_metadata : dict
585
+ The base metadata to associate with the images.
586
+
587
+ Returns
588
+ -------
589
+ None
590
+ """
591
+
592
+ for image in images:
593
+ logger.debug("image content_type %s para_idx %d", image.content_type, para_idx)
594
+ logger.debug("image caption %s", caption)
595
+
596
+ # Simply append a tuple so we can build the final metadata in _finalize_images
597
+ self._pending_images.append((image, para_idx, caption, base_unified_metadata))
598
+
599
+ def _construct_text_metadata(
600
+ self, accumulated_text: List[str], para_idx: int, text_depth: "TextTypeEnum", base_unified_metadata: Dict
601
+ ) -> List[Union[str, dict]]:
602
+ """
603
+ Build metadata for text content in a DOCX file.
604
+
605
+ Parameters
606
+ ----------
607
+ accumulated_text : list of str
608
+ The accumulated text to include in the metadata.
609
+ para_idx : int
610
+ The paragraph index containing the text.
611
+ text_depth : TextTypeEnum
612
+ The depth of the text content (e.g., page-level, paragraph-level).
613
+ base_unified_metadata : dict
614
+ The base metadata to build upon.
615
+
616
+ Returns
617
+ -------
618
+ list
619
+ A list containing the content type, validated metadata, and a unique identifier.
620
+ """
621
+
622
+ if len(accumulated_text) < 1:
623
+ return []
624
+
625
+ extracted_text = " ".join(accumulated_text)
626
+
627
+ # the document is treated as a single page
628
+ page_number = 0 if text_depth == TextTypeEnum.PAGE else -1
629
+ content_metadata = {
630
+ "type": ContentTypeEnum.TEXT,
631
+ "description": ContentDescriptionEnum.DOCX_TEXT,
632
+ "page_number": page_number,
633
+ "hierarchy": {
634
+ "page_count": 1,
635
+ "page": page_number,
636
+ "block": para_idx,
637
+ "line": -1,
638
+ "span": -1,
639
+ },
640
+ }
641
+
642
+ language = detect_language(extracted_text)
643
+ text_metadata = {
644
+ "text_type": text_depth,
645
+ "summary": "",
646
+ "keywords": self.properties.keywords,
647
+ "language": language,
648
+ "text_location": (-1, -1, -1, -1),
649
+ }
650
+
651
+ ext_unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
652
+ ext_unified_metadata.update(
653
+ {
654
+ "content": extracted_text,
655
+ "source_metadata": self.properties.source_metadata,
656
+ "content_metadata": content_metadata,
657
+ "text_metadata": text_metadata,
658
+ }
659
+ )
660
+
661
+ validated_unified_metadata = validate_metadata(ext_unified_metadata)
662
+
663
+ return [ContentTypeEnum.TEXT.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
664
+
665
+ def _extract_para_text(
666
+ self,
667
+ paragraph,
668
+ paragraph_text,
669
+ base_unified_metadata: Dict,
670
+ text_depth: str,
671
+ para_idx: int,
672
+ ) -> None:
673
+ """
674
+ Process the text, images, and styles in a DOCX paragraph.
675
+
676
+ Parameters
677
+ ----------
678
+ paragraph: Paragraph
679
+ The paragraph to process.
680
+ paragraph_text: str
681
+ The text content of the paragraph.
682
+ base_unified_metadata : dict
683
+ The base metadata to associate with extracted data.
684
+ text_depth : TextTypeEnum
685
+ The depth of text extraction (e.g., block-level, document-level).
686
+ para_idx : int
687
+ The index of the paragraph being processed.
688
+
689
+ Returns
690
+ -------
691
+ None
692
+ """
693
+
694
+ # Handle text styles if desired
695
+ if self.handle_text_styles:
696
+ try:
697
+ numPr = paragraph._element.xpath("./w:pPr/w:numPr")[0]
698
+ level = int(numPr.xpath("./w:ilvl/@w:val")[0])
699
+ except Exception:
700
+ level = -1
701
+ paragraph_text = self.apply_text_style(paragraph.style.name, paragraph_text, level)
702
+
703
+ self._accumulated_text.append(paragraph_text + "\n")
704
+
705
+ # If text_depth is BLOCK, we flush after each paragraph
706
+ if text_depth == TextTypeEnum.BLOCK:
707
+ text_extraction = self._construct_text_metadata(
708
+ self._accumulated_text, para_idx, text_depth, base_unified_metadata
709
+ )
710
+ self._extracted_data.append(text_extraction)
711
+ self._accumulated_text = []
712
+
713
+ def _finalize_images(
714
+ self,
715
+ extract_tables: bool,
716
+ extract_charts: bool,
717
+ extract_infographics: bool,
718
+ extract_images: bool,
719
+ **kwargs,
720
+ ) -> None:
721
+ """
722
+ Build and append final metadata for each pending image in batches.
723
+
724
+ Parameters
725
+ ----------
726
+ extract_tables : bool
727
+ Whether to attempt table detection in images.
728
+ extract_charts : bool
729
+ Whether to attempt chart detection in images.
730
+ **kwargs
731
+ Additional configuration for image processing.
732
+
733
+ Returns
734
+ -------
735
+ None
736
+ """
737
+ if not self._pending_images:
738
+ return
739
+
740
+ # 1) Convert all pending images into numpy arrays (and also store base64 + context),
741
+ # so we can run detection on them in one go.
742
+ all_image_arrays = []
743
+ image_info = [] # parallel list to hold (para_idx, caption, base_unified_metadata, base64_img)
744
+
745
+ for docx_image, para_idx, caption, base_unified_metadata in self._pending_images:
746
+ # Convert docx image blob to BytesIO, then to numpy array
747
+ image_bytes = docx_image.blob
748
+ image_stream = io.BytesIO(image_bytes)
749
+ image_array = load_and_preprocess_image(image_stream)
750
+ base64_img = str(bytetools.base64frombytes(image_bytes))
751
+
752
+ all_image_arrays.append(image_array)
753
+
754
+ # Keep track of all needed metadata so we can rebuild final entries
755
+ image_info.append((para_idx, caption, base_unified_metadata, base64_img))
756
+
757
+ # 2) If the user wants to detect tables/charts, do it in one pass for all images.
758
+ detection_map = defaultdict(list) # maps image_index -> list of CroppedImageWithContent
759
+
760
+ if extract_tables or extract_charts or extract_infographics:
761
+ try:
762
+ # Perform the batched detection on all images
763
+ detection_results = extract_page_elements_from_images(
764
+ images=all_image_arrays,
765
+ config=ImageConfigSchema(**self._extraction_config.model_dump()),
766
+ trace_info=kwargs.get("trace_info"),
767
+ )
768
+ # detection_results is typically List[Tuple[int, CroppedImageWithContent]]
769
+ # Group by image_index
770
+ for image_idx, cropped_item in detection_results:
771
+ # Skip elements that shouldn't be extracted based on flags
772
+ element_type = cropped_item.type_string
773
+ if (not extract_tables) and (element_type == "table"):
774
+ continue
775
+ if (not extract_charts) and (element_type == "chart"):
776
+ continue
777
+ if (not extract_infographics) and (element_type == "infographic"):
778
+ continue
779
+
780
+ detection_map[image_idx].append(cropped_item)
781
+
782
+ except Exception as e:
783
+ logger.error(f"Error extracting tables/charts in batch: {e}")
784
+ # If something goes wrong, we can fall back to empty detection map
785
+ # so that all images are treated normally
786
+ detection_map = {}
787
+
788
+ # 3) For each pending image, decide if we found tables/charts or not.
789
+ for i, _ in enumerate(self._pending_images):
790
+ para_idx_i, caption_i, base_unified_metadata_i, base64_img_i = image_info[i]
791
+
792
+ # If detection_map[i] is non-empty, we have found table(s)/chart(s).
793
+ if i in detection_map and detection_map[i]:
794
+ for table_chart_data in detection_map[i]:
795
+ # Build structured metadata for each table or chart
796
+ structured_entry = construct_table_and_chart_metadata(
797
+ structured_image=table_chart_data, # A CroppedImageWithContent
798
+ page_idx=0, # docx => single page
799
+ page_count=1,
800
+ source_metadata=self.properties.source_metadata,
801
+ base_unified_metadata=base_unified_metadata_i,
802
+ )
803
+ self._extracted_data.append(structured_entry)
804
+ else:
805
+ # Either detection was not requested, or no table/chart was found
806
+ if extract_images:
807
+ image_entry = self._construct_image_metadata(
808
+ para_idx_i,
809
+ caption_i,
810
+ base_unified_metadata_i,
811
+ base64_img_i,
812
+ )
813
+ self._extracted_data.append(image_entry)
814
+
815
+ # 4) Clear out the pending images after finalizing
816
+ self._pending_images = []
817
+
818
+ def _extract_table_data(
819
+ self,
820
+ child,
821
+ base_unified_metadata: Dict,
822
+ ) -> None:
823
+ """
824
+ Process the text and images in a DOCX table.
825
+
826
+ Parameters
827
+ ----------
828
+ child : element
829
+ The table element to process.
830
+ base_unified_metadata : dict
831
+ The base metadata to associate with extracted data.
832
+ text_depth : TextTypeEnum
833
+ The depth of text extraction (e.g., block-level, document-level).
834
+ para_idx : int
835
+ The index of the table being processed.
836
+
837
+ Returns
838
+ -------
839
+ None
840
+ """
841
+
842
+ # Table
843
+ table = Table(child, self.document)
844
+ table_text, table_images, table_dataframe = self.format_table(table)
845
+
846
+ self.images += table_images
847
+ self.tables.append(table_dataframe)
848
+
849
+ cropped_image_with_content = CroppedImageWithContent(
850
+ content=table_text,
851
+ image="", # no image content
852
+ bbox=(0, 0, 0, 0),
853
+ max_width=0,
854
+ max_height=0,
855
+ type_string="table",
856
+ )
857
+
858
+ self._extracted_data.append(
859
+ construct_table_and_chart_metadata(
860
+ structured_image=cropped_image_with_content,
861
+ page_idx=0, # docx => single page
862
+ page_count=1,
863
+ source_metadata=self.properties.source_metadata,
864
+ base_unified_metadata=base_unified_metadata,
865
+ )
866
+ )
867
+
868
+ def extract_data(
869
+ self,
870
+ base_unified_metadata: Dict,
871
+ text_depth: "TextTypeEnum",
872
+ extract_text: bool,
873
+ extract_tables: bool,
874
+ extract_charts: bool,
875
+ extract_infographics: bool,
876
+ extract_images: bool,
877
+ ) -> list[list[str | dict]]:
878
+ """
879
+ Iterate over paragraphs and tables in a DOCX document to extract data.
880
+
881
+ Parameters
882
+ ----------
883
+ base_unified_metadata : dict
884
+ The base metadata to associate with all extracted content.
885
+ text_depth : TextTypeEnum
886
+ The depth of text extraction (e.g., block-level, document-level).
887
+ extract_text : bool
888
+ Whether to extract text from the document.
889
+ extract_charts : bool
890
+ Whether to extract charts from the document.
891
+ extract_tables : bool
892
+ Whether to extract tables from the document.
893
+ extract_images : bool
894
+ Whether to extract images from the document.
895
+
896
+ Returns
897
+ -------
898
+ dict
899
+ A dictionary containing the extracted data from the document.
900
+ """
901
+
902
+ self._accumulated_text = []
903
+ self._extracted_data = []
904
+ self._pending_images = []
905
+ self._prev_para_images = []
906
+ self._prev_para_image_idx = 0
907
+
908
+ para_idx = 0
909
+ for child in self.document.element.body.iterchildren():
910
+ try:
911
+ if isinstance(child, CT_P):
912
+ paragraph = Paragraph(child, self.document)
913
+ paragraph_text, paragraph_images = self.format_paragraph(paragraph)
914
+
915
+ if extract_text:
916
+ try:
917
+ self._extract_para_text(
918
+ paragraph,
919
+ paragraph_text,
920
+ base_unified_metadata,
921
+ text_depth,
922
+ para_idx,
923
+ )
924
+ except Exception as e:
925
+ logger.error("extract_data: _extract_para_text failed: %s", e)
926
+
927
+ if (extract_images or extract_charts or extract_tables) and paragraph_images:
928
+ self._pending_images += [
929
+ (image, para_idx, "", base_unified_metadata) for image in paragraph_images
930
+ ]
931
+ self.images.extend(paragraph_images)
932
+
933
+ elif isinstance(child, CT_Tbl):
934
+ if extract_tables or extract_charts:
935
+ try:
936
+ self._extract_table_data(child, base_unified_metadata)
937
+ except Exception as e:
938
+ logger.error("extract_data: _extract_table_data failed: %s", e)
939
+
940
+ except Exception as e:
941
+ logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
942
+
943
+ para_idx += 1
944
+
945
+ # If there's leftover text at the doc’s end
946
+ if (
947
+ extract_text
948
+ and text_depth in (TextTypeEnum.DOCUMENT, TextTypeEnum.PAGE)
949
+ and len(self._accumulated_text) > 0
950
+ ):
951
+ text_extraction = self._construct_text_metadata(
952
+ self._accumulated_text,
953
+ -1,
954
+ text_depth,
955
+ base_unified_metadata,
956
+ )
957
+
958
+ if text_extraction:
959
+ self._extracted_data.append(text_extraction)
960
+
961
+ # Final pass: Decide if images are just images or contain tables/charts
962
+ if extract_images or extract_tables or extract_charts or extract_infographics:
963
+ self._finalize_images(
964
+ extract_tables=extract_tables,
965
+ extract_charts=extract_charts,
966
+ extract_infographics=extract_infographics,
967
+ extract_images=extract_images,
968
+ trace_info=None,
969
+ )
970
+
971
+ return self._extracted_data