nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -72
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
@@ -1,895 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- # Copyright (c) 2024, NVIDIA CORPORATION.
7
- #
8
- # Licensed under the Apache License, Version 2.0 (the "License");
9
- # you may not use this file except in compliance with the License.
10
- # You may obtain a copy of the License at
11
- #
12
- # http://www.apache.org/licenses/LICENSE-2.0
13
- #
14
- # Unless required by applicable law or agreed to in writing, software
15
- # distributed under the License is distributed on an "AS IS" BASIS,
16
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
- # See the License for the specific language governing permissions and
18
- # limitations under the License.
19
-
20
- # pylint: disable=line-too-long
21
- # pylint: disable=too-few-public-methods
22
-
23
- import io
24
- import logging
25
- import re
26
- import uuid
27
- from typing import Dict, Optional, Union
28
- from typing import List
29
- from typing import Tuple
30
-
31
- from collections import defaultdict
32
-
33
- import pandas as pd
34
- from docx import Document
35
- from docx.image.constants import MIME_TYPE
36
- from docx.image.image import Image
37
- from docx.oxml.table import CT_Tbl
38
- from docx.oxml.text.paragraph import CT_P
39
- from docx.table import Table
40
- from docx.table import _Cell
41
- from docx.text.hyperlink import Hyperlink
42
- from docx.text.paragraph import Paragraph
43
- from docx.text.run import Run
44
- from pandas import DataFrame
45
-
46
- from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
47
- from nv_ingest_api.internal.extract.image.image_helpers.common import (
48
- load_and_preprocess_image,
49
- extract_page_elements_from_images,
50
- )
51
- from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
52
- from nv_ingest_api.internal.schemas.meta.metadata_schema import (
53
- ContentTypeEnum,
54
- validate_metadata,
55
- TextTypeEnum,
56
- )
57
- from nv_ingest_api.util.converters import bytetools
58
- from nv_ingest_api.util.detectors.language import detect_language
59
- from nv_ingest_api.util.metadata.aggregators import construct_table_and_chart_metadata, CroppedImageWithContent
60
-
61
- PARAGRAPH_FORMATS = ["text", "markdown"]
62
- TABLE_FORMATS = ["markdown", "markdown_light", "csv", "tag"]
63
-
64
- logger = logging.getLogger(__name__)
65
-
66
-
67
class DocxProperties:
    """
    Snapshot of a DOCX document's core properties.

    On construction the title, author, creation/modification timestamps and
    keywords are copied from ``document.core_properties``. Any timestamp that
    is available is also written into ``source_metadata`` (in place) as an
    ISO-8601 string under ``date_created`` / ``last_modified``.
    """

    def __init__(self, document: "Document", source_metadata: Dict):
        """
        Copy over some of the docx core properties

        Parameters
        ----------
        document : Document
            Opened document exposing a ``core_properties`` attribute.
        source_metadata : dict
            Source metadata; updated in place with the document dates.
        """
        self.document = document
        self.source_metadata = source_metadata

        core_properties = self.document.core_properties

        self.title = core_properties.title

        # python-docx reports a missing author as an empty string, not None, so an
        # ``is not None`` test never reaches the fallback. Truthiness covers both.
        self.author = core_properties.author or core_properties.last_modified_by

        self.created = core_properties.created
        self.modified = core_properties.modified
        self.keywords = core_properties.keywords

        self._update_source_meta_data()

    def __str__(self):
        """
        Return a multi-line, human-readable dump of the captured properties.

        Missing dates are rendered as the literal text ``None``.
        """
        created = self.created.isoformat() if self.created is not None else "None"
        modified = self.modified.isoformat() if self.modified is not None else "None"

        lines = [
            "Document Properties:",
            f"title: {self.title}",
            f"author: {self.author}",
            f"created: {created}",
            f"modified: {modified}",
            f"keywords: {self.keywords}",
        ]
        # Every line, including the last one, is newline-terminated.
        return "\n".join(lines) + "\n"

    def _update_source_meta_data(self):
        """
        Update the source metadata with the document's core properties

        Only dates that are actually present are written; absent dates leave
        the corresponding keys of ``source_metadata`` untouched.
        """
        metadata_updates = {}

        if self.created is not None:
            metadata_updates["date_created"] = self.created.isoformat()

        if self.modified is not None:
            metadata_updates["last_modified"] = self.modified.isoformat()

        if metadata_updates:
            self.source_metadata.update(metadata_updates)
132
-
133
-
134
- class DocxReader:
135
- __doc__ = f"""
136
- Read a docx file and extract its content as text, images and tables.
137
-
138
- Parameters
139
- ----------
140
- docx :
141
- Bytestream
142
- paragraph_format : str
143
- Format of the paragraphs. Supported formats are: {PARAGRAPH_FORMATS}
144
- table_format : str
145
- Format of the tables. Supported formats are: {TABLE_FORMATS}
146
- handle_text_styles : bool
147
- Whether to apply style on a paragraph (heading, list, title, subtitle).
148
- Not recommended if the document has been converted from pdf.
149
- image_tag : str
150
- Tag to replace the images in the text. Must contain one placeholder for the image index.
151
- table_tag : str
152
- Tag to replace the tables in the text. Must contain one placeholder for the table index.
153
- """
154
-
155
def __init__(
    self,
    docx,
    source_metadata: Dict,
    paragraph_format: str = "markdown",
    table_format: str = "markdown",
    handle_text_styles: bool = True,
    image_tag="<image {}>",
    table_tag="<table {}>",
    extraction_config: Dict = None,
):
    """
    Validate the requested formats, open the document and reset all state.

    Raises
    ------
    ValueError
        If ``paragraph_format`` or ``table_format`` is not a supported format.
    """
    # Reject unsupported formats before doing any work on the document.
    if paragraph_format not in PARAGRAPH_FORMATS:
        raise ValueError(f"Unknown paragraph format {paragraph_format}. Supported formats are: {PARAGRAPH_FORMATS}")
    if table_format not in TABLE_FORMATS:
        raise ValueError(f"Unknown table format {table_format}. Supported formats are: {TABLE_FORMATS}")

    # Formatting options
    self.paragraph_format = paragraph_format
    self.table_format = table_format
    self.handle_text_styles = handle_text_styles
    self.image_tag = image_tag
    self.table_tag = table_tag

    # Open the document and capture its core properties
    # (this also writes the document dates into source_metadata).
    self.document = Document(docx)
    self.properties = DocxProperties(self.document, source_metadata)
    logger.debug("%s", str(self.properties))

    # Pre-compiled helpers for whitespace handling
    self.trailing_space_pattern = re.compile(r"(^\s*)(.*?)(\s*$)", re.DOTALL)
    self.empty_text_pattern = re.compile(r"^\s*$")

    # Collected content and running tag counters (tags are numbered from 1)
    self.images = []
    self.tables = []
    self.image_tag_index = 1
    self.table_tag_index = 1

    # Working state used while extracting metadata
    self._accumulated_text = []
    self._extracted_data = []
    self._extraction_config = extraction_config or {}
    self._pending_images = []
    self._prev_para_image_idx = 0
    self._prev_para_images = []
199
-
200
def is_text_empty(self, text: str) -> bool:
    """
    Tell whether ``text`` is blank according to ``self.empty_text_pattern``.

    Parameters
    ----------
    text : str
        The text to check.

    Returns
    -------
    bool
        True when the pattern matches ``text``, False otherwise.
    """
    # A match object is always truthy, so bool() is True exactly when a match exists.
    return bool(self.empty_text_pattern.match(text))
216
-
217
def format_text(self, text: str, bold: bool, italic: bool, underline: bool) -> str:
    """
    Wrap ``text`` in markdown/HTML markers for the requested styles.

    Leading and trailing whitespace is kept outside the markers. Blank text
    is returned unchanged.

    Parameters
    ----------
    text : str
        The text to format.
    bold : bool
        Wrap in ``**``.
    italic : bool
        Wrap in ``*``.
    underline : bool
        Wrap in ``<u>``/``</u>``.

    Returns
    -------
    str
        The styled text, with its original surrounding whitespace restored.
    """
    if self.is_text_empty(text):
        return text

    # Split into (leading whitespace, core, trailing whitespace).
    found = self.trailing_space_pattern.match(text)
    lead, core, trail = found.groups() if found else ("", text, "")

    # Markers nest in a fixed order: bold innermost, underline outermost.
    if bold:
        core = f"**{core}**"
    if italic:
        core = f"*{core}*"
    if underline:
        core = f"<u>{core}</u>"

    return lead + core + trail
260
-
261
def format_paragraph(self, paragraph: "Paragraph") -> Tuple[str, List["Image"]]:
    """
    Format a paragraph into styled text and extract associated images.

    In ``"text"`` mode the paragraph's plain text is returned and no images
    are collected. Otherwise runs and hyperlinks are grouped by their
    effective (bold, italic, underline) style and each group is passed to
    ``format_text``. Inline pictures found in a run are replaced by
    ``self.image_tag`` (numbered with ``self.image_tag_index``) and returned.

    Parameters
    ----------
    paragraph : Paragraph
        The paragraph to format. This includes text and potentially embedded images.

    Returns
    -------
    tuple of (str, list of Image)
        - The formatted paragraph text, stripped of surrounding whitespace.
        - A list of extracted images from the paragraph.
    """
    paragraph_images = []

    if self.paragraph_format == "text":
        return paragraph.text.strip(), paragraph_images

    # Paragraph-level style, used wherever a run leaves a property unset (None).
    font = paragraph.style.font
    default_style = (font.bold, font.italic, font.underline)

    paragraph_text = ""
    group_text = ""
    previous_style = None  # style of the last non-blank item in the current group

    for c in paragraph.iter_inner_content():
        if isinstance(c, Hyperlink):
            text = f"[{c.text}]({c.address})"
            # A hyperlink may carry no runs; fall back to the paragraph default
            # instead of failing on runs[0].
            runs = c.runs
            if runs:
                style = (runs[0].bold, runs[0].italic, runs[0].underline)
            else:
                style = (None, None, None)
        elif isinstance(c, Run):
            text = c.text
            style = (c.bold, c.italic, c.underline)
            # 1. Locate the inline shape which is stored in the <w:drawing> element.
            # 2. r:embed in <a.blip> has the relationship id for extracting the file where
            #    the image is stored as bytes.
            # Reference:
            # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
            inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
            for r_id in inline_shapes:
                text += self.image_tag.format(self.image_tag_index)
                self.image_tag_index += 1
                image = paragraph.part.related_parts[r_id].image
                paragraph_images.append(image)
        else:
            continue

        style = tuple(s if s is not None else d for s, d in zip(style, default_style))

        # Blank items never open or close a group; they are simply absorbed
        # into whichever group is currently open.
        if not self.is_text_empty(text):
            if previous_style is not None and style != previous_style:
                paragraph_text += self.format_text(group_text, *previous_style)
                group_text = ""
            previous_style = style

        group_text += text

    # Flush the last group with the style of its visible text. Using the style of
    # the last item seen would mis-style the group when the paragraph ends with a
    # whitespace-only run whose style differs.
    if group_text:
        if previous_style is not None:
            paragraph_text += self.format_text(group_text, *previous_style)
        else:
            # Only blank items were seen; there is nothing to style.
            paragraph_text += group_text

    return paragraph_text.strip(), paragraph_images
330
-
331
def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
    """
    Format a table cell into text and collect the images of its paragraphs.

    Each paragraph of the cell is passed to ``format_paragraph``; the texts
    are joined with ``<br>`` in markdown mode and with a newline otherwise.

    Parameters
    ----------
    cell : _Cell
        The table cell to format.

    Returns
    -------
    tuple
        - The joined text of the cell.
        - The images of the cell, as a tuple holding one image list per
          paragraph (the per-paragraph lists are not flattened here).
          A cell without paragraphs yields ``("", ())``.
    """
    newline = "<br>" if self.paragraph_format == "markdown" else "\n"

    formatted = [self.format_paragraph(p) for p in cell.paragraphs]
    if not formatted:
        # zip(*[]) yields nothing to unpack; return an empty result instead of raising.
        return "", ()

    paragraph_texts, paragraph_images = zip(*formatted)
    return newline.join(paragraph_texts), paragraph_images
353
-
354
def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
    """
    Format a table into text, extract images, and represent it as a DataFrame.

    The first row of the table supplies the DataFrame column names; the
    remaining rows are the data.

    Parameters
    ----------
    table : Table
        The table to format.

    Returns
    -------
    tuple of (str or None, list, DataFrame)
        - The table rendered according to ``self.table_format``; ``None`` for a
          table without rows.
        - The image entries gathered from every cell, in row-major order, exactly
          as ``format_cell`` yields them.
        - A DataFrame representation of the table's content (empty for a table
          without rows).

    Raises
    ------
    ValueError
        If ``self.table_format`` is not one of the supported formats.
    """
    rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
    if not rows:
        # No header row to build columns from.
        return None, [], pd.DataFrame()

    texts = [[text for text, _ in row] for row in rows]
    table_images = [image for row in rows for _, images in row for image in images]

    frame = pd.DataFrame(texts[1:], columns=texts[0])

    if "markdown" in self.table_format:
        table_text = frame.to_markdown(index=False)
        if self.table_format == "markdown_light":
            # Compact the rendering: collapse runs of whitespace and of dashes.
            table_text = re.sub(r"\s{2,}", " ", table_text)
            table_text = re.sub(r"-{2,}", "-", table_text)
    elif self.table_format == "csv":
        table_text = frame.to_csv()
    elif self.table_format == "tag":
        table_text = self.table_tag.format(self.table_tag_index)
        self.table_tag_index += 1
    else:
        # Report the offending value (the original interpolated the builtin ``format``).
        raise ValueError(f"Unknown table format {self.table_format}")

    return table_text, table_images, frame
390
-
391
- @staticmethod
392
- def apply_text_style(style: str, text: str, level: int = 0) -> str:
393
- """
394
- Apply a specific text style (e.g., heading, list, title, subtitle) to the given text.
395
-
396
- Parameters
397
- ----------
398
- style : str
399
- The style to apply. Supported styles include headings ("Heading 1" to "Heading 9"),
400
- list items ("List"), and document structures ("Title", "Subtitle").
401
- text : str
402
- The text to style.
403
- level : int, optional
404
- The indentation level for the styled text. Default is 0.
405
-
406
- Returns
407
- -------
408
- str
409
- The text with the specified style and indentation applied.
410
- """
411
-
412
- if re.match(r"^Heading [1-9]$", style):
413
- n = int(style.split(" ")[-1])
414
- text = f"{'#' * n} {text}"
415
- elif style.startswith("List"):
416
- text = f"- {text}"
417
- elif style == "Title":
418
- text = f"{text}\n{'=' * len(text)}"
419
- elif style == "Subtitle":
420
- text = f"{text}\n{'-' * len(text)}"
421
-
422
- text = "\t" * level + text
423
-
424
- return text
425
-
426
- @staticmethod
427
- def docx_content_type_to_image_type(content_type: "MIME_TYPE") -> str:
428
- """
429
- Convert a DOCX content type string to an image type.
430
-
431
- Parameters
432
- ----------
433
- content_type : MIME_TYPE
434
- The content type string from the image header, e.g., "image/jpeg".
435
-
436
- Returns
437
- -------
438
- str
439
- The image type extracted from the content type string.
440
- """
441
-
442
- return content_type.split("/")[1]
443
-
444
def _construct_image_metadata(
    self, para_idx: int, caption: str, base_unified_metadata: Dict, base64_img: str
) -> List[Union[str, dict]]:
    """
    Build metadata for an image in a DOCX file.

    Parameters
    ----------
    para_idx : int
        The paragraph index containing the image.
    caption : str
        The caption associated with the image.
    base_unified_metadata : dict
        The base metadata to build upon. It is copied, not modified; an empty
        or missing value is treated as an empty dictionary.
    base64_img : str
        The image content encoded as a base64 string.

    Returns
    -------
    list
        A list containing the content type, validated metadata, and a unique identifier.
    """
    bbox = (0, 0, 0, 0)
    caption_len = len(caption.splitlines())

    page_idx = 0  # docx => single page
    page_count = 1

    page_nearby_blocks = {
        "text": {"content": [], "bbox": []},
        "images": {"content": [], "bbox": []},
        "structured": {"content": [], "bbox": []},
    }

    if caption_len:
        page_nearby_blocks["text"]["content"].append(caption)
        # One placeholder box per caption line; separate list objects so that
        # the entries do not alias each other.
        page_nearby_blocks["text"]["bbox"] = [[-1, -1, -1, -1] for _ in range(caption_len)]

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.DOCX_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": para_idx,
            "line": -1,
            "span": -1,
            "nearby_objects": page_nearby_blocks,
        },
    }

    image_metadata = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.NONE,
        "caption": caption,
        "text": "",
        "image_location": bbox,
    }

    # Same tolerance for a missing base dict as _construct_text_metadata.
    unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
    unified_metadata.update(
        {
            "content": base64_img,
            "source_metadata": self.properties.source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    validated_unified_metadata = validate_metadata(unified_metadata)

    return [
        ContentTypeEnum.IMAGE.value,
        validated_unified_metadata.model_dump(),
        str(uuid.uuid4()),
    ]
522
-
523
def _extract_para_images(
    self, images: List["Image"], para_idx: int, caption: str, base_unified_metadata: Dict
) -> None:
    """
    Queue the images of a paragraph for later metadata construction.

    Each image is logged at debug level and appended to
    ``self._pending_images`` together with its context.

    Parameters
    ----------
    images : list of Image
        The images found in the paragraph.
    para_idx : int
        The index of the paragraph containing the images.
    caption : str
        The caption associated with the images.
    base_unified_metadata : dict
        The base metadata to associate with the images.

    Returns
    -------
    None
    """
    for docx_image in images:
        logger.debug("image content_type %s para_idx %d", docx_image.content_type, para_idx)
        logger.debug("image caption %s", caption)

        # Keep the image with its context; the final metadata is built later
        # from these (image, para_idx, caption, base metadata) tuples.
        entry = (docx_image, para_idx, caption, base_unified_metadata)
        self._pending_images.append(entry)
551
-
552
def _construct_text_metadata(
    self, accumulated_text: List[str], para_idx: int, text_depth: "TextTypeEnum", base_unified_metadata: Dict
) -> List[Union[str, dict]]:
    """
    Build the metadata entry for a chunk of text taken from a DOCX file.

    Parameters
    ----------
    accumulated_text : list of str
        Text fragments; they are joined with single spaces.
    para_idx : int
        The paragraph index recorded as the block of the text.
    text_depth : TextTypeEnum
        The depth of the text content (e.g., page-level, paragraph-level).
    base_unified_metadata : dict
        The base metadata to build upon; copied, and may be empty or None.

    Returns
    -------
    list
        An empty list when there is no text, otherwise the content type, the
        validated metadata and a unique identifier.
    """
    if len(accumulated_text) == 0:
        return []

    extracted_text = " ".join(accumulated_text)
    language = detect_language(extracted_text)

    # The whole document counts as one page: page 0 at page depth, -1 otherwise.
    if text_depth == TextTypeEnum.PAGE:
        page_number = 0
    else:
        page_number = -1

    hierarchy = {
        "page_count": 1,
        "page": page_number,
        "block": para_idx,
        "line": -1,
        "span": -1,
    }
    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.DOCX_TEXT,
        "page_number": page_number,
        "hierarchy": hierarchy,
    }
    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": self.properties.keywords,
        "language": language,
        "text_location": (-1, -1, -1, -1),
    }

    ext_unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
    ext_unified_metadata.update(
        {
            "content": extracted_text,
            "source_metadata": self.properties.source_metadata,
            "content_metadata": content_metadata,
            "text_metadata": text_metadata,
        }
    )

    validated = validate_metadata(ext_unified_metadata)

    return [ContentTypeEnum.TEXT.value, validated.model_dump(), str(uuid.uuid4())]
617
-
618
def _extract_para_text(
    self,
    paragraph,
    paragraph_text,
    base_unified_metadata: Dict,
    text_depth: str,
    para_idx: int,
) -> None:
    """
    Accumulate the text of a paragraph, optionally styled, and flush per block.

    Parameters
    ----------
    paragraph: Paragraph
        The paragraph to process.
    paragraph_text: str
        The text content of the paragraph.
    base_unified_metadata : dict
        The base metadata to associate with extracted data.
    text_depth : TextTypeEnum
        The depth of text extraction (e.g., block-level, document-level).
    para_idx : int
        The index of the paragraph being processed.

    Returns
    -------
    None
    """
    if self.handle_text_styles:
        # The list nesting level is read from the paragraph's numbering properties;
        # any failure to read it means "no level" (-1), which adds no indentation.
        try:
            num_pr = paragraph._element.xpath("./w:pPr/w:numPr")[0]
            level = int(num_pr.xpath("./w:ilvl/@w:val")[0])
        except Exception:
            level = -1
        paragraph_text = self.apply_text_style(paragraph.style.name, paragraph_text, level)

    self._accumulated_text.append(paragraph_text + "\n")

    # Only block depth emits an entry per paragraph; other depths keep accumulating.
    if text_depth != TextTypeEnum.BLOCK:
        return

    block_entry = self._construct_text_metadata(
        self._accumulated_text, para_idx, text_depth, base_unified_metadata
    )
    self._extracted_data.append(block_entry)
    self._accumulated_text = []
665
-
666
def _finalize_images(self, extract_tables: bool, extract_charts: bool, **kwargs) -> None:
    """
    Turn every pending image into final extraction entries.

    All pending images are decoded up front so that table/chart detection
    (when requested) runs as a single batched call. An image for which the
    detector returned at least one element contributes one structured entry
    per element; every other image contributes a plain image entry. The
    pending-image queue is emptied afterwards.

    Parameters
    ----------
    extract_tables : bool
        Whether table detection on images was requested.
    extract_charts : bool
        Whether chart detection on images was requested.
    **kwargs
        Extra options; only ``trace_info`` is read and forwarded to the
        detector.

    Returns
    -------
    None
    """
    if not self._pending_images:
        return

    # Step 1: decode each image once, keeping its context alongside in a parallel list.
    decoded_images = []
    contexts = []  # (para_idx, caption, base_unified_metadata, base64_img) per image
    for pending_image, para_idx, caption, base_unified_metadata in self._pending_images:
        raw_bytes = pending_image.blob
        decoded_images.append(load_and_preprocess_image(io.BytesIO(raw_bytes)))
        contexts.append(
            (para_idx, caption, base_unified_metadata, str(bytetools.base64frombytes(raw_bytes)))
        )

    # Step 2: optional batched detection, grouped by the index of the source image.
    detections_by_image = defaultdict(list)
    if extract_tables or extract_charts:
        try:
            batch_results = extract_page_elements_from_images(
                images=decoded_images,
                config=ImageConfigSchema(**self._extraction_config.model_dump()),
                trace_info=kwargs.get("trace_info"),
            )
            for image_idx, cropped_item in batch_results:
                detections_by_image[image_idx].append(cropped_item)
        except Exception as e:
            logger.error(f"Error extracting tables/charts in batch: {e}")
            # Detection is best-effort: on failure fall back to treating
            # every image as a plain image.
            detections_by_image = {}

    # Step 3: emit structured entries where something was detected, image entries otherwise.
    for image_idx, (para_idx, caption, base_unified_metadata, base64_img) in enumerate(contexts):
        detected_elements = detections_by_image.get(image_idx)

        if not detected_elements:
            # Detection was not requested, failed, or found nothing for this image.
            self._extracted_data.append(
                self._construct_image_metadata(para_idx, caption, base_unified_metadata, base64_img)
            )
            continue

        for element in detected_elements:
            self._extracted_data.append(
                construct_table_and_chart_metadata(
                    structured_image=element,
                    page_idx=0,  # docx => single page
                    page_count=1,
                    source_metadata=self.properties.source_metadata,
                    base_unified_metadata=base_unified_metadata,
                )
            )

    # Step 4: the queue has been fully consumed.
    self._pending_images = []
753
-
754
def _extract_table_data(
    self,
    child,
    base_unified_metadata: Dict,
) -> None:
    """
    Convert a DOCX table element into a structured "table" entry.

    The table is formatted via ``self.format_table``; its images and
    dataframe are recorded on ``self.images`` / ``self.tables``, and its text
    is wrapped in an image-less ``CroppedImageWithContent`` so it can go
    through the same metadata builder as tables detected in images.

    Parameters
    ----------
    child : element
        The table element to process.
    base_unified_metadata : dict
        The base metadata to associate with the extracted table.

    Returns
    -------
    None
    """
    docx_table = Table(child, self.document)
    table_text, table_images, table_dataframe = self.format_table(docx_table)

    self.images += table_images
    self.tables.append(table_dataframe)

    # A native DOCX table has no rendered image, so the image is empty and
    # the geometry fields are zeroed placeholders.
    text_only_table = CroppedImageWithContent(
        content=table_text,
        image="",
        bbox=(0, 0, 0, 0),
        max_width=0,
        max_height=0,
        type_string="table",
    )

    table_entry = construct_table_and_chart_metadata(
        structured_image=text_only_table,
        page_idx=0,  # docx => single page
        page_count=1,
        source_metadata=self.properties.source_metadata,
        base_unified_metadata=base_unified_metadata,
    )
    self._extracted_data.append(table_entry)
803
-
804
def extract_data(
    self,
    base_unified_metadata: Dict,
    text_depth: "TextTypeEnum",
    extract_text: bool,
    extract_charts: bool,
    extract_tables: bool,
    extract_images: bool,
) -> list[list[str | dict]]:
    """
    Walk the document body in order and collect the requested content.

    Paragraph elements contribute text (when ``extract_text`` is set) and
    queue their images for later processing; table elements are converted to
    structured entries when tables or charts are requested. After the walk,
    buffered text is flushed for document/page depth and the queued images
    are finalized.

    Parameters
    ----------
    base_unified_metadata : dict
        The base metadata to associate with all extracted content.
    text_depth : TextTypeEnum
        The depth of text extraction (e.g., block-level, document-level).
    extract_text : bool
        Whether to extract text from the document.
    extract_charts : bool
        Whether to extract charts from the document.
    extract_tables : bool
        Whether to extract tables from the document.
    extract_images : bool
        Whether to extract images from the document.

    Returns
    -------
    list
        The accumulated extraction entries (``self._extracted_data``), in the
        order they were produced.

    Notes
    -----
    Text is emitted per paragraph at BLOCK depth and once at the end for
    DOCUMENT or PAGE depth. NOTE(review): for any other depth the buffered
    text is never flushed by this method - confirm that is intended.
    """
    # Start from a clean slate so repeated calls do not leak state.
    self._accumulated_text = []
    self._extracted_data = []
    self._pending_images = []
    self._prev_para_images = []
    self._prev_para_image_idx = 0

    wants_structured = extract_tables or extract_charts
    wants_visuals = extract_images or wants_structured

    # The index counts every body child, not only the paragraphs.
    for para_idx, child in enumerate(self.document.element.body.iterchildren()):
        if isinstance(child, CT_P):
            paragraph = Paragraph(child, self.document)
            paragraph_text, paragraph_images = self.format_paragraph(paragraph)

            if extract_text:
                self._extract_para_text(
                    paragraph,
                    paragraph_text,
                    base_unified_metadata,
                    text_depth,
                    para_idx,
                )

            if wants_visuals and paragraph_images:
                self._prev_para_images = paragraph_images
                self._prev_para_image_idx = para_idx
                # Images are queued with an empty caption and resolved in the final pass.
                self._pending_images.extend(
                    (image, para_idx, "", base_unified_metadata) for image in paragraph_images
                )
                self.images += paragraph_images

        elif isinstance(child, CT_Tbl) and wants_structured:
            self._extract_table_data(child, base_unified_metadata)

    # If there's leftover text at the doc's end
    if (
        extract_text
        and text_depth in (TextTypeEnum.DOCUMENT, TextTypeEnum.PAGE)
        and self._accumulated_text
    ):
        leftover_entry = self._construct_text_metadata(
            self._accumulated_text,
            -1,
            text_depth,
            base_unified_metadata,
        )
        if leftover_entry:
            self._extracted_data.append(leftover_entry)

    # Final pass: Decide if images are just images or contain tables/charts
    if wants_visuals:
        self._finalize_images(
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            trace_info=None,
        )

    return self._extracted_data