nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +72 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,799 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import logging
19
+ import io
20
+ import operator
21
+ import re
22
+ import uuid
23
+ from collections import defaultdict
24
+ from datetime import datetime
25
+ from typing import Dict, List, Tuple, IO
26
+ from typing import Optional
27
+
28
+ import pandas as pd
29
+ from pptx import Presentation
30
+ from pptx.enum.dml import MSO_COLOR_TYPE
31
+ from pptx.enum.dml import MSO_THEME_COLOR
32
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
33
+ from pptx.enum.shapes import PP_PLACEHOLDER
34
+ from pptx.shapes.autoshape import Shape
35
+ from pptx.slide import Slide
36
+
37
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
38
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
39
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
40
+ from nv_ingest_api.internal.enums.common import TableFormatEnum
41
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
42
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
43
+ from nv_ingest_api.internal.extract.image.image_helpers.common import (
44
+ load_and_preprocess_image,
45
+ extract_page_elements_from_images,
46
+ )
47
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
48
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXConfigSchema
49
+ from nv_ingest_api.util.converters import bytetools
50
+ from nv_ingest_api.util.detectors.language import detect_language
51
+ from nv_ingest_api.util.metadata.aggregators import construct_page_element_metadata
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
+ def _finalize_images(
57
+ pending_images: List[Tuple[Shape, int, int, int, dict, dict, dict]],
58
+ extracted_data: List,
59
+ pptx_extraction_config: PPTXConfigSchema,
60
+ extract_tables: bool = False,
61
+ extract_charts: bool = False,
62
+ trace_info: Optional[Dict] = None,
63
+ ):
64
+ """
65
+ Post-process all pending images.
66
+ - Convert shape image -> NumPy or base64
67
+ - If `extract_tables` or `extract_charts`, do detection (table/chart)
68
+ - Build the appropriate metadata, either table/chart or image.
69
+
70
+ This mimics the docx approach, but adapted for python-pptx shapes.
71
+ """
72
+ if not pending_images:
73
+ return
74
+
75
+ # Convert each shape to image data (base64 or ndarray).
76
+ # We'll store them for a single call to your model if you'd like (batching).
77
+ image_arrays = []
78
+ image_contexts = []
79
+ for (
80
+ shape,
81
+ shape_idx,
82
+ slide_idx,
83
+ slide_count,
84
+ page_nearby_blocks,
85
+ source_metadata,
86
+ base_unified_metadata,
87
+ ) in pending_images:
88
+ try:
89
+ image_bytes = shape.image.blob
90
+ image_array = load_and_preprocess_image(io.BytesIO(image_bytes))
91
+ base64_img = bytetools.base64frombytes(image_bytes)
92
+
93
+ image_arrays.append(image_array)
94
+ image_contexts.append(
95
+ (
96
+ shape_idx,
97
+ slide_idx,
98
+ slide_count,
99
+ page_nearby_blocks,
100
+ source_metadata,
101
+ base_unified_metadata,
102
+ base64_img,
103
+ )
104
+ )
105
+ except Exception as e:
106
+ logger.warning(f"Unable to process shape image: {e}")
107
+
108
+ # If you want table/chart detection for these images, do it now
109
+ # (similar to docx approach). This might use your YOLO or other method:
110
+ detection_map = defaultdict(list) # image_idx -> list of CroppedImageWithContent
111
+ if extract_tables or extract_charts:
112
+ try:
113
+ # For example, a call to your function that checks for tables/charts
114
+ detection_results = extract_page_elements_from_images(
115
+ images=image_arrays,
116
+ config=ImageConfigSchema(**(pptx_extraction_config.model_dump())),
117
+ trace_info=trace_info,
118
+ )
119
+ # detection_results is something like [(image_idx, CroppedImageWithContent), ...]
120
+ for img_idx, cropped_obj in detection_results:
121
+ detection_map[img_idx].append(cropped_obj)
122
+ except Exception as e:
123
+ logger.error(f"Error while running table/chart detection on PPTX images: {e}")
124
+ detection_map = {}
125
+
126
+ # Now build the final metadata objects
127
+ for i, context in enumerate(image_contexts):
128
+ (shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata, base64_img) = (
129
+ context
130
+ )
131
+
132
+ # If there's a detection result for this image, handle it
133
+ if i in detection_map and detection_map[i]:
134
+ # We found table(s)/chart(s) in the image
135
+ for cropped_item in detection_map[i]:
136
+ structured_entry = construct_page_element_metadata(
137
+ structured_image=cropped_item,
138
+ page_idx=slide_idx,
139
+ page_count=slide_count,
140
+ source_metadata=source_metadata,
141
+ base_unified_metadata=base_unified_metadata,
142
+ )
143
+ extracted_data.append(structured_entry)
144
+ else:
145
+ # No table detected => build normal image metadata
146
+ image_entry = _construct_image_metadata(
147
+ shape_idx=shape_idx,
148
+ slide_idx=slide_idx,
149
+ slide_count=slide_count,
150
+ page_nearby_blocks=page_nearby_blocks,
151
+ base64_img=base64_img,
152
+ source_metadata=source_metadata,
153
+ base_unified_metadata=base_unified_metadata,
154
+ )
155
+ extracted_data.append(image_entry)
156
+
157
+
158
+ # -----------------------------------------------------------------------------
159
+ # Helper Function: Recursive Image Extraction
160
+ # -----------------------------------------------------------------------------
161
def process_shape(
    shape, shape_idx, slide_idx, slide_count, pending_images, page_nearby_blocks, source_metadata, base_unified_metadata
):
    """
    Recursively walk a shape and queue any embedded images.

    Group shapes are descended into, with child indices rendered as composite
    strings (e.g. "2.1" for the first child of shape 2). Pictures — and OBJECT
    placeholders that carry an image — are appended to `pending_images` for
    deferred processing by `_finalize_images`.
    """
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        # Recurse into the group's children with composite indices.
        for child_pos, child in enumerate(shape.shapes):
            process_shape(
                child,
                f"{shape_idx}.{child_pos}",
                slide_idx,
                slide_count,
                pending_images,
                page_nearby_blocks,
                source_metadata,
                base_unified_metadata,
            )
        return

    # A shape qualifies when it is a picture, or an OBJECT placeholder that
    # actually exposes an image attribute. Short-circuiting preserves the
    # original evaluation order.
    qualifies = shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
        shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image")
    )
    if not qualifies:
        return

    try:
        # The shape itself is retained so shape.image.blob can be pulled later.
        pending_images.append(
            (
                shape,
                shape_idx,
                slide_idx,
                slide_count,
                page_nearby_blocks,
                source_metadata,
                base_unified_metadata,
            )
        )
    except Exception as e:
        logger.warning(f"Error processing shape {shape_idx} on slide {slide_idx}: {e}")
        raise
202
+
203
+
204
+ # -----------------------------------------------------------------------------
205
+ # Main Extraction Function
206
+ # -----------------------------------------------------------------------------
207
def python_pptx(
    *,
    pptx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Uses python-pptx to extract text from a PPTX bytestream, while deferring image
    classification into tables/charts if requested.

    Parameters
    ----------
    pptx_stream : IO
        Byte stream of the PPTX document, handed directly to ``Presentation``.
    extract_text : bool
        Emit text entries at the granularity given by
        ``extraction_config["text_depth"]`` (span/line/block/page/document).
    extract_images : bool
        Queue slide images for deferred processing by ``_finalize_images``.
    extract_infographics : bool
        Currently unused placeholder.
    extract_tables : bool
        Emit native PPTX tables and enable table detection on queued images.
    extract_charts : bool
        Enable chart detection on queued images.
    extraction_config : dict
        Must contain ``row_data`` with a ``source_id`` entry; other keys
        (``text_depth``, ``paragraph_format``, ``identify_nearby_objects``,
        ``metadata_column``, ``pptx_extraction_config``, ``trace_info``) are
        optional with defaults below.
    execution_trace_log : Optional[List]
        Currently unused placeholder.

    Returns
    -------
    list
        List of ``[content_type, metadata_dict, uuid_str]`` triples.
    """

    _ = extract_infographics  # Placeholder for future use
    _ = execution_trace_log  # Placeholder for future use

    row_data = extraction_config.get("row_data")
    source_id = row_data["source_id"]

    # Map the configured depth string onto the enum (e.g. "page" -> PAGE).
    text_depth = extraction_config.get("text_depth", "page")
    text_depth = TextTypeEnum[text_depth.upper()]

    paragraph_format = extraction_config.get("paragraph_format", "markdown")
    identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)

    metadata_col = extraction_config.get("metadata_column", "metadata")
    pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
    trace_info = extraction_config.get("trace_info", {})

    # NOTE(review): row_data appears to be a pandas Series (`.index` membership
    # test below) — confirm against the caller.
    base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    presentation = Presentation(pptx_stream)

    # Collect source metadata from the core properties of the document.
    # Missing timestamps fall back to "now".
    last_modified = (
        presentation.core_properties.modified.isoformat()
        if presentation.core_properties.modified
        else datetime.now().isoformat()
    )
    date_created = (
        presentation.core_properties.created.isoformat()
        if presentation.core_properties.created
        else datetime.now().isoformat()
    )
    keywords = presentation.core_properties.keywords
    source_type = DocumentTypeEnum.PPTX
    source_metadata = {
        "source_name": source_id,  # python-pptx doesn't maintain filename; re-use source_id
        "source_id": source_id,
        "source_location": source_location,
        "source_type": source_type,
        "collection_id": collection_id,
        "date_created": date_created,
        "last_modified": last_modified,
        "summary": "",
        "partition_id": partition_id,
        "access_level": access_level,
    }

    slide_count = len(presentation.slides)

    # accumulated_text buffers run texts until the configured depth flushes it.
    accumulated_text = []
    extracted_data = []

    # Hold images here for final classification.
    # Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
    # base_unified_metadata)
    pending_images = []

    for slide_idx, slide in enumerate(presentation.slides):
        # Obtain a flat list of shapes (ungrouped) sorted by top then left,
        # approximating reading order.
        shapes = sorted(ungroup_shapes(slide.shapes), key=operator.attrgetter("top", "left"))

        # Per-slide context describing objects near each extracted image.
        page_nearby_blocks = {
            "text": {"content": [], "bbox": []},
            "images": {"content": [], "bbox": []},
            "structured": {"content": [], "bbox": []},
        }

        for shape_idx, shape in enumerate(shapes):
            block_text = []
            # Title/subtitle are emitted once per shape even across many runs.
            added_title = added_subtitle = False

            # ---------------------------------------------
            # 1) Text Extraction
            # ---------------------------------------------
            if extract_text and shape.has_text_frame:
                for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
                    if not paragraph.text.strip():
                        continue

                    for run_idx, run in enumerate(paragraph.runs):
                        text = run.text
                        if not text:
                            continue

                        text = escape_text(text)

                        if paragraph_format == "markdown":
                            # Titles/subtitles are rendered once; further runs in
                            # the same shape are skipped.
                            if is_title(shape):
                                if not added_title:
                                    text = process_title(shape)
                                    added_title = True
                                else:
                                    continue
                            elif is_subtitle(shape):
                                if not added_subtitle:
                                    text = process_subtitle(shape)
                                    added_subtitle = True
                                else:
                                    continue
                            else:
                                # Regular run: apply hyperlink and emphasis markup.
                                if run.hyperlink.address:
                                    text = get_hyperlink(text, run.hyperlink.address)
                                if is_accent(paragraph.font) or is_accent(run.font):
                                    text = format_text(text, italic=True)
                                elif is_strong(paragraph.font) or is_strong(run.font):
                                    text = format_text(text, bold=True)
                                elif is_underlined(paragraph.font) or is_underlined(run.font):
                                    text = format_text(text, underline=True)
                                if is_list_block(shape):
                                    text = " " * paragraph.level + "* " + text

                        accumulated_text.append(text)

                        # For "nearby objects", store block text.
                        if extract_images and identify_nearby_objects:
                            block_text.append(text)

                        # If we only want text at SPAN level, flush after each run.
                        if text_depth == TextTypeEnum.SPAN:
                            text_extraction = _construct_text_metadata(
                                presentation,
                                shape,
                                accumulated_text,
                                keywords,
                                slide_idx,
                                shape_idx,
                                paragraph_idx,
                                run_idx,
                                slide_count,
                                text_depth,
                                source_metadata,
                                base_unified_metadata,
                            )
                            if len(text_extraction) > 0:
                                extracted_data.append(text_extraction)
                            accumulated_text = []

                    # Add newlines for separation at line/paragraph level.
                    if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
                        accumulated_text.append("\n\n")

                    # LINE depth: flush once per paragraph (run index is -1).
                    if text_depth == TextTypeEnum.LINE:
                        text_extraction = _construct_text_metadata(
                            presentation,
                            shape,
                            accumulated_text,
                            keywords,
                            slide_idx,
                            shape_idx,
                            paragraph_idx,
                            -1,
                            slide_count,
                            text_depth,
                            source_metadata,
                            base_unified_metadata,
                        )
                        if len(text_extraction) > 0:
                            extracted_data.append(text_extraction)
                        accumulated_text = []

                # BLOCK depth: flush once per shape (paragraph/run indices -1).
                if text_depth == TextTypeEnum.BLOCK:
                    text_extraction = _construct_text_metadata(
                        presentation,
                        shape,
                        accumulated_text,
                        keywords,
                        slide_idx,
                        shape_idx,
                        -1,
                        -1,
                        slide_count,
                        text_depth,
                        source_metadata,
                        base_unified_metadata,
                    )
                    if len(text_extraction) > 0:
                        extracted_data.append(text_extraction)
                    accumulated_text = []

            # Record this shape's text and bbox as "nearby" context for images.
            if extract_images and identify_nearby_objects and block_text:
                page_nearby_blocks["text"]["content"].append("".join(block_text))
                page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))

            # ---------------------------------------------
            # 2) Image Handling (DEFERRED) with nested/group shapes
            # ---------------------------------------------
            if extract_images:
                process_shape(
                    shape,
                    shape_idx,
                    slide_idx,
                    slide_count,
                    pending_images,
                    page_nearby_blocks,
                    source_metadata,
                    base_unified_metadata,
                )

            # ---------------------------------------------
            # 3) Table Handling
            # ---------------------------------------------
            if extract_tables and shape.has_table:
                table_extraction = _construct_table_metadata(
                    shape, slide_idx, slide_count, source_metadata, base_unified_metadata
                )
                extracted_data.append(table_extraction)

        # PAGE depth: flush once per slide.
        if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
            text_extraction = _construct_text_metadata(
                presentation,
                shape,  # may pass None if preferred
                accumulated_text,
                keywords,
                slide_idx,
                -1,
                -1,
                -1,
                slide_count,
                text_depth,
                source_metadata,
                base_unified_metadata,
            )
            if len(text_extraction) > 0:
                extracted_data.append(text_extraction)
            accumulated_text = []

    # DOCUMENT depth: flush once for the whole presentation.
    if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
        text_extraction = _construct_text_metadata(
            presentation,
            shape,  # may pass None
            accumulated_text,
            keywords,
            -1,
            -1,
            -1,
            -1,
            slide_count,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )
        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)
        accumulated_text = []

    # ---------------------------------------------
    # FINAL STEP: Finalize images (and tables/charts)
    # ---------------------------------------------
    if extract_images or extract_tables or extract_charts:
        _finalize_images(
            pending_images,
            extracted_data,
            pptx_extractor_config,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            trace_info=trace_info,
        )

    return extracted_data
486
+
487
+
488
def _construct_text_metadata(
    presentation_object,
    shape_object,
    accumulated_text,
    keywords,
    slide_idx,
    shape_idx,
    paragraph_idx,
    run_idx,
    slide_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
):
    """
    Assemble a validated TEXT entry from accumulated run text.

    Joins `accumulated_text`, detects its language, computes the bbox for the
    configured depth, merges everything over `base_unified_metadata`, and
    returns a [content_type, metadata_dict, uuid_str] triple.
    """
    joined_text = "".join(accumulated_text)

    hierarchy = {
        "page_count": slide_count,
        "page": slide_idx,
        "block": shape_idx,
        "line": paragraph_idx,
        "span": run_idx,
    }
    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PPTX_TEXT,
        "page_number": slide_idx,
        "hierarchy": hierarchy,
    }

    detected_language = detect_language(joined_text)
    location = get_bbox(
        presentation_object=presentation_object,
        shape_object=shape_object,
        text_depth=text_depth,
    )

    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": detected_language,
        "text_location": location,
    }

    # Layer the new fields over a copy of the caller-provided base metadata.
    merged_metadata = {
        **base_unified_metadata,
        "content": joined_text,
        "source_metadata": source_metadata,
        "content_metadata": content_metadata,
        "text_metadata": text_metadata,
    }

    validated = validate_metadata(merged_metadata)
    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
546
+
547
+
548
+ # need to add block text to hierarchy/nearby_objects, including bbox
549
# need to add block text to hierarchy/nearby_objects, including bbox
def _construct_image_metadata(
    shape_idx: int,
    slide_idx: int,
    slide_count: int,
    page_nearby_blocks: Dict,
    base64_img: str,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build a validated IMAGE entry for one extracted PPTX picture.

    Returns a [content_type_value, metadata_dict, uuid_str] triple.
    """
    # Per-shape bounding boxes are not tracked for raw pictures yet; a zeroed
    # box stands in (shape.left/top/width/height could supply one later).
    placeholder_bbox = (0, 0, 0, 0)

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PPTX_IMAGE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "block": shape_idx,
            "line": -1,
            "span": -1,
            "nearby_objects": page_nearby_blocks,
        },
    }

    image_metadata = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        # Caption left empty; nearby text could be mined for one later.
        "caption": "",
        "text": "",
        "image_location": placeholder_bbox,
    }

    merged_metadata = dict(base_unified_metadata) if base_unified_metadata else {}
    merged_metadata["content"] = base64_img
    merged_metadata["source_metadata"] = source_metadata
    merged_metadata["content_metadata"] = content_metadata
    merged_metadata["image_metadata"] = image_metadata

    validated = validate_metadata(merged_metadata)

    return [
        ContentTypeEnum.IMAGE.value,
        validated.model_dump(),
        str(uuid.uuid4()),
    ]
603
+
604
+
605
def _construct_table_metadata(
    shape,
    slide_idx: int,
    slide_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build a validated STRUCTURED (table) entry from a native PPTX table shape.

    The first table row is treated as the header; the body is rendered to
    markdown. Returns a [content_type, metadata_dict, uuid_str] triple.
    """
    rows = [[cell.text for cell in row.cells] for row in shape.table.rows]
    df = pd.DataFrame(rows[1:], columns=rows[0])
    # The frame is rendered to markdown below, so collapse any newlines, tabs,
    # or repeated spaces in the header names to single spaces.
    df.columns = df.columns.str.replace(r"\s+", " ", regex=True)

    bbox = get_bbox(shape_object=shape)

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": ContentDescriptionEnum.PPTX_TABLE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "line": -1,
            "span": -1,
        },
        "subtype": ContentTypeEnum.TABLE,
    }
    table_metadata = {
        "caption": "",
        "table_format": TableFormatEnum.MARKDOWN,
        "table_location": bbox,
        "table_content": df.to_markdown(index=False),
    }

    merged_metadata = {
        **base_unified_metadata,
        "content": "",
        "source_metadata": source_metadata,
        "content_metadata": content_metadata,
        "table_metadata": table_metadata,
    }

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.STRUCTURED, validated.model_dump(), str(uuid.uuid4())]
652
+
653
+
654
def get_bbox(
    presentation_object: Optional[Presentation] = None,
    shape_object: Optional[Slide] = None,
    text_depth: Optional[TextTypeEnum] = None,
):
    """
    Compute a ``(top, left, bottom, right)`` bounding box.

    - DOCUMENT depth: sentinel box of all ``-1``.
    - PAGE depth: the full slide area from ``presentation_object``.
    - Otherwise, when ``shape_object`` is given: that object's extent.
    - Fallback: sentinel box of all ``-1``.

    NOTE(review): callers pass a *shape* as ``shape_object`` despite the
    ``Slide`` annotation — confirm and correct the hint upstream.
    """
    if text_depth == TextTypeEnum.DOCUMENT:
        return (-1, -1, -1, -1)
    if text_depth == TextTypeEnum.PAGE:
        # Whole-slide extent anchored at the origin.
        return (0, 0, presentation_object.slide_height, presentation_object.slide_width)
    if shape_object:
        return (
            shape_object.top,
            shape_object.left,
            shape_object.top + shape_object.height,
            shape_object.left + shape_object.width,
        )
    return (-1, -1, -1, -1)
674
+
675
+
676
def ungroup_shapes(shapes):
    """Recursively flatten grouped shapes into a flat list of leaf shapes."""
    flattened = []
    for item in shapes:
        is_group = item.shape_type == MSO_SHAPE_TYPE.GROUP
        # A group contributes its (recursively flattened) children, in order;
        # a leaf contributes itself.
        flattened += ungroup_shapes(item.shapes) if is_group else [item]
    return flattened
684
+
685
+
686
def is_title(shape):
    """Return True when *shape* is a title placeholder (regular, vertical, or centered)."""
    if not shape.is_placeholder:
        return False
    return shape.placeholder_format.type in (
        PP_PLACEHOLDER.TITLE,
        PP_PLACEHOLDER.VERTICAL_TITLE,
        PP_PLACEHOLDER.CENTER_TITLE,
    )
695
+
696
+
697
def process_title(shape):
    """Render a title placeholder as a setext-style markdown heading (underlined with '=')."""
    heading = shape.text_frame.text.strip()
    underline = "=" * len(heading)
    return f"{heading}\n{underline}"
701
+
702
+
703
def is_subtitle(shape):
    """Return True when *shape* is a subtitle placeholder."""
    if not shape.is_placeholder:
        return False
    return shape.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE
708
+
709
+
710
def process_subtitle(shape):
    """Render a subtitle placeholder as a setext-style markdown heading (underlined with '-')."""
    heading = shape.text_frame.text.strip()
    return heading + "\n" + "-" * len(heading)
714
+
715
+
716
def is_list_block(shape):
    """
    Heuristic list detection: a text frame is treated as a list when any
    paragraph is indented (level != 0) or when more than one distinct
    indentation level appears.
    """
    seen_levels = set()
    for para in shape.text_frame.paragraphs:
        seen_levels.add(para.level)
        if para.level != 0 or len(seen_levels) > 1:
            return True
    return False
724
+
725
+
726
def escape_text(text):
    """
    Backslash-escape markdown control characters and inline ``<...>`` tags.

    Escaped characters: ``\\ * ` ! _ { } [ ] ( ) # + - .``  A whole
    ``<...>`` run is also prefixed with a backslash so it is not parsed
    as HTML.

    Parameters
    ----------
    text : str
        Raw text destined for markdown output.

    Returns
    -------
    str
        The escaped text.
    """

    def escape_repl(match_obj):
        return "\\" + match_obj.group(0)

    # BUG FIX: the previous character class contained the unescaped sequence
    # ``+-.`` which formed the range '+'..'.' and therefore escaped commas
    # too. The hyphen is now escaped so only the intended characters match.
    escape_regex_1 = re.compile(r"([\\*`!_{}\[\]()#+\-.])")
    escape_regex_2 = re.compile(r"(<[^>]+>)")
    text = escape_regex_1.sub(escape_repl, text)
    text = escape_regex_2.sub(escape_repl, text)

    return text
736
+
737
+
738
def get_hyperlink(text, url):
    """Format *text* and *url* as an inline markdown link."""
    return "[" + text + "](" + url + ")"
741
+
742
+
743
def is_accent(font):
    """
    Treat italic text, or scheme-colored text using any of the theme accent
    colors (ACCENT_1..ACCENT_6), as "accent" styling.
    """
    if font.italic:
        return True
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    return font.color.theme_color in (
        MSO_THEME_COLOR.ACCENT_1,
        MSO_THEME_COLOR.ACCENT_2,
        MSO_THEME_COLOR.ACCENT_3,
        MSO_THEME_COLOR.ACCENT_4,
        MSO_THEME_COLOR.ACCENT_5,
        MSO_THEME_COLOR.ACCENT_6,
    )
758
+
759
+
760
def is_underlined(font):
    """Return True when the font carries underline formatting."""
    return bool(font.underline)
765
+
766
+
767
def format_text(text: str, bold: bool = False, italic: bool = False, underline: bool = False) -> str:
    """
    Wrap *text* in markdown/HTML emphasis markers.

    Leading and trailing whitespace stays outside the markers, so
    ``"  hi "`` becomes ``"  **hi** "`` rather than ``"**  hi **"``.
    Whitespace-only input is returned unchanged.
    """
    if not text.strip():
        return text

    # Split off surrounding whitespace; only the core is styled.
    core = text.strip()
    prefix = text[: len(text) - len(text.lstrip())]
    suffix = text[len(text.rstrip()):]

    if bold:
        core = f"**{core}**"
    if italic:
        core = f"*{core}*"
    if underline:
        core = f"<u>{core}</u>"

    return prefix + core + suffix
790
+
791
+
792
def is_strong(font):
    """
    Treat bold text, or scheme-colored text using the DARK_1/DARK_2 theme
    colors, as "strong" styling.
    """
    if font.bold:
        return True
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    return font.color.theme_color in (MSO_THEME_COLOR.DARK_1, MSO_THEME_COLOR.DARK_2)