nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177):
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,968 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import io
19
+ import logging
20
+ import os
21
+ import re
22
+ import subprocess
23
+ import tempfile
24
+ import uuid
25
+ from collections import defaultdict
26
+ from datetime import datetime
27
+ from typing import Dict, List, Tuple, IO
28
+ from typing import Optional
29
+ from typing import Union
30
+
31
+ import pandas as pd
32
+ from pptx import Presentation
33
+ from pptx.enum.dml import MSO_COLOR_TYPE
34
+ from pptx.enum.dml import MSO_THEME_COLOR # noqa
35
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
36
+ from pptx.enum.shapes import PP_PLACEHOLDER # noqa
37
+ from pptx.slide import Slide
38
+ import pypdfium2 as pdfium
39
+
40
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
41
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
42
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
43
+ from nv_ingest_api.internal.enums.common import TableFormatEnum
44
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
45
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
46
+ from nv_ingest_api.internal.extract.image.image_helpers.common import (
47
+ load_and_preprocess_image,
48
+ extract_page_elements_from_images,
49
+ )
50
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
51
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXConfigSchema
52
+ from nv_ingest_api.util.converters import bytetools
53
+ from nv_ingest_api.util.detectors.language import detect_language
54
+ from nv_ingest_api.util.metadata.aggregators import construct_page_element_metadata
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+
59
def _finalize_images(
    pending_images: List[Tuple[bytes, int, int, int, dict, dict, dict]],
    extracted_data: List,
    pptx_extraction_config: PPTXConfigSchema,
    extract_tables: bool = False,
    extract_charts: bool = False,
    extract_infographics: bool = False,
    extract_images: bool = False,
    trace_info: Optional[Dict] = None,
):
    """
    Post-process all pending images collected while walking the slides.

    - Convert each queued shape image into a NumPy array and a base64 string.
    - If `extract_tables`, `extract_charts`, or `extract_infographics` is set,
      run page-element detection over the whole batch of images.
    - For each image, emit either structured (table/chart/infographic) metadata
      or, when nothing was detected and `extract_images` is set, plain image
      metadata. Entries are appended to `extracted_data` in place; nothing is
      returned.

    This mimics the docx approach, but adapted for python-pptx shapes.
    """
    # Nothing queued: nothing to do.
    if not pending_images:
        return

    # Convert each shape to image data (base64 or ndarray).
    # Arrays and contexts are appended in lockstep, so a detection result keyed
    # by batch index lines up with the context tuple at the same index.
    image_arrays = []
    image_contexts = []
    for (
        image_bytes,
        shape_idx,
        slide_idx,
        slide_count,
        page_nearby_blocks,
        source_metadata,
        base_unified_metadata,
    ) in pending_images:
        try:
            image_array = load_and_preprocess_image(io.BytesIO(image_bytes))
            base64_img = bytetools.base64frombytes(image_bytes)

            image_arrays.append(image_array)
            image_contexts.append(
                (
                    shape_idx,
                    slide_idx,
                    slide_count,
                    page_nearby_blocks,
                    source_metadata,
                    base_unified_metadata,
                    base64_img,
                )
            )
        except Exception as e:
            # Best-effort: one undecodable image must not abort the batch.
            logger.warning(f"Unable to process shape image: {e}")

    # Optional table/chart/infographic detection over the converted images.
    detection_map = defaultdict(list)  # image_idx -> list of CroppedImageWithContent
    if extract_tables or extract_charts or extract_infographics:
        try:
            # Batched call to the page-element detection model.
            detection_results = extract_page_elements_from_images(
                images=image_arrays,
                config=ImageConfigSchema(**(pptx_extraction_config.model_dump())),
                trace_info=trace_info,
            )
            # detection_results is something like [(image_idx, CroppedImageWithContent), ...]
            for img_idx, cropped_obj in detection_results:

                # Skip element types whose extraction flag is disabled.
                element_type = cropped_obj.type_string
                if (not extract_tables) and (element_type == "table"):
                    continue
                if (not extract_charts) and (element_type == "chart"):
                    continue
                if (not extract_infographics) and (element_type == "infographic"):
                    continue

                detection_map[img_idx].append(cropped_obj)
        except Exception as e:
            # Detection failure degrades gracefully: every image falls through
            # to the plain-image branch below.
            logger.error(f"Error while running table/chart detection on PPTX images: {e}")
            detection_map = {}

    # Build the final metadata entries.
    for i, context in enumerate(image_contexts):
        (shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata, base64_img) = (
            context
        )

        # If there's a detection result for this image, handle it.
        if i in detection_map and detection_map[i]:
            # We found table(s)/chart(s) in the image: one structured entry each.
            for cropped_item in detection_map[i]:
                structured_entry = construct_page_element_metadata(
                    structured_image=cropped_item,
                    page_idx=slide_idx,
                    page_count=slide_count,
                    source_metadata=source_metadata,
                    base_unified_metadata=base_unified_metadata,
                )
                extracted_data.append(structured_entry)
        else:
            # No structured element detected => build normal image metadata.
            if extract_images:
                image_entry = _construct_image_metadata(
                    shape_idx=shape_idx,
                    slide_idx=slide_idx,
                    slide_count=slide_count,
                    page_nearby_blocks=page_nearby_blocks,
                    base64_img=base64_img,
                    source_metadata=source_metadata,
                    base_unified_metadata=base_unified_metadata,
                )
                extracted_data.append(image_entry)
171
+
172
+
173
+ def _safe_position(shape):
174
+ top = shape.top if shape.top is not None else float("inf")
175
+ left = shape.left if shape.left is not None else float("inf")
176
+ return (top, left)
177
+
178
+
179
+ # -----------------------------------------------------------------------------
180
+ # Helper Function: Recursive Image Extraction
181
+ # -----------------------------------------------------------------------------
182
def process_shape(
    shape, shape_idx, slide_idx, slide_count, pending_images, page_nearby_blocks, source_metadata, base_unified_metadata
):
    """
    Recursively walk a shape and queue any images it carries.

    - Group shapes: recurse into each child with a composite index ("2.1").
    - Pictures and object placeholders with an embedded image: queue the blob.
    - Embedded OLE objects: convert via LibreOffice and queue each page image,
      falling back to the OLE preview image when conversion fails.
    """

    def _queue(blob, idx):
        # Defer actual processing: just record the bytes plus context.
        pending_images.append(
            (blob, idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata)
        )

    shape_kind = shape.shape_type

    if shape_kind == MSO_SHAPE_TYPE.GROUP:
        for child_pos, child in enumerate(shape.shapes):
            # Composite index encodes the nesting (e.g., "2.1" = child 1 of shape 2).
            process_shape(
                child,
                f"{shape_idx}.{child_pos}",
                slide_idx,
                slide_count,
                pending_images,
                page_nearby_blocks,
                source_metadata,
                base_unified_metadata,
            )
        return

    holds_picture = shape_kind == MSO_SHAPE_TYPE.PICTURE or (
        shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image")
    )
    if holds_picture:
        try:
            _queue(shape.image.blob, shape_idx)
        except Exception as e:
            logger.warning(f"Error processing shape {shape_idx} on slide {slide_idx}: {e}")
            # Propagate so the caller's per-shape handler sees the failure.
            raise
        return

    if shape_kind == MSO_SHAPE_TYPE.EMBEDDED_OLE_OBJECT:
        try:
            ole_blob = shape.ole_format.blob
            if not ole_blob:
                return

            ext = _get_ole_extension(getattr(shape.ole_format, "prog_id", ""))

            # Only office/pdf payloads are convertible to page images.
            if ext in {"docx", "pptx", "xlsx", "pdf"}:
                for converted in convert_stream_with_libreoffice(io.BytesIO(ole_blob), ext, "png"):
                    _queue(converted.getvalue(), shape_idx)
        except Exception as e:
            logger.warning(f"Failed to convert OLE object (shape {shape_idx}, slide {slide_idx}) via LibreOffice: {e}")
            # Fallback: use the standard preview image if the shape exposes one.
            if hasattr(shape, "image"):
                try:
                    _queue(shape.image.blob, shape_idx)
                except Exception as fallback_err:
                    logger.warning(f"Fallback to OLE preview image failed: {fallback_err}")
266
+
267
+
268
+ # -----------------------------------------------------------------------------
269
+ # Main Extraction Function
270
+ # -----------------------------------------------------------------------------
271
def python_pptx(
    *,
    pptx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Extract content from a PPTX stream into a list of metadata entries.

    Parameters
    ----------
    pptx_stream : IO
        Open binary stream of the PPTX document, handed to python-pptx.
    extract_text, extract_images, extract_infographics, extract_tables, extract_charts : bool
        Flags selecting which content categories to emit.
    extraction_config : dict
        Must contain "row_data" (with a "source_id" key); may also provide
        "text_depth", "paragraph_format", "identify_nearby_objects",
        "metadata_column", "pptx_extraction_config", and "trace_info".
    execution_trace_log : Optional[List]
        Accepted for interface compatibility; unused here.

    Returns
    -------
    list
        Entries of the form [content_type, metadata_dict, uuid_str];
        an empty list if the presentation cannot be opened.
    """
    # Accepted for interface compatibility only.
    _ = extract_infographics
    _ = execution_trace_log

    row_data = extraction_config.get("row_data")
    source_id = row_data["source_id"]

    # Granularity at which accumulated text is flushed: SPAN/LINE/BLOCK/PAGE/DOCUMENT.
    text_depth = TextTypeEnum[extraction_config.get("text_depth", "page").upper()]
    paragraph_format = extraction_config.get("paragraph_format", "markdown")
    identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)

    metadata_col = extraction_config.get("metadata_column", "metadata")
    pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
    trace_info = extraction_config.get("trace_info", {})

    # Seed source metadata from whatever the caller attached to the row.
    base_unified_metadata = row_data.get(metadata_col, {})
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    try:
        presentation = Presentation(pptx_stream)
    except Exception as e:
        logger.error("Failed to open PPTX presentation: %s", e)
        return []

    # Fall back to "now" when core properties carry no dates.
    last_modified = (
        presentation.core_properties.modified.isoformat()
        if presentation.core_properties.modified
        else datetime.now().isoformat()
    )
    date_created = (
        presentation.core_properties.created.isoformat()
        if presentation.core_properties.created
        else datetime.now().isoformat()
    )
    keywords = presentation.core_properties.keywords
    source_metadata = {
        "source_name": source_id,
        "source_id": source_id,
        "source_location": source_location,
        "source_type": DocumentTypeEnum.PPTX,
        "collection_id": collection_id,
        "date_created": date_created,
        "last_modified": last_modified,
        "summary": "",
        "partition_id": partition_id,
        "access_level": access_level,
    }

    slide_count = len(presentation.slides)
    accumulated_text = []
    extracted_data = []
    # Images are queued here and finalized in one batch by _finalize_images below.
    pending_images = []

    for slide_idx, slide in enumerate(presentation.slides):
        try:
            # Flatten grouped shapes, then order them top-to-bottom, left-to-right.
            shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
        except Exception as e:
            logger.error("Slide %d: Failed to ungroup or sort shapes: %s", slide_idx, e)
            continue

        # Per-slide record of neighboring content, later attached to image metadata.
        page_nearby_blocks = {
            "text": {"content": [], "bbox": []},
            "images": {"content": [], "bbox": []},
            "structured": {"content": [], "bbox": []},
        }

        for shape_idx, shape in enumerate(shapes):
            try:
                block_text = []
                # Title/subtitle markdown is emitted once per shape.
                added_title = added_subtitle = False

                # Text extraction
                if extract_text and shape.has_text_frame:
                    for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
                        if not paragraph.text.strip():
                            continue

                        for run_idx, run in enumerate(paragraph.runs):
                            try:
                                text = run.text
                                if not text:
                                    continue

                                text = escape_text(text)

                                if paragraph_format == "markdown":
                                    # Titles/subtitles replace run text once; later
                                    # runs of the same shape are skipped.
                                    if is_title(shape) and not added_title:
                                        text = process_title(shape)
                                        added_title = True
                                    elif is_subtitle(shape) and not added_subtitle:
                                        text = process_subtitle(shape)
                                        added_subtitle = True
                                    elif is_title(shape) or is_subtitle(shape):
                                        continue  # already added

                                    if run.hyperlink and run.hyperlink.address:
                                        text = get_hyperlink(text, run.hyperlink.address)
                                    if is_accent(paragraph.font) or is_accent(run.font):
                                        text = format_text(text, italic=True)
                                    elif is_strong(paragraph.font) or is_strong(run.font):
                                        text = format_text(text, bold=True)
                                    elif is_underlined(paragraph.font) or is_underlined(run.font):
                                        text = format_text(text, underline=True)
                                    if is_list_block(shape):
                                        text = " " * paragraph.level + "* " + text

                                accumulated_text.append(text)
                                if extract_images and identify_nearby_objects:
                                    block_text.append(text)

                                # SPAN depth: flush after every run.
                                if text_depth == TextTypeEnum.SPAN:
                                    extracted_data.append(
                                        _construct_text_metadata(
                                            presentation,
                                            shape,
                                            accumulated_text,
                                            keywords,
                                            slide_idx,
                                            shape_idx,
                                            paragraph_idx,
                                            run_idx,
                                            slide_count,
                                            text_depth,
                                            source_metadata,
                                            base_unified_metadata,
                                        )
                                    )
                                    accumulated_text = []

                            except Exception as e:
                                logger.warning(
                                    "Slide %d Shape %d Run %d: Failed to process run: %s",
                                    slide_idx,
                                    shape_idx,
                                    run_idx,
                                    e,
                                )

                        # Keep paragraphs separated by a blank line in the accumulator.
                        if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
                            accumulated_text.append("\n\n")

                        # LINE depth: flush after every paragraph.
                        if text_depth == TextTypeEnum.LINE:
                            extracted_data.append(
                                _construct_text_metadata(
                                    presentation,
                                    shape,
                                    accumulated_text,
                                    keywords,
                                    slide_idx,
                                    shape_idx,
                                    paragraph_idx,
                                    -1,
                                    slide_count,
                                    text_depth,
                                    source_metadata,
                                    base_unified_metadata,
                                )
                            )
                            accumulated_text = []

                    # BLOCK depth: flush after every shape with a text frame.
                    if text_depth == TextTypeEnum.BLOCK:
                        extracted_data.append(
                            _construct_text_metadata(
                                presentation,
                                shape,
                                accumulated_text,
                                keywords,
                                slide_idx,
                                shape_idx,
                                -1,
                                -1,
                                slide_count,
                                text_depth,
                                source_metadata,
                                base_unified_metadata,
                            )
                        )
                        accumulated_text = []

                # Record this shape's text as "nearby" context for images on the slide.
                if extract_images and identify_nearby_objects and block_text:
                    page_nearby_blocks["text"]["content"].append("".join(block_text))
                    page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))

                # Image processing (deferred)
                if extract_images or extract_tables or extract_charts or extract_infographics:
                    try:
                        process_shape(
                            shape,
                            shape_idx,
                            slide_idx,
                            slide_count,
                            pending_images,
                            page_nearby_blocks,
                            source_metadata,
                            base_unified_metadata,
                        )
                    except Exception as e:
                        logger.warning("Slide %d Shape %d: Failed to process image shape: %s", slide_idx, shape_idx, e)

                # Table extraction
                if extract_tables and shape.has_table:
                    try:
                        extracted_data.append(
                            _construct_table_metadata(
                                shape, slide_idx, slide_count, source_metadata, base_unified_metadata
                            )
                        )
                    except Exception as e:
                        logger.warning("Slide %d Shape %d: Failed to extract table: %s", slide_idx, shape_idx, e)

            except Exception as e:
                # Per-shape failures are isolated so one bad shape cannot sink the slide.
                logger.warning("Slide %d Shape %d: Top-level failure: %s", slide_idx, shape_idx, e)

        # PAGE depth: flush once per slide.
        if extract_text and text_depth == TextTypeEnum.PAGE and accumulated_text:
            extracted_data.append(
                _construct_text_metadata(
                    presentation,
                    None,
                    accumulated_text,
                    keywords,
                    slide_idx,
                    -1,
                    -1,
                    -1,
                    slide_count,
                    text_depth,
                    source_metadata,
                    base_unified_metadata,
                )
            )
            accumulated_text = []

    # DOCUMENT depth: flush once for the whole presentation.
    if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
        extracted_data.append(
            _construct_text_metadata(
                presentation,
                None,
                accumulated_text,
                keywords,
                -1,
                -1,
                -1,
                -1,
                slide_count,
                text_depth,
                source_metadata,
                base_unified_metadata,
            )
        )

    # Batch-finalize queued images (conversion plus optional detection).
    if extract_images or extract_tables or extract_charts or extract_infographics:
        try:
            _finalize_images(
                pending_images,
                extracted_data,
                pptx_extractor_config,
                extract_tables=extract_tables,
                extract_charts=extract_charts,
                extract_infographics=extract_infographics,
                extract_images=extract_images,
                trace_info=trace_info,
            )
        except Exception as e:
            logger.error("Finalization of images failed: %s", e)

    return extracted_data
551
+
552
+
553
def _construct_text_metadata(
    presentation_object,
    shape_object,
    accumulated_text,
    keywords,
    slide_idx,
    shape_idx,
    paragraph_idx,
    run_idx,
    slide_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
):
    """
    Assemble one validated text entry for the extraction output.

    Joins the accumulated text fragments, attaches hierarchy/location/language
    metadata, validates the merged record, and returns a
    [ContentTypeEnum.TEXT, metadata_dict, uuid_str] triple.
    """
    joined_text = "".join(accumulated_text)

    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PPTX_TEXT,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "block": shape_idx,
            "line": paragraph_idx,
            "span": run_idx,
        },
    }

    # Language detection first, then bounding-box resolution (depth-dependent).
    language = detect_language(joined_text)
    location = get_bbox(
        presentation_object=presentation_object,
        shape_object=shape_object,
        text_depth=text_depth,
    )

    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": language,
        "text_location": location,
    }

    # Merge onto a copy so the caller's base metadata stays untouched.
    merged = dict(base_unified_metadata)
    merged["content"] = joined_text
    merged["source_metadata"] = source_metadata
    merged["content_metadata"] = content_metadata
    merged["text_metadata"] = text_metadata

    validated = validate_metadata(merged)
    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
611
+
612
+
613
+ # need to add block text to hierarchy/nearby_objects, including bbox
614
# need to add block text to hierarchy/nearby_objects, including bbox
def _construct_image_metadata(
    shape_idx: int,
    slide_idx: int,
    slide_count: int,
    page_nearby_blocks: Dict,
    base64_img: str,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build one validated PPTX image entry.

    Returns a [ContentTypeEnum.IMAGE.value, metadata_dict, uuid_str] triple
    with nearby-object context embedded in the hierarchy.
    """
    # Placeholder bounding box; the shape's real coordinates are not propagated here.
    zero_bbox = (0, 0, 0, 0)

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PPTX_IMAGE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "block": shape_idx,
            "line": -1,
            "span": -1,
            "nearby_objects": page_nearby_blocks,
        },
    }

    image_metadata = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        # Caption/text left empty; nearby text could be used to fill these later.
        "caption": "",
        "text": "",
        "image_location": zero_bbox,
    }

    # Merge onto a copy so the caller's base metadata stays untouched.
    merged = dict(base_unified_metadata) if base_unified_metadata else {}
    merged["content"] = base64_img
    merged["source_metadata"] = source_metadata
    merged["content_metadata"] = content_metadata
    merged["image_metadata"] = image_metadata

    validated = validate_metadata(merged)

    return [
        ContentTypeEnum.IMAGE.value,
        validated.model_dump(),
        str(uuid.uuid4()),
    ]
667
+ ]
668
+
669
+
670
def _construct_table_metadata(
    shape,
    slide_idx: int,
    slide_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build a validated STRUCTURED/TABLE metadata row from a PPTX table shape.

    The table's first row is treated as the header and the remainder as the
    body; the whole table is rendered to markdown for ``table_content``.
    """
    rows = [[cell.text for cell in row.cells] for row in shape.table.rows]
    df = pd.DataFrame(rows[1:], columns=rows[0])
    # Markdown rendering breaks when headers contain newlines/tabs/extra
    # spaces, so collapse all runs of whitespace in the column names.
    df.columns = df.columns.str.replace(r"\s+", " ", regex=True)

    table_location = get_bbox(shape_object=shape)

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": ContentDescriptionEnum.PPTX_TABLE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "line": -1,
            "span": -1,
        },
        "subtype": ContentTypeEnum.TABLE,
    }

    table_metadata = {
        "caption": "",
        "table_format": TableFormatEnum.MARKDOWN,
        "table_location": table_location,
        "table_content": df.to_markdown(index=False),
    }

    unified = dict(base_unified_metadata)
    unified["content"] = ""
    unified["source_metadata"] = source_metadata
    unified["content_metadata"] = content_metadata
    unified["table_metadata"] = table_metadata

    validated = validate_metadata(unified)
    return [ContentTypeEnum.STRUCTURED, validated.model_dump(), str(uuid.uuid4())]
717
+
718
+
719
def get_bbox(
    presentation_object: Optional[Presentation] = None,
    shape_object: Optional[Slide] = None,
    text_depth: Optional[TextTypeEnum] = None,
):
    """
    Safely compute a bounding box for a document, slide, or shape.

    Parameters
    ----------
    presentation_object : Presentation, optional
        Used for PAGE-depth boxes (full slide canvas).
    shape_object : optional
        A shape exposing ``top``/``left``/``width``/``height`` (EMU).
    text_depth : TextTypeEnum, optional
        Granularity of the text span being located.

    Returns
    -------
    Tuple[int, int, int, int]
        Bounding box as (top, left, bottom, right).
        (-1, -1, -1, -1) when the box is invalid or unsupported.
    """
    try:
        if text_depth == TextTypeEnum.DOCUMENT:
            # Document-level text has no meaningful bbox.
            return (-1, -1, -1, -1)

        if text_depth == TextTypeEnum.PAGE and presentation_object:
            # Page-level: the bbox is the full slide canvas.
            top = left = 0
            width = presentation_object.slide_width
            height = presentation_object.slide_height
            return (top, left, top + height, left + width)

        if shape_object:
            dims = (
                shape_object.top,
                shape_object.left,
                shape_object.width,
                shape_object.height,
            )
            # Test for the *missing* sentinel (None) explicitly rather than
            # mapping None -> -1 and checking `-1 in [...]`, which would also
            # reject a shape legitimately positioned at -1 EMU.
            if any(dim is None for dim in dims):
                return (-1, -1, -1, -1)
            top, left, width, height = dims
            return (top, left, top + height, left + width)

    except Exception as e:
        # Never let bbox computation break extraction; log (lazily) and
        # fall through to the placeholder box.
        logger.warning("get_bbox: Failed to compute bbox due to %s", e)

    # No branch matched (or computation failed): placeholder box.
    return (-1, -1, -1, -1)
761
+
762
+
763
def ungroup_shapes(shapes):
    """Recursively flatten grouped shapes into a flat list of leaf shapes.

    Group shapes are expanded depth-first; every non-group shape is kept
    in encounter order.
    """
    flattened = []
    for shp in shapes:
        if shp.shape_type == MSO_SHAPE_TYPE.GROUP:
            flattened += ungroup_shapes(shp.shapes)
        else:
            flattened.append(shp)
    return flattened
771
+
772
+
773
def is_title(shape):
    """Return True when the shape is a title-type placeholder
    (horizontal, vertical, or centered title)."""
    if not shape.is_placeholder:
        return False
    title_types = (
        PP_PLACEHOLDER.TITLE,
        PP_PLACEHOLDER.VERTICAL_TITLE,
        PP_PLACEHOLDER.CENTER_TITLE,
    )
    return shape.placeholder_format.type in title_types
782
+
783
+
784
def process_title(shape):
    """Render the shape's title text as a setext-style (``===``) markdown heading."""
    heading = shape.text_frame.text.strip()
    underline = "=" * len(heading)
    return f"{heading}\n{underline}"
788
+
789
+
790
def is_subtitle(shape):
    """Return True when the shape is a subtitle placeholder."""
    return bool(shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE)
795
+
796
+
797
def process_subtitle(shape):
    """Render the shape's subtitle text as a setext-style (``---``) markdown heading."""
    heading = shape.text_frame.text.strip()
    underline = "-" * len(heading)
    return f"{heading}\n{underline}"
801
+
802
+
803
def is_list_block(shape):
    """Heuristic list detector for a text frame.

    A frame counts as a list block when any paragraph is indented
    (``level != 0``) or when more than one indent level appears.
    """
    seen_levels = set()
    for para in shape.text_frame.paragraphs:
        seen_levels.add(para.level)
        if para.level != 0 or len(seen_levels) > 1:
            return True
    return False
811
+
812
+
813
def escape_text(text):
    """
    Backslash-escape markdown-significant punctuation and HTML-like tags.

    Two passes: first individual markdown control characters
    (``\\ * ` ! _ { } [ ] ( ) # + - .``), then whole ``<...>`` tags.

    Parameters
    ----------
    text : str
        Raw text extracted from a PPTX run.

    Returns
    -------
    str
        Text safe to embed in markdown output.
    """

    def escape_repl(match_obj):
        return "\\" + match_obj.group(0)

    # BUGFIX: the previous class used the unescaped sequence `\+-\.`,
    # which the regex engine reads as the character *range* '+'..'.'
    # and therefore also escaped ',' unintentionally. Escape each of
    # '+', '-', '.' individually instead.
    escape_regex_1 = re.compile(r"([\\\*`!_\{\}\[\]\(\)#\+\-\.])")
    escape_regex_2 = re.compile(r"(<[^>]+>)")
    text = re.sub(escape_regex_1, escape_repl, text)
    text = re.sub(escape_regex_2, escape_repl, text)

    return text
823
+
824
+
825
def get_hyperlink(text, url):
    """Format *text* and *url* as a markdown inline link ``[text](url)``."""
    return "[{}]({})".format(text, url)
828
+
829
+
830
def is_accent(font):
    """True when the run's font reads as an accent: italic, or colored
    with any of the theme ACCENT_1..ACCENT_6 scheme colors."""
    if font.italic:
        return True
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    accent_colors = (
        MSO_THEME_COLOR.ACCENT_1,
        MSO_THEME_COLOR.ACCENT_2,
        MSO_THEME_COLOR.ACCENT_3,
        MSO_THEME_COLOR.ACCENT_4,
        MSO_THEME_COLOR.ACCENT_5,
        MSO_THEME_COLOR.ACCENT_6,
    )
    return font.color.theme_color in accent_colors
845
+
846
+
847
def is_underlined(font):
    """True when the font carries any (truthy) underline setting."""
    return bool(font.underline)
852
+
853
+
854
def format_text(text: str, bold: bool = False, italic: bool = False, underline: bool = False) -> str:
    """Wrap *text* in markdown/HTML emphasis markers.

    Leading and trailing whitespace is kept *outside* the markers so the
    styling hugs the visible characters. Whitespace-only input is returned
    unchanged. Styles nest bold-innermost: bold, then italic, then underline.
    """
    if not text.strip():
        return text

    # Split into (leading ws, core, trailing ws); the pattern always matches.
    parts = re.match(r"(^\s*)(.*?)(\s*$)", text, re.DOTALL)
    lead, core, trail = parts.groups() if parts else ("", text, "")

    wrappers = (
        (bold, "**", "**"),
        (italic, "*", "*"),
        (underline, "<u>", "</u>"),
    )
    for enabled, open_tag, close_tag in wrappers:
        if enabled:
            core = open_tag + core + close_tag

    return lead + core + trail
877
+
878
+
879
def is_strong(font):
    """True when the run's font reads as strong: bold, or colored with
    the theme DARK_1/DARK_2 scheme colors."""
    if font.bold:
        return True
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    return font.color.theme_color in (MSO_THEME_COLOR.DARK_1, MSO_THEME_COLOR.DARK_2)
887
+
888
+
889
def convert_stream_with_libreoffice(
    file_stream: io.BytesIO,
    input_extension: str,
    output_format: str,
) -> Union[io.BytesIO, List[io.BytesIO]]:
    """
    Convert a file stream (DOCX or PPTX) to a PDF stream or a list of PNG
    streams using LibreOffice in a temporary directory.

    Parameters
    ----------
    file_stream : io.BytesIO
        Source document bytes; read from its current position.
    input_extension : str
        Extension of the input document (e.g. "docx", "pptx").
    output_format : str
        Either "pdf" or "png".

    Returns
    -------
    io.BytesIO or list of io.BytesIO
        A single PDF stream, or one PNG stream per page. All returned
        streams are rewound to offset 0, ready for reading.

    Raises
    ------
    ValueError
        If *output_format* is unsupported.
    RuntimeError
        If LibreOffice did not produce a PDF.
    subprocess.CalledProcessError
        If the LibreOffice process exits non-zero.
    """
    if output_format not in {"pdf", "png"}:
        raise ValueError(f"Unsupported output format for LibreOffice conversion: {output_format}")

    with tempfile.TemporaryDirectory() as temp_dir:
        input_path = os.path.join(temp_dir, f"input.{input_extension}")
        with open(input_path, "wb") as f:
            f.write(file_stream.read())

        # We always convert to PDF first using LibreOffice.
        # Direct conversion to image formats (e.g. --convert-to png) in LibreOffice
        # often only exports the first page/slide or lacks control over resolution.
        # Converting to PDF preserves multi-page structure and layout fidelity.
        command = [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            input_path,
            "--outdir",
            temp_dir,
        ]

        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
        )

        pdf_path = os.path.join(temp_dir, "input.pdf")
        if not os.path.exists(pdf_path):
            raise RuntimeError("LibreOffice conversion failed.")

        if output_format == "pdf":
            with open(pdf_path, "rb") as f:
                return io.BytesIO(f.read())

        # output_format == "png":
        # We use pdfium to rasterize the PDF into images. This provides:
        # 1. Support for multi-page documents (LibreOffice image export is
        #    often single-page).
        # 2. Consistent rendering appearance matching the PDF output.
        image_streams = []
        pdf_document = pdfium.PdfDocument(pdf_path)
        try:
            for i in range(len(pdf_document)):
                page = pdf_document[i]
                bitmap = page.render(scale=1)
                pil_image = bitmap.to_pil()
                buffered = io.BytesIO()
                pil_image.save(buffered, format=output_format)
                # BUGFIX: PIL leaves the stream positioned at EOF; rewind so
                # callers reading the returned streams actually get the bytes
                # (the PDF branch already returns a stream at offset 0).
                buffered.seek(0)
                image_streams.append(buffered)
        finally:
            # Release the native pdfium handle even if rendering fails.
            pdf_document.close()
        return image_streams
949
+
950
+
951
+ def _get_ole_extension(prog_id: str) -> str:
952
+ """
953
+ Map OLE prog_id to a likely file extension for LibreOffice conversion.
954
+ """
955
+ if not prog_id:
956
+ return "bin"
957
+
958
+ pid = prog_id.lower()
959
+ if "excel" in pid or "sheet" in pid:
960
+ return "xlsx"
961
+ if "word" in pid:
962
+ return "docx"
963
+ if "powerpoint" in pid or "show" in pid or "presentation" in pid:
964
+ return "pptx"
965
+ if "acrobat" in pid or "pdf" in pid:
966
+ return "pdf"
967
+
968
+ return "bin"