nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
- nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
- nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
- nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +214 -188
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
- nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
- nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
- nv_ingest_api/internal/store/image_upload.py +1 -0
- nv_ingest_api/internal/transform/embed_text.py +75 -52
- nv_ingest_api/internal/transform/split_text.py +9 -3
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/exception_handlers/converters.py +1 -1
- nv_ingest_api/util/exception_handlers/decorators.py +309 -51
- nv_ingest_api/util/image_processing/processing.py +1 -1
- nv_ingest_api/util/logging/configuration.py +15 -8
- nv_ingest_api/util/pdf/pdfium.py +2 -2
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
- nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +430 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/RECORD +46 -41
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/WHEEL +1 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/top_level.txt +0 -0
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
|
|
18
18
|
import logging
|
|
19
19
|
import io
|
|
20
|
-
import operator
|
|
21
20
|
import re
|
|
22
21
|
import uuid
|
|
23
22
|
from collections import defaultdict
|
|
@@ -28,9 +27,9 @@ from typing import Optional
|
|
|
28
27
|
import pandas as pd
|
|
29
28
|
from pptx import Presentation
|
|
30
29
|
from pptx.enum.dml import MSO_COLOR_TYPE
|
|
31
|
-
from pptx.enum.dml import MSO_THEME_COLOR
|
|
30
|
+
from pptx.enum.dml import MSO_THEME_COLOR # noqa
|
|
32
31
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
33
|
-
from pptx.enum.shapes import PP_PLACEHOLDER
|
|
32
|
+
from pptx.enum.shapes import PP_PLACEHOLDER # noqa
|
|
34
33
|
from pptx.shapes.autoshape import Shape
|
|
35
34
|
from pptx.slide import Slide
|
|
36
35
|
|
|
@@ -106,7 +105,7 @@ def _finalize_images(
|
|
|
106
105
|
logger.warning(f"Unable to process shape image: {e}")
|
|
107
106
|
|
|
108
107
|
# If you want table/chart detection for these images, do it now
|
|
109
|
-
# (similar to docx approach). This might use your YOLO or
|
|
108
|
+
# (similar to docx approach). This might use your YOLO or another method:
|
|
110
109
|
detection_map = defaultdict(list) # image_idx -> list of CroppedImageWithContent
|
|
111
110
|
if extract_tables or extract_charts:
|
|
112
111
|
try:
|
|
@@ -155,6 +154,12 @@ def _finalize_images(
|
|
|
155
154
|
extracted_data.append(image_entry)
|
|
156
155
|
|
|
157
156
|
|
|
157
|
+
def _safe_position(shape):
|
|
158
|
+
top = shape.top if shape.top is not None else float("inf")
|
|
159
|
+
left = shape.left if shape.left is not None else float("inf")
|
|
160
|
+
return (top, left)
|
|
161
|
+
|
|
162
|
+
|
|
158
163
|
# -----------------------------------------------------------------------------
|
|
159
164
|
# Helper Function: Recursive Image Extraction
|
|
160
165
|
# -----------------------------------------------------------------------------
|
|
@@ -215,20 +220,13 @@ def python_pptx(
|
|
|
215
220
|
extraction_config: dict,
|
|
216
221
|
execution_trace_log: Optional[List] = None,
|
|
217
222
|
):
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
classification into tables/charts if requested.
|
|
221
|
-
"""
|
|
222
|
-
|
|
223
|
-
_ = extract_infographics # Placeholder for future use
|
|
224
|
-
_ = execution_trace_log # Placeholder for future use
|
|
223
|
+
_ = extract_infographics
|
|
224
|
+
_ = execution_trace_log
|
|
225
225
|
|
|
226
226
|
row_data = extraction_config.get("row_data")
|
|
227
227
|
source_id = row_data["source_id"]
|
|
228
228
|
|
|
229
|
-
text_depth = extraction_config.get("text_depth", "page")
|
|
230
|
-
text_depth = TextTypeEnum[text_depth.upper()]
|
|
231
|
-
|
|
229
|
+
text_depth = TextTypeEnum[extraction_config.get("text_depth", "page").upper()]
|
|
232
230
|
paragraph_format = extraction_config.get("paragraph_format", "markdown")
|
|
233
231
|
identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)
|
|
234
232
|
|
|
@@ -236,16 +234,19 @@ def python_pptx(
|
|
|
236
234
|
pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
|
|
237
235
|
trace_info = extraction_config.get("trace_info", {})
|
|
238
236
|
|
|
239
|
-
base_unified_metadata = row_data
|
|
237
|
+
base_unified_metadata = row_data.get(metadata_col, {})
|
|
240
238
|
base_source_metadata = base_unified_metadata.get("source_metadata", {})
|
|
241
239
|
source_location = base_source_metadata.get("source_location", "")
|
|
242
240
|
collection_id = base_source_metadata.get("collection_id", "")
|
|
243
241
|
partition_id = base_source_metadata.get("partition_id", -1)
|
|
244
242
|
access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
|
|
245
243
|
|
|
246
|
-
|
|
244
|
+
try:
|
|
245
|
+
presentation = Presentation(pptx_stream)
|
|
246
|
+
except Exception as e:
|
|
247
|
+
logger.error("Failed to open PPTX presentation: %s", e)
|
|
248
|
+
return []
|
|
247
249
|
|
|
248
|
-
# Collect source metadata from the core properties of the document.
|
|
249
250
|
last_modified = (
|
|
250
251
|
presentation.core_properties.modified.isoformat()
|
|
251
252
|
if presentation.core_properties.modified
|
|
@@ -257,12 +258,11 @@ def python_pptx(
|
|
|
257
258
|
else datetime.now().isoformat()
|
|
258
259
|
)
|
|
259
260
|
keywords = presentation.core_properties.keywords
|
|
260
|
-
source_type = DocumentTypeEnum.PPTX
|
|
261
261
|
source_metadata = {
|
|
262
|
-
"source_name": source_id,
|
|
262
|
+
"source_name": source_id,
|
|
263
263
|
"source_id": source_id,
|
|
264
264
|
"source_location": source_location,
|
|
265
|
-
"source_type":
|
|
265
|
+
"source_type": DocumentTypeEnum.PPTX,
|
|
266
266
|
"collection_id": collection_id,
|
|
267
267
|
"date_created": date_created,
|
|
268
268
|
"last_modified": last_modified,
|
|
@@ -272,18 +272,16 @@ def python_pptx(
|
|
|
272
272
|
}
|
|
273
273
|
|
|
274
274
|
slide_count = len(presentation.slides)
|
|
275
|
-
|
|
276
275
|
accumulated_text = []
|
|
277
276
|
extracted_data = []
|
|
278
|
-
|
|
279
|
-
# Hold images here for final classification.
|
|
280
|
-
# Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
|
|
281
|
-
# base_unified_metadata)
|
|
282
277
|
pending_images = []
|
|
283
278
|
|
|
284
279
|
for slide_idx, slide in enumerate(presentation.slides):
|
|
285
|
-
|
|
286
|
-
|
|
280
|
+
try:
|
|
281
|
+
shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
|
|
282
|
+
except Exception as e:
|
|
283
|
+
logger.error("Slide %d: Failed to ungroup or sort shapes: %s", slide_idx, e)
|
|
284
|
+
continue
|
|
287
285
|
|
|
288
286
|
page_nearby_blocks = {
|
|
289
287
|
"text": {"content": [], "bbox": []},
|
|
@@ -292,152 +290,179 @@ def python_pptx(
|
|
|
292
290
|
}
|
|
293
291
|
|
|
294
292
|
for shape_idx, shape in enumerate(shapes):
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
# ---------------------------------------------
|
|
299
|
-
# 1) Text Extraction
|
|
300
|
-
# ---------------------------------------------
|
|
301
|
-
if extract_text and shape.has_text_frame:
|
|
302
|
-
for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
|
|
303
|
-
if not paragraph.text.strip():
|
|
304
|
-
continue
|
|
305
|
-
|
|
306
|
-
for run_idx, run in enumerate(paragraph.runs):
|
|
307
|
-
text = run.text
|
|
308
|
-
if not text:
|
|
309
|
-
continue
|
|
293
|
+
try:
|
|
294
|
+
block_text = []
|
|
295
|
+
added_title = added_subtitle = False
|
|
310
296
|
|
|
311
|
-
|
|
297
|
+
# Text extraction
|
|
298
|
+
if extract_text and shape.has_text_frame:
|
|
299
|
+
for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
|
|
300
|
+
if not paragraph.text.strip():
|
|
301
|
+
continue
|
|
312
302
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
added_title = True
|
|
318
|
-
else:
|
|
319
|
-
continue
|
|
320
|
-
elif is_subtitle(shape):
|
|
321
|
-
if not added_subtitle:
|
|
322
|
-
text = process_subtitle(shape)
|
|
323
|
-
added_subtitle = True
|
|
324
|
-
else:
|
|
303
|
+
for run_idx, run in enumerate(paragraph.runs):
|
|
304
|
+
try:
|
|
305
|
+
text = run.text
|
|
306
|
+
if not text:
|
|
325
307
|
continue
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
if
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
308
|
+
|
|
309
|
+
text = escape_text(text)
|
|
310
|
+
|
|
311
|
+
if paragraph_format == "markdown":
|
|
312
|
+
if is_title(shape) and not added_title:
|
|
313
|
+
text = process_title(shape)
|
|
314
|
+
added_title = True
|
|
315
|
+
elif is_subtitle(shape) and not added_subtitle:
|
|
316
|
+
text = process_subtitle(shape)
|
|
317
|
+
added_subtitle = True
|
|
318
|
+
elif is_title(shape) or is_subtitle(shape):
|
|
319
|
+
continue # already added
|
|
320
|
+
|
|
321
|
+
if run.hyperlink and run.hyperlink.address:
|
|
322
|
+
text = get_hyperlink(text, run.hyperlink.address)
|
|
323
|
+
if is_accent(paragraph.font) or is_accent(run.font):
|
|
324
|
+
text = format_text(text, italic=True)
|
|
325
|
+
elif is_strong(paragraph.font) or is_strong(run.font):
|
|
326
|
+
text = format_text(text, bold=True)
|
|
327
|
+
elif is_underlined(paragraph.font) or is_underlined(run.font):
|
|
328
|
+
text = format_text(text, underline=True)
|
|
329
|
+
if is_list_block(shape):
|
|
330
|
+
text = " " * paragraph.level + "* " + text
|
|
331
|
+
|
|
332
|
+
accumulated_text.append(text)
|
|
333
|
+
if extract_images and identify_nearby_objects:
|
|
334
|
+
block_text.append(text)
|
|
335
|
+
|
|
336
|
+
if text_depth == TextTypeEnum.SPAN:
|
|
337
|
+
extracted_data.append(
|
|
338
|
+
_construct_text_metadata(
|
|
339
|
+
presentation,
|
|
340
|
+
shape,
|
|
341
|
+
accumulated_text,
|
|
342
|
+
keywords,
|
|
343
|
+
slide_idx,
|
|
344
|
+
shape_idx,
|
|
345
|
+
paragraph_idx,
|
|
346
|
+
run_idx,
|
|
347
|
+
slide_count,
|
|
348
|
+
text_depth,
|
|
349
|
+
source_metadata,
|
|
350
|
+
base_unified_metadata,
|
|
351
|
+
)
|
|
352
|
+
)
|
|
353
|
+
accumulated_text = []
|
|
354
|
+
|
|
355
|
+
except Exception as e:
|
|
356
|
+
logger.warning(
|
|
357
|
+
"Slide %d Shape %d Run %d: Failed to process run: %s",
|
|
358
|
+
slide_idx,
|
|
359
|
+
shape_idx,
|
|
360
|
+
run_idx,
|
|
361
|
+
e,
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
|
|
365
|
+
accumulated_text.append("\n\n")
|
|
366
|
+
|
|
367
|
+
if text_depth == TextTypeEnum.LINE:
|
|
368
|
+
extracted_data.append(
|
|
369
|
+
_construct_text_metadata(
|
|
370
|
+
presentation,
|
|
371
|
+
shape,
|
|
372
|
+
accumulated_text,
|
|
373
|
+
keywords,
|
|
374
|
+
slide_idx,
|
|
375
|
+
shape_idx,
|
|
376
|
+
paragraph_idx,
|
|
377
|
+
-1,
|
|
378
|
+
slide_count,
|
|
379
|
+
text_depth,
|
|
380
|
+
source_metadata,
|
|
381
|
+
base_unified_metadata,
|
|
382
|
+
)
|
|
383
|
+
)
|
|
384
|
+
accumulated_text = []
|
|
385
|
+
|
|
386
|
+
if text_depth == TextTypeEnum.BLOCK:
|
|
387
|
+
extracted_data.append(
|
|
388
|
+
_construct_text_metadata(
|
|
347
389
|
presentation,
|
|
348
390
|
shape,
|
|
349
391
|
accumulated_text,
|
|
350
392
|
keywords,
|
|
351
393
|
slide_idx,
|
|
352
394
|
shape_idx,
|
|
353
|
-
|
|
354
|
-
|
|
395
|
+
-1,
|
|
396
|
+
-1,
|
|
355
397
|
slide_count,
|
|
356
398
|
text_depth,
|
|
357
399
|
source_metadata,
|
|
358
400
|
base_unified_metadata,
|
|
359
401
|
)
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
accumulated_text = []
|
|
402
|
+
)
|
|
403
|
+
accumulated_text = []
|
|
363
404
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
405
|
+
if extract_images and identify_nearby_objects and block_text:
|
|
406
|
+
page_nearby_blocks["text"]["content"].append("".join(block_text))
|
|
407
|
+
page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
|
|
367
408
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
409
|
+
# Image processing (deferred)
|
|
410
|
+
if extract_images or extract_tables or extract_charts:
|
|
411
|
+
try:
|
|
412
|
+
process_shape(
|
|
371
413
|
shape,
|
|
372
|
-
accumulated_text,
|
|
373
|
-
keywords,
|
|
374
|
-
slide_idx,
|
|
375
414
|
shape_idx,
|
|
376
|
-
|
|
377
|
-
-1,
|
|
415
|
+
slide_idx,
|
|
378
416
|
slide_count,
|
|
379
|
-
|
|
417
|
+
pending_images,
|
|
418
|
+
page_nearby_blocks,
|
|
380
419
|
source_metadata,
|
|
381
420
|
base_unified_metadata,
|
|
382
421
|
)
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
422
|
+
except Exception as e:
|
|
423
|
+
logger.warning("Slide %d Shape %d: Failed to process image shape: %s", slide_idx, shape_idx, e)
|
|
424
|
+
|
|
425
|
+
# Table extraction
|
|
426
|
+
if extract_tables and shape.has_table:
|
|
427
|
+
try:
|
|
428
|
+
extracted_data.append(
|
|
429
|
+
_construct_table_metadata(
|
|
430
|
+
shape, slide_idx, slide_count, source_metadata, base_unified_metadata
|
|
431
|
+
)
|
|
432
|
+
)
|
|
433
|
+
except Exception as e:
|
|
434
|
+
logger.warning("Slide %d Shape %d: Failed to extract table: %s", slide_idx, shape_idx, e)
|
|
386
435
|
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
slide_count,
|
|
398
|
-
text_depth,
|
|
399
|
-
source_metadata,
|
|
400
|
-
base_unified_metadata,
|
|
401
|
-
)
|
|
402
|
-
if len(text_extraction) > 0:
|
|
403
|
-
extracted_data.append(text_extraction)
|
|
404
|
-
accumulated_text = []
|
|
405
|
-
|
|
406
|
-
if extract_images and identify_nearby_objects and block_text:
|
|
407
|
-
page_nearby_blocks["text"]["content"].append("".join(block_text))
|
|
408
|
-
page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
|
|
409
|
-
|
|
410
|
-
# ---------------------------------------------
|
|
411
|
-
# 2) Image Handling (DEFERRED) with nested/group shapes
|
|
412
|
-
# ---------------------------------------------
|
|
413
|
-
if extract_images:
|
|
414
|
-
process_shape(
|
|
415
|
-
shape,
|
|
416
|
-
shape_idx,
|
|
436
|
+
except Exception as e:
|
|
437
|
+
logger.warning("Slide %d Shape %d: Top-level failure: %s", slide_idx, shape_idx, e)
|
|
438
|
+
|
|
439
|
+
if extract_text and text_depth == TextTypeEnum.PAGE and accumulated_text:
|
|
440
|
+
extracted_data.append(
|
|
441
|
+
_construct_text_metadata(
|
|
442
|
+
presentation,
|
|
443
|
+
None,
|
|
444
|
+
accumulated_text,
|
|
445
|
+
keywords,
|
|
417
446
|
slide_idx,
|
|
447
|
+
-1,
|
|
448
|
+
-1,
|
|
449
|
+
-1,
|
|
418
450
|
slide_count,
|
|
419
|
-
|
|
420
|
-
page_nearby_blocks,
|
|
451
|
+
text_depth,
|
|
421
452
|
source_metadata,
|
|
422
453
|
base_unified_metadata,
|
|
423
454
|
)
|
|
455
|
+
)
|
|
456
|
+
accumulated_text = []
|
|
424
457
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
if extract_tables and shape.has_table:
|
|
429
|
-
table_extraction = _construct_table_metadata(
|
|
430
|
-
shape, slide_idx, slide_count, source_metadata, base_unified_metadata
|
|
431
|
-
)
|
|
432
|
-
extracted_data.append(table_extraction)
|
|
433
|
-
|
|
434
|
-
if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
|
|
435
|
-
text_extraction = _construct_text_metadata(
|
|
458
|
+
if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
|
|
459
|
+
extracted_data.append(
|
|
460
|
+
_construct_text_metadata(
|
|
436
461
|
presentation,
|
|
437
|
-
|
|
462
|
+
None,
|
|
438
463
|
accumulated_text,
|
|
439
464
|
keywords,
|
|
440
|
-
|
|
465
|
+
-1,
|
|
441
466
|
-1,
|
|
442
467
|
-1,
|
|
443
468
|
-1,
|
|
@@ -446,41 +471,20 @@ def python_pptx(
|
|
|
446
471
|
source_metadata,
|
|
447
472
|
base_unified_metadata,
|
|
448
473
|
)
|
|
449
|
-
if len(text_extraction) > 0:
|
|
450
|
-
extracted_data.append(text_extraction)
|
|
451
|
-
accumulated_text = []
|
|
452
|
-
|
|
453
|
-
if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
|
|
454
|
-
text_extraction = _construct_text_metadata(
|
|
455
|
-
presentation,
|
|
456
|
-
shape, # may pass None
|
|
457
|
-
accumulated_text,
|
|
458
|
-
keywords,
|
|
459
|
-
-1,
|
|
460
|
-
-1,
|
|
461
|
-
-1,
|
|
462
|
-
-1,
|
|
463
|
-
slide_count,
|
|
464
|
-
text_depth,
|
|
465
|
-
source_metadata,
|
|
466
|
-
base_unified_metadata,
|
|
467
474
|
)
|
|
468
|
-
if len(text_extraction) > 0:
|
|
469
|
-
extracted_data.append(text_extraction)
|
|
470
|
-
accumulated_text = []
|
|
471
475
|
|
|
472
|
-
# ---------------------------------------------
|
|
473
|
-
# FINAL STEP: Finalize images (and tables/charts)
|
|
474
|
-
# ---------------------------------------------
|
|
475
476
|
if extract_images or extract_tables or extract_charts:
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
477
|
+
try:
|
|
478
|
+
_finalize_images(
|
|
479
|
+
pending_images,
|
|
480
|
+
extracted_data,
|
|
481
|
+
pptx_extractor_config,
|
|
482
|
+
extract_tables=extract_tables,
|
|
483
|
+
extract_charts=extract_charts,
|
|
484
|
+
trace_info=trace_info,
|
|
485
|
+
)
|
|
486
|
+
except Exception as e:
|
|
487
|
+
logger.error("Finalization of images failed: %s", e)
|
|
484
488
|
|
|
485
489
|
return extracted_data
|
|
486
490
|
|
|
@@ -656,21 +660,43 @@ def get_bbox(
|
|
|
656
660
|
shape_object: Optional[Slide] = None,
|
|
657
661
|
text_depth: Optional[TextTypeEnum] = None,
|
|
658
662
|
):
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
663
|
+
"""
|
|
664
|
+
Safely computes bounding box for a slide, shape, or document.
|
|
665
|
+
Ensures that missing or None values are gracefully handled.
|
|
666
|
+
|
|
667
|
+
Returns
|
|
668
|
+
-------
|
|
669
|
+
Tuple[int, int, int, int]
|
|
670
|
+
Bounding box as (top, left, bottom, right).
|
|
671
|
+
Defaults to (-1, -1, -1, -1) if invalid or unsupported.
|
|
672
|
+
"""
|
|
673
|
+
try:
|
|
674
|
+
if text_depth == TextTypeEnum.DOCUMENT:
|
|
675
|
+
return (-1, -1, -1, -1)
|
|
676
|
+
|
|
677
|
+
elif text_depth == TextTypeEnum.PAGE and presentation_object:
|
|
678
|
+
top = left = 0
|
|
679
|
+
width = presentation_object.slide_width
|
|
680
|
+
height = presentation_object.slide_height
|
|
681
|
+
return (top, left, top + height, left + width)
|
|
682
|
+
|
|
683
|
+
elif shape_object:
|
|
684
|
+
top = shape_object.top if shape_object.top is not None else -1
|
|
685
|
+
left = shape_object.left if shape_object.left is not None else -1
|
|
686
|
+
width = shape_object.width if shape_object.width is not None else -1
|
|
687
|
+
height = shape_object.height if shape_object.height is not None else -1
|
|
688
|
+
|
|
689
|
+
# If all are valid, return normally, else return placeholder
|
|
690
|
+
if -1 in [top, left, width, height]:
|
|
691
|
+
return (-1, -1, -1, -1)
|
|
692
|
+
|
|
693
|
+
return (top, left, top + height, left + width)
|
|
694
|
+
|
|
695
|
+
except Exception as e:
|
|
696
|
+
logger.warning(f"get_bbox: Failed to compute bbox due to {e}")
|
|
697
|
+
return (-1, -1, -1, -1)
|
|
698
|
+
|
|
699
|
+
return (-1, -1, -1, -1)
|
|
674
700
|
|
|
675
701
|
|
|
676
702
|
def ungroup_shapes(shapes):
|
|
@@ -99,14 +99,11 @@ def _decode_and_extract_from_pptx(
|
|
|
99
99
|
|
|
100
100
|
# Retrieve extraction parameters (and remove boolean flags as they are consumed).
|
|
101
101
|
extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
extract_infographics: bool = extract_params.pop("extract_infographics", False)
|
|
108
|
-
except KeyError as e:
|
|
109
|
-
raise ValueError(f"Missing required extraction flag: {e}")
|
|
102
|
+
extract_text: bool = extract_params.pop("extract_text", False)
|
|
103
|
+
extract_images: bool = extract_params.pop("extract_images", False)
|
|
104
|
+
extract_tables: bool = extract_params.pop("extract_tables", False)
|
|
105
|
+
extract_charts: bool = extract_params.pop("extract_charts", False)
|
|
106
|
+
extract_infographics: bool = extract_params.pop("extract_infographics", False)
|
|
110
107
|
|
|
111
108
|
# Inject additional configuration and trace information.
|
|
112
109
|
if getattr(extraction_config, "pptx_extraction_config", None) is not None:
|
|
@@ -184,4 +181,4 @@ def extract_primitives_from_pptx_internal(
|
|
|
184
181
|
else:
|
|
185
182
|
extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
|
|
186
183
|
|
|
187
|
-
return extracted_df
|
|
184
|
+
return extracted_df, {}
|