nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (46)
  1. nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
  2. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
  3. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  4. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  5. nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
  6. nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
  7. nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
  8. nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
  9. nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
  10. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
  11. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
  12. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +214 -188
  13. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
  14. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
  15. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
  16. nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
  17. nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
  18. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
  19. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  20. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
  21. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
  22. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
  23. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
  24. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
  25. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
  26. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
  27. nv_ingest_api/internal/store/image_upload.py +1 -0
  28. nv_ingest_api/internal/transform/embed_text.py +75 -52
  29. nv_ingest_api/internal/transform/split_text.py +9 -3
  30. nv_ingest_api/util/__init__.py +3 -0
  31. nv_ingest_api/util/exception_handlers/converters.py +1 -1
  32. nv_ingest_api/util/exception_handlers/decorators.py +309 -51
  33. nv_ingest_api/util/image_processing/processing.py +1 -1
  34. nv_ingest_api/util/logging/configuration.py +15 -8
  35. nv_ingest_api/util/pdf/pdfium.py +2 -2
  36. nv_ingest_api/util/schema/__init__.py +3 -0
  37. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  38. nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
  39. nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
  40. nv_ingest_api/util/system/__init__.py +0 -0
  41. nv_ingest_api/util/system/hardware_info.py +430 -0
  42. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/METADATA +2 -1
  43. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/RECORD +46 -41
  44. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/WHEEL +1 -1
  45. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/licenses/LICENSE +0 -0
  46. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,6 @@
17
17
 
18
18
  import logging
19
19
  import io
20
- import operator
21
20
  import re
22
21
  import uuid
23
22
  from collections import defaultdict
@@ -28,9 +27,9 @@ from typing import Optional
28
27
  import pandas as pd
29
28
  from pptx import Presentation
30
29
  from pptx.enum.dml import MSO_COLOR_TYPE
31
- from pptx.enum.dml import MSO_THEME_COLOR
30
+ from pptx.enum.dml import MSO_THEME_COLOR # noqa
32
31
  from pptx.enum.shapes import MSO_SHAPE_TYPE
33
- from pptx.enum.shapes import PP_PLACEHOLDER
32
+ from pptx.enum.shapes import PP_PLACEHOLDER # noqa
34
33
  from pptx.shapes.autoshape import Shape
35
34
  from pptx.slide import Slide
36
35
 
@@ -106,7 +105,7 @@ def _finalize_images(
106
105
  logger.warning(f"Unable to process shape image: {e}")
107
106
 
108
107
  # If you want table/chart detection for these images, do it now
109
- # (similar to docx approach). This might use your YOLO or other method:
108
+ # (similar to docx approach). This might use your YOLO or another method:
110
109
  detection_map = defaultdict(list) # image_idx -> list of CroppedImageWithContent
111
110
  if extract_tables or extract_charts:
112
111
  try:
@@ -155,6 +154,12 @@ def _finalize_images(
155
154
  extracted_data.append(image_entry)
156
155
 
157
156
 
157
+ def _safe_position(shape):
158
+ top = shape.top if shape.top is not None else float("inf")
159
+ left = shape.left if shape.left is not None else float("inf")
160
+ return (top, left)
161
+
162
+
158
163
  # -----------------------------------------------------------------------------
159
164
  # Helper Function: Recursive Image Extraction
160
165
  # -----------------------------------------------------------------------------
@@ -215,20 +220,13 @@ def python_pptx(
215
220
  extraction_config: dict,
216
221
  execution_trace_log: Optional[List] = None,
217
222
  ):
218
- """
219
- Uses python-pptx to extract text from a PPTX bytestream, while deferring image
220
- classification into tables/charts if requested.
221
- """
222
-
223
- _ = extract_infographics # Placeholder for future use
224
- _ = execution_trace_log # Placeholder for future use
223
+ _ = extract_infographics
224
+ _ = execution_trace_log
225
225
 
226
226
  row_data = extraction_config.get("row_data")
227
227
  source_id = row_data["source_id"]
228
228
 
229
- text_depth = extraction_config.get("text_depth", "page")
230
- text_depth = TextTypeEnum[text_depth.upper()]
231
-
229
+ text_depth = TextTypeEnum[extraction_config.get("text_depth", "page").upper()]
232
230
  paragraph_format = extraction_config.get("paragraph_format", "markdown")
233
231
  identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)
234
232
 
@@ -236,16 +234,19 @@ def python_pptx(
236
234
  pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
237
235
  trace_info = extraction_config.get("trace_info", {})
238
236
 
239
- base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
237
+ base_unified_metadata = row_data.get(metadata_col, {})
240
238
  base_source_metadata = base_unified_metadata.get("source_metadata", {})
241
239
  source_location = base_source_metadata.get("source_location", "")
242
240
  collection_id = base_source_metadata.get("collection_id", "")
243
241
  partition_id = base_source_metadata.get("partition_id", -1)
244
242
  access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
245
243
 
246
- presentation = Presentation(pptx_stream)
244
+ try:
245
+ presentation = Presentation(pptx_stream)
246
+ except Exception as e:
247
+ logger.error("Failed to open PPTX presentation: %s", e)
248
+ return []
247
249
 
248
- # Collect source metadata from the core properties of the document.
249
250
  last_modified = (
250
251
  presentation.core_properties.modified.isoformat()
251
252
  if presentation.core_properties.modified
@@ -257,12 +258,11 @@ def python_pptx(
257
258
  else datetime.now().isoformat()
258
259
  )
259
260
  keywords = presentation.core_properties.keywords
260
- source_type = DocumentTypeEnum.PPTX
261
261
  source_metadata = {
262
- "source_name": source_id, # python-pptx doesn't maintain filename; re-use source_id
262
+ "source_name": source_id,
263
263
  "source_id": source_id,
264
264
  "source_location": source_location,
265
- "source_type": source_type,
265
+ "source_type": DocumentTypeEnum.PPTX,
266
266
  "collection_id": collection_id,
267
267
  "date_created": date_created,
268
268
  "last_modified": last_modified,
@@ -272,18 +272,16 @@ def python_pptx(
272
272
  }
273
273
 
274
274
  slide_count = len(presentation.slides)
275
-
276
275
  accumulated_text = []
277
276
  extracted_data = []
278
-
279
- # Hold images here for final classification.
280
- # Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
281
- # base_unified_metadata)
282
277
  pending_images = []
283
278
 
284
279
  for slide_idx, slide in enumerate(presentation.slides):
285
- # Obtain a flat list of shapes (ungrouped) sorted by top then left.
286
- shapes = sorted(ungroup_shapes(slide.shapes), key=operator.attrgetter("top", "left"))
280
+ try:
281
+ shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
282
+ except Exception as e:
283
+ logger.error("Slide %d: Failed to ungroup or sort shapes: %s", slide_idx, e)
284
+ continue
287
285
 
288
286
  page_nearby_blocks = {
289
287
  "text": {"content": [], "bbox": []},
@@ -292,152 +290,179 @@ def python_pptx(
292
290
  }
293
291
 
294
292
  for shape_idx, shape in enumerate(shapes):
295
- block_text = []
296
- added_title = added_subtitle = False
297
-
298
- # ---------------------------------------------
299
- # 1) Text Extraction
300
- # ---------------------------------------------
301
- if extract_text and shape.has_text_frame:
302
- for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
303
- if not paragraph.text.strip():
304
- continue
305
-
306
- for run_idx, run in enumerate(paragraph.runs):
307
- text = run.text
308
- if not text:
309
- continue
293
+ try:
294
+ block_text = []
295
+ added_title = added_subtitle = False
310
296
 
311
- text = escape_text(text)
297
+ # Text extraction
298
+ if extract_text and shape.has_text_frame:
299
+ for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
300
+ if not paragraph.text.strip():
301
+ continue
312
302
 
313
- if paragraph_format == "markdown":
314
- if is_title(shape):
315
- if not added_title:
316
- text = process_title(shape)
317
- added_title = True
318
- else:
319
- continue
320
- elif is_subtitle(shape):
321
- if not added_subtitle:
322
- text = process_subtitle(shape)
323
- added_subtitle = True
324
- else:
303
+ for run_idx, run in enumerate(paragraph.runs):
304
+ try:
305
+ text = run.text
306
+ if not text:
325
307
  continue
326
- else:
327
- if run.hyperlink.address:
328
- text = get_hyperlink(text, run.hyperlink.address)
329
- if is_accent(paragraph.font) or is_accent(run.font):
330
- text = format_text(text, italic=True)
331
- elif is_strong(paragraph.font) or is_strong(run.font):
332
- text = format_text(text, bold=True)
333
- elif is_underlined(paragraph.font) or is_underlined(run.font):
334
- text = format_text(text, underline=True)
335
- if is_list_block(shape):
336
- text = " " * paragraph.level + "* " + text
337
-
338
- accumulated_text.append(text)
339
-
340
- # For "nearby objects", store block text.
341
- if extract_images and identify_nearby_objects:
342
- block_text.append(text)
343
-
344
- # If we only want text at SPAN level, flush after each run.
345
- if text_depth == TextTypeEnum.SPAN:
346
- text_extraction = _construct_text_metadata(
308
+
309
+ text = escape_text(text)
310
+
311
+ if paragraph_format == "markdown":
312
+ if is_title(shape) and not added_title:
313
+ text = process_title(shape)
314
+ added_title = True
315
+ elif is_subtitle(shape) and not added_subtitle:
316
+ text = process_subtitle(shape)
317
+ added_subtitle = True
318
+ elif is_title(shape) or is_subtitle(shape):
319
+ continue # already added
320
+
321
+ if run.hyperlink and run.hyperlink.address:
322
+ text = get_hyperlink(text, run.hyperlink.address)
323
+ if is_accent(paragraph.font) or is_accent(run.font):
324
+ text = format_text(text, italic=True)
325
+ elif is_strong(paragraph.font) or is_strong(run.font):
326
+ text = format_text(text, bold=True)
327
+ elif is_underlined(paragraph.font) or is_underlined(run.font):
328
+ text = format_text(text, underline=True)
329
+ if is_list_block(shape):
330
+ text = " " * paragraph.level + "* " + text
331
+
332
+ accumulated_text.append(text)
333
+ if extract_images and identify_nearby_objects:
334
+ block_text.append(text)
335
+
336
+ if text_depth == TextTypeEnum.SPAN:
337
+ extracted_data.append(
338
+ _construct_text_metadata(
339
+ presentation,
340
+ shape,
341
+ accumulated_text,
342
+ keywords,
343
+ slide_idx,
344
+ shape_idx,
345
+ paragraph_idx,
346
+ run_idx,
347
+ slide_count,
348
+ text_depth,
349
+ source_metadata,
350
+ base_unified_metadata,
351
+ )
352
+ )
353
+ accumulated_text = []
354
+
355
+ except Exception as e:
356
+ logger.warning(
357
+ "Slide %d Shape %d Run %d: Failed to process run: %s",
358
+ slide_idx,
359
+ shape_idx,
360
+ run_idx,
361
+ e,
362
+ )
363
+
364
+ if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
365
+ accumulated_text.append("\n\n")
366
+
367
+ if text_depth == TextTypeEnum.LINE:
368
+ extracted_data.append(
369
+ _construct_text_metadata(
370
+ presentation,
371
+ shape,
372
+ accumulated_text,
373
+ keywords,
374
+ slide_idx,
375
+ shape_idx,
376
+ paragraph_idx,
377
+ -1,
378
+ slide_count,
379
+ text_depth,
380
+ source_metadata,
381
+ base_unified_metadata,
382
+ )
383
+ )
384
+ accumulated_text = []
385
+
386
+ if text_depth == TextTypeEnum.BLOCK:
387
+ extracted_data.append(
388
+ _construct_text_metadata(
347
389
  presentation,
348
390
  shape,
349
391
  accumulated_text,
350
392
  keywords,
351
393
  slide_idx,
352
394
  shape_idx,
353
- paragraph_idx,
354
- run_idx,
395
+ -1,
396
+ -1,
355
397
  slide_count,
356
398
  text_depth,
357
399
  source_metadata,
358
400
  base_unified_metadata,
359
401
  )
360
- if len(text_extraction) > 0:
361
- extracted_data.append(text_extraction)
362
- accumulated_text = []
402
+ )
403
+ accumulated_text = []
363
404
 
364
- # Add newlines for separation at line/paragraph level.
365
- if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
366
- accumulated_text.append("\n\n")
405
+ if extract_images and identify_nearby_objects and block_text:
406
+ page_nearby_blocks["text"]["content"].append("".join(block_text))
407
+ page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
367
408
 
368
- if text_depth == TextTypeEnum.LINE:
369
- text_extraction = _construct_text_metadata(
370
- presentation,
409
+ # Image processing (deferred)
410
+ if extract_images or extract_tables or extract_charts:
411
+ try:
412
+ process_shape(
371
413
  shape,
372
- accumulated_text,
373
- keywords,
374
- slide_idx,
375
414
  shape_idx,
376
- paragraph_idx,
377
- -1,
415
+ slide_idx,
378
416
  slide_count,
379
- text_depth,
417
+ pending_images,
418
+ page_nearby_blocks,
380
419
  source_metadata,
381
420
  base_unified_metadata,
382
421
  )
383
- if len(text_extraction) > 0:
384
- extracted_data.append(text_extraction)
385
- accumulated_text = []
422
+ except Exception as e:
423
+ logger.warning("Slide %d Shape %d: Failed to process image shape: %s", slide_idx, shape_idx, e)
424
+
425
+ # Table extraction
426
+ if extract_tables and shape.has_table:
427
+ try:
428
+ extracted_data.append(
429
+ _construct_table_metadata(
430
+ shape, slide_idx, slide_count, source_metadata, base_unified_metadata
431
+ )
432
+ )
433
+ except Exception as e:
434
+ logger.warning("Slide %d Shape %d: Failed to extract table: %s", slide_idx, shape_idx, e)
386
435
 
387
- if text_depth == TextTypeEnum.BLOCK:
388
- text_extraction = _construct_text_metadata(
389
- presentation,
390
- shape,
391
- accumulated_text,
392
- keywords,
393
- slide_idx,
394
- shape_idx,
395
- -1,
396
- -1,
397
- slide_count,
398
- text_depth,
399
- source_metadata,
400
- base_unified_metadata,
401
- )
402
- if len(text_extraction) > 0:
403
- extracted_data.append(text_extraction)
404
- accumulated_text = []
405
-
406
- if extract_images and identify_nearby_objects and block_text:
407
- page_nearby_blocks["text"]["content"].append("".join(block_text))
408
- page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
409
-
410
- # ---------------------------------------------
411
- # 2) Image Handling (DEFERRED) with nested/group shapes
412
- # ---------------------------------------------
413
- if extract_images:
414
- process_shape(
415
- shape,
416
- shape_idx,
436
+ except Exception as e:
437
+ logger.warning("Slide %d Shape %d: Top-level failure: %s", slide_idx, shape_idx, e)
438
+
439
+ if extract_text and text_depth == TextTypeEnum.PAGE and accumulated_text:
440
+ extracted_data.append(
441
+ _construct_text_metadata(
442
+ presentation,
443
+ None,
444
+ accumulated_text,
445
+ keywords,
417
446
  slide_idx,
447
+ -1,
448
+ -1,
449
+ -1,
418
450
  slide_count,
419
- pending_images,
420
- page_nearby_blocks,
451
+ text_depth,
421
452
  source_metadata,
422
453
  base_unified_metadata,
423
454
  )
455
+ )
456
+ accumulated_text = []
424
457
 
425
- # ---------------------------------------------
426
- # 3) Table Handling
427
- # ---------------------------------------------
428
- if extract_tables and shape.has_table:
429
- table_extraction = _construct_table_metadata(
430
- shape, slide_idx, slide_count, source_metadata, base_unified_metadata
431
- )
432
- extracted_data.append(table_extraction)
433
-
434
- if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
435
- text_extraction = _construct_text_metadata(
458
+ if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
459
+ extracted_data.append(
460
+ _construct_text_metadata(
436
461
  presentation,
437
- shape, # may pass None if preferred
462
+ None,
438
463
  accumulated_text,
439
464
  keywords,
440
- slide_idx,
465
+ -1,
441
466
  -1,
442
467
  -1,
443
468
  -1,
@@ -446,41 +471,20 @@ def python_pptx(
446
471
  source_metadata,
447
472
  base_unified_metadata,
448
473
  )
449
- if len(text_extraction) > 0:
450
- extracted_data.append(text_extraction)
451
- accumulated_text = []
452
-
453
- if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
454
- text_extraction = _construct_text_metadata(
455
- presentation,
456
- shape, # may pass None
457
- accumulated_text,
458
- keywords,
459
- -1,
460
- -1,
461
- -1,
462
- -1,
463
- slide_count,
464
- text_depth,
465
- source_metadata,
466
- base_unified_metadata,
467
474
  )
468
- if len(text_extraction) > 0:
469
- extracted_data.append(text_extraction)
470
- accumulated_text = []
471
475
 
472
- # ---------------------------------------------
473
- # FINAL STEP: Finalize images (and tables/charts)
474
- # ---------------------------------------------
475
476
  if extract_images or extract_tables or extract_charts:
476
- _finalize_images(
477
- pending_images,
478
- extracted_data,
479
- pptx_extractor_config,
480
- extract_tables=extract_tables,
481
- extract_charts=extract_charts,
482
- trace_info=trace_info,
483
- )
477
+ try:
478
+ _finalize_images(
479
+ pending_images,
480
+ extracted_data,
481
+ pptx_extractor_config,
482
+ extract_tables=extract_tables,
483
+ extract_charts=extract_charts,
484
+ trace_info=trace_info,
485
+ )
486
+ except Exception as e:
487
+ logger.error("Finalization of images failed: %s", e)
484
488
 
485
489
  return extracted_data
486
490
 
@@ -656,21 +660,43 @@ def get_bbox(
656
660
  shape_object: Optional[Slide] = None,
657
661
  text_depth: Optional[TextTypeEnum] = None,
658
662
  ):
659
- bbox = (-1, -1, -1, -1)
660
- if text_depth == TextTypeEnum.DOCUMENT:
661
- bbox = (-1, -1, -1, -1)
662
- elif text_depth == TextTypeEnum.PAGE:
663
- top = left = 0
664
- width = presentation_object.slide_width
665
- height = presentation_object.slide_height
666
- bbox = (top, left, top + height, left + width)
667
- elif shape_object:
668
- top = shape_object.top
669
- left = shape_object.left
670
- width = shape_object.width
671
- height = shape_object.height
672
- bbox = (top, left, top + height, left + width)
673
- return bbox
663
+ """
664
+ Safely computes bounding box for a slide, shape, or document.
665
+ Ensures that missing or None values are gracefully handled.
666
+
667
+ Returns
668
+ -------
669
+ Tuple[int, int, int, int]
670
+ Bounding box as (top, left, bottom, right).
671
+ Defaults to (-1, -1, -1, -1) if invalid or unsupported.
672
+ """
673
+ try:
674
+ if text_depth == TextTypeEnum.DOCUMENT:
675
+ return (-1, -1, -1, -1)
676
+
677
+ elif text_depth == TextTypeEnum.PAGE and presentation_object:
678
+ top = left = 0
679
+ width = presentation_object.slide_width
680
+ height = presentation_object.slide_height
681
+ return (top, left, top + height, left + width)
682
+
683
+ elif shape_object:
684
+ top = shape_object.top if shape_object.top is not None else -1
685
+ left = shape_object.left if shape_object.left is not None else -1
686
+ width = shape_object.width if shape_object.width is not None else -1
687
+ height = shape_object.height if shape_object.height is not None else -1
688
+
689
+ # If all are valid, return normally, else return placeholder
690
+ if -1 in [top, left, width, height]:
691
+ return (-1, -1, -1, -1)
692
+
693
+ return (top, left, top + height, left + width)
694
+
695
+ except Exception as e:
696
+ logger.warning(f"get_bbox: Failed to compute bbox due to {e}")
697
+ return (-1, -1, -1, -1)
698
+
699
+ return (-1, -1, -1, -1)
674
700
 
675
701
 
676
702
  def ungroup_shapes(shapes):
@@ -99,14 +99,11 @@ def _decode_and_extract_from_pptx(
99
99
 
100
100
  # Retrieve extraction parameters (and remove boolean flags as they are consumed).
101
101
  extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
102
- try:
103
- extract_text: bool = extract_params.pop("extract_text", False)
104
- extract_images: bool = extract_params.pop("extract_images", False)
105
- extract_tables: bool = extract_params.pop("extract_tables", False)
106
- extract_charts: bool = extract_params.pop("extract_charts", False)
107
- extract_infographics: bool = extract_params.pop("extract_infographics", False)
108
- except KeyError as e:
109
- raise ValueError(f"Missing required extraction flag: {e}")
102
+ extract_text: bool = extract_params.pop("extract_text", False)
103
+ extract_images: bool = extract_params.pop("extract_images", False)
104
+ extract_tables: bool = extract_params.pop("extract_tables", False)
105
+ extract_charts: bool = extract_params.pop("extract_charts", False)
106
+ extract_infographics: bool = extract_params.pop("extract_infographics", False)
110
107
 
111
108
  # Inject additional configuration and trace information.
112
109
  if getattr(extraction_config, "pptx_extraction_config", None) is not None:
@@ -184,4 +181,4 @@ def extract_primitives_from_pptx_internal(
184
181
  else:
185
182
  extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
186
183
 
187
- return extracted_df
184
+ return extracted_df, {}