nv-ingest-api 2025.5.18.dev20250518__py3-none-any.whl → 2025.5.19.dev20250519__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -274,59 +274,70 @@ class DocxReader:
274
274
  - A list of extracted images from the paragraph.
275
275
  """
276
276
 
277
- paragraph_images = []
278
- if self.paragraph_format == "text":
279
- paragraph_text = paragraph.text
280
- else:
281
- # Get the default style of the paragraph, "markdown"
277
+ try:
278
+ paragraph_images = []
279
+ if self.paragraph_format == "text":
280
+ return paragraph.text.strip(), paragraph_images
281
+
282
282
  font = paragraph.style.font
283
283
  default_style = (font.bold, font.italic, font.underline)
284
284
 
285
- # Iterate over the runs of the paragraph and group them by style, excluding empty runs
286
285
  paragraph_text = ""
287
286
  group_text = ""
288
287
  previous_style = None
289
288
 
290
289
  for c in paragraph.iter_inner_content():
291
- if isinstance(c, Hyperlink):
292
- text = f"[{c.text}]({c.address})"
293
- style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
294
- elif isinstance(c, Run):
295
- text = c.text
296
- style = (c.bold, c.italic, c.underline)
297
- # 1. Locate the inline shape which is stored in the <w:drawing> element.
298
- # 2. r:embed in <a.blip> has the relationship id for extracting the file where
299
- # the image is stored as bytes.
300
- # Reference:
301
- # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
302
- inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
303
- for r_id in inline_shapes:
304
- text += self.image_tag.format(self.image_tag_index)
305
- self.image_tag_index += 1
306
- image = paragraph.part.related_parts[r_id].image
307
- paragraph_images.append(image)
308
- else:
309
- continue
310
-
311
- style = tuple([s if s is not None else d for s, d in zip(style, default_style)])
312
-
313
- # If the style changes for a non empty text, format the previous group and start a new one
314
- if (not self.is_text_empty(text)) and (previous_style is not None):
315
- if style != previous_style:
290
+ try:
291
+ if isinstance(c, Hyperlink):
292
+ text = f"[{c.text}]({c.address})"
293
+ style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
294
+ elif isinstance(c, Run):
295
+ text = c.text
296
+ style = (c.bold, c.italic, c.underline)
297
+
298
+ # 1. Locate the inline shape which is stored in the <w:drawing> element.
299
+ # 2. r:embed in <a.blip> has the relationship id for extracting the file where
300
+ # the image is stored as bytes.
301
+ # Reference:
302
+ # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
303
+ inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
304
+ for r_id in inline_shapes:
305
+ text += self.image_tag.format(self.image_tag_index)
306
+ self.image_tag_index += 1
307
+ try:
308
+ image = paragraph.part.related_parts[r_id].image
309
+ paragraph_images.append(image)
310
+ except Exception as img_e:
311
+ logger.warning(
312
+ "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
313
+ r_id,
314
+ img_e,
315
+ )
316
+ else:
317
+ continue
318
+
319
+ style = tuple(s if s is not None else d for s, d in zip(style, default_style))
320
+
321
+ if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
316
322
  paragraph_text += self.format_text(group_text, *previous_style)
317
323
  group_text = ""
318
324
 
319
- group_text += text
320
- if not self.is_text_empty(text):
321
- previous_style = style
325
+ group_text += text
326
+ if not self.is_text_empty(text):
327
+ previous_style = style
322
328
 
323
- # Format the last group
324
- if group_text:
325
- paragraph_text += self.format_text(group_text, *style)
329
+ except Exception as e:
330
+ logger.error("format_paragraph: failed to process run: %s", e)
331
+ continue
332
+
333
+ if group_text and previous_style:
334
+ paragraph_text += self.format_text(group_text, *previous_style)
335
+
336
+ return paragraph_text.strip(), paragraph_images
326
337
 
327
- # Remove trailing spaces
328
- paragraph_text = paragraph_text.strip()
329
- return paragraph_text, paragraph_images
338
+ except Exception as e:
339
+ logger.error("format_paragraph: failed for paragraph: %s", e)
340
+ return "", []
330
341
 
331
342
  def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
332
343
  """
@@ -344,12 +355,23 @@ class DocxReader:
344
355
  - A list of images extracted from the cell.
345
356
  """
346
357
 
347
- if self.paragraph_format == "markdown":
348
- newline = "<br>"
349
- else:
350
- newline = "\n"
351
- paragraph_texts, paragraph_images = zip(*[self.format_paragraph(p) for p in cell.paragraphs])
352
- return newline.join(paragraph_texts), paragraph_images
358
+ try:
359
+ newline = "<br>" if self.paragraph_format == "markdown" else "\n"
360
+ texts, images = [], []
361
+
362
+ for p in cell.paragraphs:
363
+ try:
364
+ t, imgs = self.format_paragraph(p)
365
+ texts.append(t)
366
+ images.extend(imgs)
367
+ except Exception as e:
368
+ logger.error("format_cell: failed to format paragraph in cell: %s", e)
369
+
370
+ return newline.join(texts), images
371
+
372
+ except Exception as e:
373
+ logger.error("format_cell: failed entirely: %s", e)
374
+ return "", []
353
375
 
354
376
  def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
355
377
  """
@@ -368,25 +390,50 @@ class DocxReader:
368
390
  - A DataFrame representation of the table's content.
369
391
  """
370
392
 
371
- rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
372
- texts = [[text for text, _ in row] for row in rows]
373
- table_images = [image for row in rows for _, images in row for image in images]
374
-
375
- table = pd.DataFrame(texts[1:], columns=texts[0])
376
- if "markdown" in self.table_format:
377
- table_text = table.to_markdown(index=False)
378
- if self.table_format == "markdown_light":
379
- table_text = re.sub(r"\s{2,}", " ", table_text)
380
- table_text = re.sub(r"-{2,}", "-", table_text)
381
- elif self.table_format == "csv":
382
- table_text = table.to_csv()
383
- elif self.table_format == "tag":
384
- table_text = self.table_tag.format(self.table_tag_index)
385
- self.table_tag_index += 1
386
- else:
387
- raise ValueError(f"Unknown table format {format}")
393
+ try:
394
+ rows_data = []
395
+ all_images = []
396
+
397
+ for row in table.rows:
398
+ row_texts = []
399
+ row_images = []
400
+ for cell in row.cells:
401
+ try:
402
+ cell_text, cell_imgs = self.format_cell(cell)
403
+ row_texts.append(cell_text)
404
+ row_images.extend(cell_imgs)
405
+ except Exception as e:
406
+ logger.error("format_table: failed to process cell: %s", e)
407
+ row_texts.append("") # pad for column alignment
408
+
409
+ rows_data.append(row_texts)
410
+ all_images.extend(row_images)
411
+
412
+ if not rows_data or not rows_data[0]:
413
+ return None, [], pd.DataFrame()
414
+
415
+ header = rows_data[0]
416
+ body = rows_data[1:]
417
+ df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
418
+
419
+ if "markdown" in self.table_format:
420
+ table_text = df.to_markdown(index=False)
421
+ if self.table_format == "markdown_light":
422
+ table_text = re.sub(r"\s{2,}", " ", table_text)
423
+ table_text = re.sub(r"-{2,}", "-", table_text)
424
+ elif self.table_format == "csv":
425
+ table_text = df.to_csv(index=False)
426
+ elif self.table_format == "tag":
427
+ table_text = self.table_tag.format(self.table_tag_index)
428
+ self.table_tag_index += 1
429
+ else:
430
+ raise ValueError(f"Unknown table format {self.table_format}")
431
+
432
+ return table_text, all_images, df
388
433
 
389
- return table_text, table_images, table
434
+ except Exception as e:
435
+ logger.error("format_table: failed to format table: %s", e)
436
+ return None, [], pd.DataFrame()
390
437
 
391
438
  @staticmethod
392
439
  def apply_text_style(style: str, text: str, level: int = 0) -> str:
@@ -841,30 +888,39 @@ class DocxReader:
841
888
  self._prev_para_image_idx = 0
842
889
 
843
890
  para_idx = 0
844
-
845
891
  for child in self.document.element.body.iterchildren():
846
- if isinstance(child, CT_P):
847
- paragraph = Paragraph(child, self.document)
848
- paragraph_text, paragraph_images = self.format_paragraph(paragraph)
849
-
850
- if extract_text:
851
- self._extract_para_text(
852
- paragraph,
853
- paragraph_text,
854
- base_unified_metadata,
855
- text_depth,
856
- para_idx,
857
- )
858
-
859
- if (extract_charts or extract_images or extract_tables) and paragraph_images:
860
- self._prev_para_images = paragraph_images
861
- self._prev_para_image_idx = para_idx
862
- self._pending_images += [(image, para_idx, "", base_unified_metadata) for image in paragraph_images]
863
- self.images += paragraph_images
892
+ try:
893
+ if isinstance(child, CT_P):
894
+ paragraph = Paragraph(child, self.document)
895
+ paragraph_text, paragraph_images = self.format_paragraph(paragraph)
896
+
897
+ if extract_text:
898
+ try:
899
+ self._extract_para_text(
900
+ paragraph,
901
+ paragraph_text,
902
+ base_unified_metadata,
903
+ text_depth,
904
+ para_idx,
905
+ )
906
+ except Exception as e:
907
+ logger.error("extract_data: _extract_para_text failed: %s", e)
908
+
909
+ if (extract_images or extract_charts or extract_tables) and paragraph_images:
910
+ self._pending_images += [
911
+ (image, para_idx, "", base_unified_metadata) for image in paragraph_images
912
+ ]
913
+ self.images.extend(paragraph_images)
914
+
915
+ elif isinstance(child, CT_Tbl):
916
+ if extract_tables or extract_charts:
917
+ try:
918
+ self._extract_table_data(child, base_unified_metadata)
919
+ except Exception as e:
920
+ logger.error("extract_data: _extract_table_data failed: %s", e)
864
921
 
865
- elif isinstance(child, CT_Tbl):
866
- if extract_tables or extract_charts:
867
- self._extract_table_data(child, base_unified_metadata)
922
+ except Exception as e:
923
+ logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
868
924
 
869
925
  para_idx += 1
870
926
 
@@ -27,9 +27,9 @@ from typing import Optional
27
27
  import pandas as pd
28
28
  from pptx import Presentation
29
29
  from pptx.enum.dml import MSO_COLOR_TYPE
30
- from pptx.enum.dml import MSO_THEME_COLOR
30
+ from pptx.enum.dml import MSO_THEME_COLOR # noqa
31
31
  from pptx.enum.shapes import MSO_SHAPE_TYPE
32
- from pptx.enum.shapes import PP_PLACEHOLDER
32
+ from pptx.enum.shapes import PP_PLACEHOLDER # noqa
33
33
  from pptx.shapes.autoshape import Shape
34
34
  from pptx.slide import Slide
35
35
 
@@ -220,20 +220,13 @@ def python_pptx(
220
220
  extraction_config: dict,
221
221
  execution_trace_log: Optional[List] = None,
222
222
  ):
223
- """
224
- Uses python-pptx to extract text from a PPTX bytestream, while deferring image
225
- classification into tables/charts if requested.
226
- """
227
-
228
- _ = extract_infographics # Placeholder for future use
229
- _ = execution_trace_log # Placeholder for future use
223
+ _ = extract_infographics
224
+ _ = execution_trace_log
230
225
 
231
226
  row_data = extraction_config.get("row_data")
232
227
  source_id = row_data["source_id"]
233
228
 
234
- text_depth = extraction_config.get("text_depth", "page")
235
- text_depth = TextTypeEnum[text_depth.upper()]
236
-
229
+ text_depth = TextTypeEnum[extraction_config.get("text_depth", "page").upper()]
237
230
  paragraph_format = extraction_config.get("paragraph_format", "markdown")
238
231
  identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)
239
232
 
@@ -241,16 +234,19 @@ def python_pptx(
241
234
  pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
242
235
  trace_info = extraction_config.get("trace_info", {})
243
236
 
244
- base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
237
+ base_unified_metadata = row_data.get(metadata_col, {})
245
238
  base_source_metadata = base_unified_metadata.get("source_metadata", {})
246
239
  source_location = base_source_metadata.get("source_location", "")
247
240
  collection_id = base_source_metadata.get("collection_id", "")
248
241
  partition_id = base_source_metadata.get("partition_id", -1)
249
242
  access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
250
243
 
251
- presentation = Presentation(pptx_stream)
244
+ try:
245
+ presentation = Presentation(pptx_stream)
246
+ except Exception as e:
247
+ logger.error("Failed to open PPTX presentation: %s", e)
248
+ return []
252
249
 
253
- # Collect source metadata from the core properties of the document.
254
250
  last_modified = (
255
251
  presentation.core_properties.modified.isoformat()
256
252
  if presentation.core_properties.modified
@@ -262,12 +258,11 @@ def python_pptx(
262
258
  else datetime.now().isoformat()
263
259
  )
264
260
  keywords = presentation.core_properties.keywords
265
- source_type = DocumentTypeEnum.PPTX
266
261
  source_metadata = {
267
- "source_name": source_id, # python-pptx doesn't maintain filename; re-use source_id
262
+ "source_name": source_id,
268
263
  "source_id": source_id,
269
264
  "source_location": source_location,
270
- "source_type": source_type,
265
+ "source_type": DocumentTypeEnum.PPTX,
271
266
  "collection_id": collection_id,
272
267
  "date_created": date_created,
273
268
  "last_modified": last_modified,
@@ -277,18 +272,16 @@ def python_pptx(
277
272
  }
278
273
 
279
274
  slide_count = len(presentation.slides)
280
-
281
275
  accumulated_text = []
282
276
  extracted_data = []
283
-
284
- # Hold images here for final classification.
285
- # Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
286
- # base_unified_metadata)
287
277
  pending_images = []
288
278
 
289
279
  for slide_idx, slide in enumerate(presentation.slides):
290
- # Obtain a flat list of shapes (ungrouped) sorted by top then left.
291
- shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
280
+ try:
281
+ shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
282
+ except Exception as e:
283
+ logger.error("Slide %d: Failed to ungroup or sort shapes: %s", slide_idx, e)
284
+ continue
292
285
 
293
286
  page_nearby_blocks = {
294
287
  "text": {"content": [], "bbox": []},
@@ -297,152 +290,179 @@ def python_pptx(
297
290
  }
298
291
 
299
292
  for shape_idx, shape in enumerate(shapes):
300
- block_text = []
301
- added_title = added_subtitle = False
302
-
303
- # ---------------------------------------------
304
- # 1) Text Extraction
305
- # ---------------------------------------------
306
- if extract_text and shape.has_text_frame:
307
- for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
308
- if not paragraph.text.strip():
309
- continue
310
-
311
- for run_idx, run in enumerate(paragraph.runs):
312
- text = run.text
313
- if not text:
314
- continue
293
+ try:
294
+ block_text = []
295
+ added_title = added_subtitle = False
315
296
 
316
- text = escape_text(text)
297
+ # Text extraction
298
+ if extract_text and shape.has_text_frame:
299
+ for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
300
+ if not paragraph.text.strip():
301
+ continue
317
302
 
318
- if paragraph_format == "markdown":
319
- if is_title(shape):
320
- if not added_title:
321
- text = process_title(shape)
322
- added_title = True
323
- else:
303
+ for run_idx, run in enumerate(paragraph.runs):
304
+ try:
305
+ text = run.text
306
+ if not text:
324
307
  continue
325
- elif is_subtitle(shape):
326
- if not added_subtitle:
327
- text = process_subtitle(shape)
328
- added_subtitle = True
329
- else:
330
- continue
331
- else:
332
- if run.hyperlink.address:
333
- text = get_hyperlink(text, run.hyperlink.address)
334
- if is_accent(paragraph.font) or is_accent(run.font):
335
- text = format_text(text, italic=True)
336
- elif is_strong(paragraph.font) or is_strong(run.font):
337
- text = format_text(text, bold=True)
338
- elif is_underlined(paragraph.font) or is_underlined(run.font):
339
- text = format_text(text, underline=True)
340
- if is_list_block(shape):
341
- text = " " * paragraph.level + "* " + text
342
-
343
- accumulated_text.append(text)
344
-
345
- # For "nearby objects", store block text.
346
- if extract_images and identify_nearby_objects:
347
- block_text.append(text)
348
-
349
- # If we only want text at SPAN level, flush after each run.
350
- if text_depth == TextTypeEnum.SPAN:
351
- text_extraction = _construct_text_metadata(
308
+
309
+ text = escape_text(text)
310
+
311
+ if paragraph_format == "markdown":
312
+ if is_title(shape) and not added_title:
313
+ text = process_title(shape)
314
+ added_title = True
315
+ elif is_subtitle(shape) and not added_subtitle:
316
+ text = process_subtitle(shape)
317
+ added_subtitle = True
318
+ elif is_title(shape) or is_subtitle(shape):
319
+ continue # already added
320
+
321
+ if run.hyperlink and run.hyperlink.address:
322
+ text = get_hyperlink(text, run.hyperlink.address)
323
+ if is_accent(paragraph.font) or is_accent(run.font):
324
+ text = format_text(text, italic=True)
325
+ elif is_strong(paragraph.font) or is_strong(run.font):
326
+ text = format_text(text, bold=True)
327
+ elif is_underlined(paragraph.font) or is_underlined(run.font):
328
+ text = format_text(text, underline=True)
329
+ if is_list_block(shape):
330
+ text = " " * paragraph.level + "* " + text
331
+
332
+ accumulated_text.append(text)
333
+ if extract_images and identify_nearby_objects:
334
+ block_text.append(text)
335
+
336
+ if text_depth == TextTypeEnum.SPAN:
337
+ extracted_data.append(
338
+ _construct_text_metadata(
339
+ presentation,
340
+ shape,
341
+ accumulated_text,
342
+ keywords,
343
+ slide_idx,
344
+ shape_idx,
345
+ paragraph_idx,
346
+ run_idx,
347
+ slide_count,
348
+ text_depth,
349
+ source_metadata,
350
+ base_unified_metadata,
351
+ )
352
+ )
353
+ accumulated_text = []
354
+
355
+ except Exception as e:
356
+ logger.warning(
357
+ "Slide %d Shape %d Run %d: Failed to process run: %s",
358
+ slide_idx,
359
+ shape_idx,
360
+ run_idx,
361
+ e,
362
+ )
363
+
364
+ if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
365
+ accumulated_text.append("\n\n")
366
+
367
+ if text_depth == TextTypeEnum.LINE:
368
+ extracted_data.append(
369
+ _construct_text_metadata(
370
+ presentation,
371
+ shape,
372
+ accumulated_text,
373
+ keywords,
374
+ slide_idx,
375
+ shape_idx,
376
+ paragraph_idx,
377
+ -1,
378
+ slide_count,
379
+ text_depth,
380
+ source_metadata,
381
+ base_unified_metadata,
382
+ )
383
+ )
384
+ accumulated_text = []
385
+
386
+ if text_depth == TextTypeEnum.BLOCK:
387
+ extracted_data.append(
388
+ _construct_text_metadata(
352
389
  presentation,
353
390
  shape,
354
391
  accumulated_text,
355
392
  keywords,
356
393
  slide_idx,
357
394
  shape_idx,
358
- paragraph_idx,
359
- run_idx,
395
+ -1,
396
+ -1,
360
397
  slide_count,
361
398
  text_depth,
362
399
  source_metadata,
363
400
  base_unified_metadata,
364
401
  )
365
- if len(text_extraction) > 0:
366
- extracted_data.append(text_extraction)
367
- accumulated_text = []
402
+ )
403
+ accumulated_text = []
368
404
 
369
- # Add newlines for separation at line/paragraph level.
370
- if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
371
- accumulated_text.append("\n\n")
405
+ if extract_images and identify_nearby_objects and block_text:
406
+ page_nearby_blocks["text"]["content"].append("".join(block_text))
407
+ page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
372
408
 
373
- if text_depth == TextTypeEnum.LINE:
374
- text_extraction = _construct_text_metadata(
375
- presentation,
409
+ # Image processing (deferred)
410
+ if extract_images:
411
+ try:
412
+ process_shape(
376
413
  shape,
377
- accumulated_text,
378
- keywords,
379
- slide_idx,
380
414
  shape_idx,
381
- paragraph_idx,
382
- -1,
415
+ slide_idx,
383
416
  slide_count,
384
- text_depth,
417
+ pending_images,
418
+ page_nearby_blocks,
385
419
  source_metadata,
386
420
  base_unified_metadata,
387
421
  )
388
- if len(text_extraction) > 0:
389
- extracted_data.append(text_extraction)
390
- accumulated_text = []
422
+ except Exception as e:
423
+ logger.warning("Slide %d Shape %d: Failed to process image shape: %s", slide_idx, shape_idx, e)
424
+
425
+ # Table extraction
426
+ if extract_tables and shape.has_table:
427
+ try:
428
+ extracted_data.append(
429
+ _construct_table_metadata(
430
+ shape, slide_idx, slide_count, source_metadata, base_unified_metadata
431
+ )
432
+ )
433
+ except Exception as e:
434
+ logger.warning("Slide %d Shape %d: Failed to extract table: %s", slide_idx, shape_idx, e)
391
435
 
392
- if text_depth == TextTypeEnum.BLOCK:
393
- text_extraction = _construct_text_metadata(
394
- presentation,
395
- shape,
396
- accumulated_text,
397
- keywords,
398
- slide_idx,
399
- shape_idx,
400
- -1,
401
- -1,
402
- slide_count,
403
- text_depth,
404
- source_metadata,
405
- base_unified_metadata,
406
- )
407
- if len(text_extraction) > 0:
408
- extracted_data.append(text_extraction)
409
- accumulated_text = []
410
-
411
- if extract_images and identify_nearby_objects and block_text:
412
- page_nearby_blocks["text"]["content"].append("".join(block_text))
413
- page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
414
-
415
- # ---------------------------------------------
416
- # 2) Image Handling (DEFERRED) with nested/group shapes
417
- # ---------------------------------------------
418
- if extract_images:
419
- process_shape(
420
- shape,
421
- shape_idx,
436
+ except Exception as e:
437
+ logger.warning("Slide %d Shape %d: Top-level failure: %s", slide_idx, shape_idx, e)
438
+
439
+ if extract_text and text_depth == TextTypeEnum.PAGE and accumulated_text:
440
+ extracted_data.append(
441
+ _construct_text_metadata(
442
+ presentation,
443
+ None,
444
+ accumulated_text,
445
+ keywords,
422
446
  slide_idx,
447
+ -1,
448
+ -1,
449
+ -1,
423
450
  slide_count,
424
- pending_images,
425
- page_nearby_blocks,
451
+ text_depth,
426
452
  source_metadata,
427
453
  base_unified_metadata,
428
454
  )
455
+ )
456
+ accumulated_text = []
429
457
 
430
- # ---------------------------------------------
431
- # 3) Table Handling
432
- # ---------------------------------------------
433
- if extract_tables and shape.has_table:
434
- table_extraction = _construct_table_metadata(
435
- shape, slide_idx, slide_count, source_metadata, base_unified_metadata
436
- )
437
- extracted_data.append(table_extraction)
438
-
439
- if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
440
- text_extraction = _construct_text_metadata(
458
+ if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
459
+ extracted_data.append(
460
+ _construct_text_metadata(
441
461
  presentation,
442
- shape, # may pass None if preferred
462
+ None,
443
463
  accumulated_text,
444
464
  keywords,
445
- slide_idx,
465
+ -1,
446
466
  -1,
447
467
  -1,
448
468
  -1,
@@ -451,41 +471,20 @@ def python_pptx(
451
471
  source_metadata,
452
472
  base_unified_metadata,
453
473
  )
454
- if len(text_extraction) > 0:
455
- extracted_data.append(text_extraction)
456
- accumulated_text = []
457
-
458
- if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
459
- text_extraction = _construct_text_metadata(
460
- presentation,
461
- shape, # may pass None
462
- accumulated_text,
463
- keywords,
464
- -1,
465
- -1,
466
- -1,
467
- -1,
468
- slide_count,
469
- text_depth,
470
- source_metadata,
471
- base_unified_metadata,
472
474
  )
473
- if len(text_extraction) > 0:
474
- extracted_data.append(text_extraction)
475
- accumulated_text = []
476
475
 
477
- # ---------------------------------------------
478
- # FINAL STEP: Finalize images (and tables/charts)
479
- # ---------------------------------------------
480
476
  if extract_images or extract_tables or extract_charts:
481
- _finalize_images(
482
- pending_images,
483
- extracted_data,
484
- pptx_extractor_config,
485
- extract_tables=extract_tables,
486
- extract_charts=extract_charts,
487
- trace_info=trace_info,
488
- )
477
+ try:
478
+ _finalize_images(
479
+ pending_images,
480
+ extracted_data,
481
+ pptx_extractor_config,
482
+ extract_tables=extract_tables,
483
+ extract_charts=extract_charts,
484
+ trace_info=trace_info,
485
+ )
486
+ except Exception as e:
487
+ logger.error("Finalization of images failed: %s", e)
489
488
 
490
489
  return extracted_data
491
490
 
@@ -118,9 +118,15 @@ def transform_text_split_and_tokenize_internal(
118
118
  )
119
119
 
120
120
  # Filter to documents with text content.
121
- bool_index = (df_transform_ledger["document_type"] == ContentTypeEnum.TEXT) & (
122
- pd.json_normalize(df_transform_ledger["metadata"])["source_metadata.source_type"].isin(split_source_types)
123
- )
121
+ text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
122
+
123
+ normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
124
+ if "source_metadata.source_type" in normalized_meta_df.columns:
125
+ source_type_condition = normalized_meta_df["source_metadata.source_type"].isin(split_source_types)
126
+ else:
127
+ source_type_condition = False
128
+
129
+ bool_index = text_type_condition & source_type_condition
124
130
  df_filtered: pd.DataFrame = df_transform_ledger.loc[bool_index]
125
131
 
126
132
  if df_filtered.empty:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.18.dev20250518
3
+ Version: 2025.5.19.dev20250519
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -16,7 +16,7 @@ nv_ingest_api/internal/extract/docx/docx_extractor.py,sha256=jjbL12F5dtpbqHRbhL0
16
16
  nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
18
18
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
19
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=CM2yV8lfEw1F1ORAjupD4gyIKX0PDDJrL3nsZ5Mnrgg,31539
19
+ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=FOZZBD9gRRAr93qgK_L6o9xVBYD-6EE5-xI2-cWKvzo,33713
20
20
  nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
21
  nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
22
22
  nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
@@ -37,7 +37,7 @@ nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ
37
37
  nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
38
38
  nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
39
39
  nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py,sha256=Lg2I1Zq-WJagsZibgyn__8T-M86BjkqAiXWNta9X_EU,29430
40
+ nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py,sha256=IZu0c_RHDSJwwclOZD3_tDu5jg4AEEfumbwKB78dUE0,29716
41
41
  nv_ingest_api/internal/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
42
42
  nv_ingest_api/internal/mutate/deduplicate.py,sha256=hmvTTGevpCtlkM_wVZSoc8-Exr6rUJwqLjoEnbPcPzY,3849
43
43
  nv_ingest_api/internal/mutate/filter.py,sha256=H-hOTBVP-zLpvQr-FoGIJKxkhtj4l_sZ9V2Fgu3rTEM,5183
@@ -97,7 +97,7 @@ nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9
97
97
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
98
98
  nv_ingest_api/internal/transform/caption_image.py,sha256=RYL_b26zfaRlbHz0XvLw9HwaMlXpNhr7gayjxGzdALQ,8545
99
99
  nv_ingest_api/internal/transform/embed_text.py,sha256=F8kg-WXihtuUMwDQUUYjnfGDCdQp1Mkd-jeThOiJT0s,16507
100
- nv_ingest_api/internal/transform/split_text.py,sha256=y6NYRkCEVpVsDu-AqrKx2D6JPp1vwxclw9obNZNJIIs,6561
100
+ nv_ingest_api/internal/transform/split_text.py,sha256=DlVoyHLqZ-6_FiWwZmofPcq7TX8Ta23hIE0St9tw1IY,6822
101
101
  nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
102
102
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
103
  nv_ingest_api/util/control_message/validators.py,sha256=KvvbyheJ5rbzvJbH9JKpMR9VfoI0b0uM6eTAZte1p44,1315
@@ -147,8 +147,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
147
147
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
148
148
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
149
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
150
- nv_ingest_api-2025.5.18.dev20250518.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
151
- nv_ingest_api-2025.5.18.dev20250518.dist-info/METADATA,sha256=JlVjLzmSn4zx25vzOggr993vge5gS2VflDsBw84dG6M,13889
152
- nv_ingest_api-2025.5.18.dev20250518.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
153
- nv_ingest_api-2025.5.18.dev20250518.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
154
- nv_ingest_api-2025.5.18.dev20250518.dist-info/RECORD,,
150
+ nv_ingest_api-2025.5.19.dev20250519.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
151
+ nv_ingest_api-2025.5.19.dev20250519.dist-info/METADATA,sha256=LF2uw9E7zhD2ylp4pRazX1C53VqDPN3FOO4NVrLXGe8,13889
152
+ nv_ingest_api-2025.5.19.dev20250519.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
153
+ nv_ingest_api-2025.5.19.dev20250519.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
154
+ nv_ingest_api-2025.5.19.dev20250519.dist-info/RECORD,,