nv-ingest-api 2025.5.18.dev20250518__py3-none-any.whl → 2025.5.19.dev20250519__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of nv-ingest-api has been flagged as a potentially problematic release; details are available on the registry page.
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +170 -171
- nv_ingest_api/internal/transform/split_text.py +9 -3
- {nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/RECORD +8 -8
- {nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py

```diff
@@ -274,59 +274,70 @@ class DocxReader:
             A list of extracted images from the paragraph.
         """
 
-        … (5 removed lines not shown in the source diff)
+        try:
+            paragraph_images = []
+            if self.paragraph_format == "text":
+                return paragraph.text.strip(), paragraph_images
+
             font = paragraph.style.font
             default_style = (font.bold, font.italic, font.underline)
 
-        # Iterate over the runs of the paragraph and group them by style, excluding empty runs
             paragraph_text = ""
             group_text = ""
             previous_style = None
 
             for c in paragraph.iter_inner_content():
-                … (25 removed lines not shown in the source diff)
+                try:
+                    if isinstance(c, Hyperlink):
+                        text = f"[{c.text}]({c.address})"
+                        style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
+                    elif isinstance(c, Run):
+                        text = c.text
+                        style = (c.bold, c.italic, c.underline)
+
+                        # 1. Locate the inline shape which is stored in the <w:drawing> element.
+                        # 2. r:embed in <a.blip> has the relationship id for extracting the file where
+                        #    the image is stored as bytes.
+                        # Reference:
+                        # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
+                        inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
+                        for r_id in inline_shapes:
+                            text += self.image_tag.format(self.image_tag_index)
+                            self.image_tag_index += 1
+                            try:
+                                image = paragraph.part.related_parts[r_id].image
+                                paragraph_images.append(image)
+                            except Exception as img_e:
+                                logger.warning(
+                                    "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
+                                    r_id,
+                                    img_e,
+                                )
+                    else:
+                        continue
+
+                    style = tuple(s if s is not None else d for s, d in zip(style, default_style))
+
+                    if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
                         paragraph_text += self.format_text(group_text, *previous_style)
                         group_text = ""
 
-                … (3 removed lines not shown)
+                    group_text += text
+                    if not self.is_text_empty(text):
+                        previous_style = style
 
-                … (3 removed lines not shown)
+                except Exception as e:
+                    logger.error("format_paragraph: failed to process run: %s", e)
+                    continue
+
+            if group_text and previous_style:
+                paragraph_text += self.format_text(group_text, *previous_style)
+
+            return paragraph_text.strip(), paragraph_images
 
-            … (3 removed lines not shown)
+        except Exception as e:
+            logger.error("format_paragraph: failed for paragraph: %s", e)
+            return "", []
 
     def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
         """
```
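The reworked run handling above resolves each `r:embed` relationship id to an image part via `related_parts`. A minimal standalone sketch of that same pattern, assuming a local `sample.docx` (the filename is illustrative):

```python
from docx import Document

doc = Document("sample.docx")
images = []
for paragraph in doc.paragraphs:
    for run in paragraph.runs:
        # r:embed on <a:blip> carries the relationship id of the embedded image part.
        for r_id in run._element.xpath(".//w:drawing//a:blip/@r:embed"):
            image_part = paragraph.part.related_parts[r_id]  # rId -> ImagePart
            images.append(image_part.image)                  # lazily loaded Image object

print(f"extracted {len(images)} inline image(s)")
```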
```diff
@@ -344,12 +355,23 @@ class DocxReader:
             A list of images extracted from the cell.
         """
 
-
-        newline = "<br>"
-        … (4 removed lines not shown)
+        try:
+            newline = "<br>" if self.paragraph_format == "markdown" else "\n"
+            texts, images = [], []
+
+            for p in cell.paragraphs:
+                try:
+                    t, imgs = self.format_paragraph(p)
+                    texts.append(t)
+                    images.extend(imgs)
+                except Exception as e:
+                    logger.error("format_cell: failed to format paragraph in cell: %s", e)
+
+            return newline.join(texts), images
+
+        except Exception as e:
+            logger.error("format_cell: failed entirely: %s", e)
+            return "", []
 
     def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
         """
```
```diff
@@ -368,25 +390,50 @@ class DocxReader:
             A DataFrame representation of the table's content.
         """
 
-        … (17 removed lines not shown)
+        try:
+            rows_data = []
+            all_images = []
+
+            for row in table.rows:
+                row_texts = []
+                row_images = []
+                for cell in row.cells:
+                    try:
+                        cell_text, cell_imgs = self.format_cell(cell)
+                        row_texts.append(cell_text)
+                        row_images.extend(cell_imgs)
+                    except Exception as e:
+                        logger.error("format_table: failed to process cell: %s", e)
+                        row_texts.append("")  # pad for column alignment
+
+                rows_data.append(row_texts)
+                all_images.extend(row_images)
+
+            if not rows_data or not rows_data[0]:
+                return None, [], pd.DataFrame()
+
+            header = rows_data[0]
+            body = rows_data[1:]
+            df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
+
+            if "markdown" in self.table_format:
+                table_text = df.to_markdown(index=False)
+                if self.table_format == "markdown_light":
+                    table_text = re.sub(r"\s{2,}", " ", table_text)
+                    table_text = re.sub(r"-{2,}", "-", table_text)
+            elif self.table_format == "csv":
+                table_text = df.to_csv(index=False)
+            elif self.table_format == "tag":
+                table_text = self.table_tag.format(self.table_tag_index)
+                self.table_tag_index += 1
+            else:
+                raise ValueError(f"Unknown table format {self.table_format}")
+
+            return table_text, all_images, df
 
-        … (1 removed line not shown)
+        except Exception as e:
+            logger.error("format_table: failed to format table: %s", e)
+            return None, [], pd.DataFrame()
 
     @staticmethod
     def apply_text_style(style: str, text: str, level: int = 0) -> str:
```
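The `markdown_light` branch above trades alignment for compactness by collapsing the padding that `DataFrame.to_markdown` emits. A toy illustration of that post-processing (the frame contents are made up; `to_markdown` requires the `tabulate` package):

```python
import re
import pandas as pd

df = pd.DataFrame([["a", "first row"], ["b", "second"]], columns=["key", "value"])
table_text = df.to_markdown(index=False)    # padded, aligned markdown table
light = re.sub(r"\s{2,}", " ", table_text)  # collapse alignment padding
light = re.sub(r"-{2,}", "-", light)        # shrink the |---|---| header rule
print(light)
```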
```diff
@@ -841,30 +888,39 @@ class DocxReader:
         self._prev_para_image_idx = 0
 
         para_idx = 0
-
         for child in self.document.element.body.iterchildren():
-            … (18 removed lines not shown)
+            try:
+                if isinstance(child, CT_P):
+                    paragraph = Paragraph(child, self.document)
+                    paragraph_text, paragraph_images = self.format_paragraph(paragraph)
+
+                    if extract_text:
+                        try:
+                            self._extract_para_text(
+                                paragraph,
+                                paragraph_text,
+                                base_unified_metadata,
+                                text_depth,
+                                para_idx,
+                            )
+                        except Exception as e:
+                            logger.error("extract_data: _extract_para_text failed: %s", e)
+
+                    if (extract_images or extract_charts or extract_tables) and paragraph_images:
+                        self._pending_images += [
+                            (image, para_idx, "", base_unified_metadata) for image in paragraph_images
+                        ]
+                        self.images.extend(paragraph_images)
+
+                elif isinstance(child, CT_Tbl):
+                    if extract_tables or extract_charts:
+                        try:
+                            self._extract_table_data(child, base_unified_metadata)
+                        except Exception as e:
+                            logger.error("extract_data: _extract_table_data failed: %s", e)
 
-            … (2 removed lines not shown)
-            self._extract_table_data(child, base_unified_metadata)
+            except Exception as e:
+                logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
 
             para_idx += 1
```
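The traversal above walks `document.element.body` so paragraphs and tables come back interleaved in document order, which `Document.paragraphs` alone cannot provide. A minimal sketch of the same python-docx pattern, assuming a local `sample.docx`:

```python
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

doc = Document("sample.docx")
for child in doc.element.body.iterchildren():
    if isinstance(child, CT_P):          # a <w:p> paragraph element
        print("paragraph:", Paragraph(child, doc).text[:60])
    elif isinstance(child, CT_Tbl):      # a <w:tbl> table element
        table = Table(child, doc)
        print("table:", len(table.rows), "rows")
```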
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py

```diff
@@ -27,9 +27,9 @@ from typing import Optional
 import pandas as pd
 from pptx import Presentation
 from pptx.enum.dml import MSO_COLOR_TYPE
-from pptx.enum.dml import MSO_THEME_COLOR
+from pptx.enum.dml import MSO_THEME_COLOR  # noqa
 from pptx.enum.shapes import MSO_SHAPE_TYPE
-from pptx.enum.shapes import PP_PLACEHOLDER
+from pptx.enum.shapes import PP_PLACEHOLDER  # noqa
 from pptx.shapes.autoshape import Shape
 from pptx.slide import Slide
```
```diff
@@ -220,20 +220,13 @@ def python_pptx(
     extraction_config: dict,
     execution_trace_log: Optional[List] = None,
 ):
-    … (2 removed lines not shown)
-    classification into tables/charts if requested.
-    """
-
-    _ = extract_infographics  # Placeholder for future use
-    _ = execution_trace_log  # Placeholder for future use
+    _ = extract_infographics
+    _ = execution_trace_log
 
     row_data = extraction_config.get("row_data")
     source_id = row_data["source_id"]
 
-    text_depth = extraction_config.get("text_depth", "page")
-    text_depth = TextTypeEnum[text_depth.upper()]
-
+    text_depth = TextTypeEnum[extraction_config.get("text_depth", "page").upper()]
     paragraph_format = extraction_config.get("paragraph_format", "markdown")
     identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)
```
```diff
@@ -241,16 +234,19 @@ def python_pptx(
     pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
     trace_info = extraction_config.get("trace_info", {})
 
-    base_unified_metadata = row_data
+    base_unified_metadata = row_data.get(metadata_col, {})
     base_source_metadata = base_unified_metadata.get("source_metadata", {})
     source_location = base_source_metadata.get("source_location", "")
     collection_id = base_source_metadata.get("collection_id", "")
     partition_id = base_source_metadata.get("partition_id", -1)
     access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
 
-    … (1 removed line not shown)
+    try:
+        presentation = Presentation(pptx_stream)
+    except Exception as e:
+        logger.error("Failed to open PPTX presentation: %s", e)
+        return []
 
-    # Collect source metadata from the core properties of the document.
     last_modified = (
         presentation.core_properties.modified.isoformat()
         if presentation.core_properties.modified
```
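`Presentation()` accepts a path or a binary stream, and it is the first place a truncated or corrupt upload raises, which is why the hunk above guards the call. A small sketch, assuming a local `deck.pptx`:

```python
from io import BytesIO
from pptx import Presentation

with open("deck.pptx", "rb") as f:
    pptx_stream = BytesIO(f.read())  # mirrors the in-memory stream the extractor receives

try:
    presentation = Presentation(pptx_stream)
    print("slides:", len(presentation.slides))
except Exception as e:
    print("unreadable pptx:", e)
```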
```diff
@@ -262,12 +258,11 @@ def python_pptx(
         else datetime.now().isoformat()
     )
     keywords = presentation.core_properties.keywords
-    source_type = DocumentTypeEnum.PPTX
     source_metadata = {
-        "source_name": source_id,
+        "source_name": source_id,
         "source_id": source_id,
         "source_location": source_location,
-        "source_type": …
+        "source_type": DocumentTypeEnum.PPTX,
         "collection_id": collection_id,
         "date_created": date_created,
         "last_modified": last_modified,
```
```diff
@@ -277,18 +272,16 @@ def python_pptx(
     }
 
     slide_count = len(presentation.slides)
-
     accumulated_text = []
     extracted_data = []
-
-    # Hold images here for final classification.
-    # Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
-    # base_unified_metadata)
     pending_images = []
 
     for slide_idx, slide in enumerate(presentation.slides):
-        … (2 removed lines not shown)
+        try:
+            shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
+        except Exception as e:
+            logger.error("Slide %d: Failed to ungroup or sort shapes: %s", slide_idx, e)
+            continue
 
         page_nearby_blocks = {
             "text": {"content": [], "bbox": []},
```
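`ungroup_shapes` and `_safe_position` are nv-ingest helpers whose bodies this diff does not show. The stand-ins below are hypothetical reconstructions of what the call site implies: flatten group shapes recursively, then sort top-to-bottom, left-to-right while tolerating shapes whose offsets are `None`:

```python
from pptx.enum.shapes import MSO_SHAPE_TYPE

def ungroup_shapes(shapes):
    """Recursively flatten group shapes into a single flat list."""
    flat = []
    for shape in shapes:
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            flat.extend(ungroup_shapes(shape.shapes))  # GroupShape exposes .shapes
        else:
            flat.append(shape)
    return flat

def _safe_position(shape):
    """Sort key for reading order; missing offsets sort as 0 so sorted() never compares None."""
    top = shape.top if shape.top is not None else 0
    left = shape.left if shape.left is not None else 0
    return (top, left)
```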
```diff
@@ -297,152 +290,179 @@ def python_pptx(
     }
 
     for shape_idx, shape in enumerate(shapes):
-        … (3 removed lines not shown)
-        # ---------------------------------------------
-        # 1) Text Extraction
-        # ---------------------------------------------
-        if extract_text and shape.has_text_frame:
-            for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
-                if not paragraph.text.strip():
-                    continue
-
-                for run_idx, run in enumerate(paragraph.runs):
-                    text = run.text
-                    if not text:
-                        continue
+        try:
+            block_text = []
+            added_title = added_subtitle = False
 
-
+            # Text extraction
+            if extract_text and shape.has_text_frame:
+                for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
+                    if not paragraph.text.strip():
+                        continue
 
-        … (4 removed lines not shown)
-                        added_title = True
-                    else:
+                    for run_idx, run in enumerate(paragraph.runs):
+                        try:
+                            text = run.text
+                            if not text:
                                 continue
-        … (27 removed lines not shown)
+
+                            text = escape_text(text)
+
+                            if paragraph_format == "markdown":
+                                if is_title(shape) and not added_title:
+                                    text = process_title(shape)
+                                    added_title = True
+                                elif is_subtitle(shape) and not added_subtitle:
+                                    text = process_subtitle(shape)
+                                    added_subtitle = True
+                                elif is_title(shape) or is_subtitle(shape):
+                                    continue  # already added
+
+                                if run.hyperlink and run.hyperlink.address:
+                                    text = get_hyperlink(text, run.hyperlink.address)
+                                if is_accent(paragraph.font) or is_accent(run.font):
+                                    text = format_text(text, italic=True)
+                                elif is_strong(paragraph.font) or is_strong(run.font):
+                                    text = format_text(text, bold=True)
+                                elif is_underlined(paragraph.font) or is_underlined(run.font):
+                                    text = format_text(text, underline=True)
+                                if is_list_block(shape):
+                                    text = " " * paragraph.level + "* " + text
+
+                            accumulated_text.append(text)
+                            if extract_images and identify_nearby_objects:
+                                block_text.append(text)
+
+                            if text_depth == TextTypeEnum.SPAN:
+                                extracted_data.append(
+                                    _construct_text_metadata(
+                                        presentation,
+                                        shape,
+                                        accumulated_text,
+                                        keywords,
+                                        slide_idx,
+                                        shape_idx,
+                                        paragraph_idx,
+                                        run_idx,
+                                        slide_count,
+                                        text_depth,
+                                        source_metadata,
+                                        base_unified_metadata,
+                                    )
+                                )
+                                accumulated_text = []
+
+                        except Exception as e:
+                            logger.warning(
+                                "Slide %d Shape %d Run %d: Failed to process run: %s",
+                                slide_idx,
+                                shape_idx,
+                                run_idx,
+                                e,
+                            )
+
+                    if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
+                        accumulated_text.append("\n\n")
+
+                    if text_depth == TextTypeEnum.LINE:
+                        extracted_data.append(
+                            _construct_text_metadata(
+                                presentation,
+                                shape,
+                                accumulated_text,
+                                keywords,
+                                slide_idx,
+                                shape_idx,
+                                paragraph_idx,
+                                -1,
+                                slide_count,
+                                text_depth,
+                                source_metadata,
+                                base_unified_metadata,
+                            )
+                        )
+                        accumulated_text = []
+
+                if text_depth == TextTypeEnum.BLOCK:
+                    extracted_data.append(
+                        _construct_text_metadata(
                             presentation,
                             shape,
                             accumulated_text,
                             keywords,
                             slide_idx,
                             shape_idx,
-            … (2 removed lines not shown)
+                            -1,
+                            -1,
                             slide_count,
                             text_depth,
                             source_metadata,
                             base_unified_metadata,
                         )
-            … (2 removed lines not shown)
-            accumulated_text = []
+                    )
+                    accumulated_text = []
 
-        … (3 removed lines not shown)
+            if extract_images and identify_nearby_objects and block_text:
+                page_nearby_blocks["text"]["content"].append("".join(block_text))
+                page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
 
-        … (3 removed lines not shown)
+            # Image processing (deferred)
+            if extract_images:
+                try:
+                    process_shape(
                         shape,
-                accumulated_text,
-                keywords,
-                slide_idx,
                         shape_idx,
-
-                -1,
+                        slide_idx,
                         slide_count,
-            … (1 removed line not shown)
+                        pending_images,
+                        page_nearby_blocks,
                         source_metadata,
                         base_unified_metadata,
                     )
-        … (3 removed lines not shown)
+                except Exception as e:
+                    logger.warning("Slide %d Shape %d: Failed to process image shape: %s", slide_idx, shape_idx, e)
+
+            # Table extraction
+            if extract_tables and shape.has_table:
+                try:
+                    extracted_data.append(
+                        _construct_table_metadata(
+                            shape, slide_idx, slide_count, source_metadata, base_unified_metadata
+                        )
+                    )
+                except Exception as e:
+                    logger.warning("Slide %d Shape %d: Failed to extract table: %s", slide_idx, shape_idx, e)
 
-        … (10 removed lines not shown)
-                slide_count,
-                text_depth,
-                source_metadata,
-                base_unified_metadata,
-            )
-            if len(text_extraction) > 0:
-                extracted_data.append(text_extraction)
-            accumulated_text = []
-
-            if extract_images and identify_nearby_objects and block_text:
-                page_nearby_blocks["text"]["content"].append("".join(block_text))
-                page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))
-
-        # ---------------------------------------------
-        # 2) Image Handling (DEFERRED) with nested/group shapes
-        # ---------------------------------------------
-        if extract_images:
-            process_shape(
-                shape,
-                shape_idx,
+        except Exception as e:
+            logger.warning("Slide %d Shape %d: Top-level failure: %s", slide_idx, shape_idx, e)
+
+        if extract_text and text_depth == TextTypeEnum.PAGE and accumulated_text:
+            extracted_data.append(
+                _construct_text_metadata(
+                    presentation,
+                    None,
+                    accumulated_text,
+                    keywords,
                     slide_idx,
+                    -1,
+                    -1,
+                    -1,
                     slide_count,
-
-                page_nearby_blocks,
+                    text_depth,
                     source_metadata,
                     base_unified_metadata,
                 )
+            )
+            accumulated_text = []
 
-        … (3 removed lines not shown)
-        if extract_tables and shape.has_table:
-            table_extraction = _construct_table_metadata(
-                shape, slide_idx, slide_count, source_metadata, base_unified_metadata
-            )
-            extracted_data.append(table_extraction)
-
-        if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
-            text_extraction = _construct_text_metadata(
+    if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
+        extracted_data.append(
+            _construct_text_metadata(
                 presentation,
-                … (1 removed line not shown)
+                None,
                 accumulated_text,
                 keywords,
-                … (1 removed line not shown)
+                -1,
                 -1,
                 -1,
                 -1,
```
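`is_strong`, `is_underlined`, and `format_text` are likewise nv-ingest helpers that this diff only calls. The stand-ins below are hypothetical sketches of the python-pptx font attributes they plausibly read and the markdown emphasis the call sites imply:

```python
def is_strong(font) -> bool:
    # python-pptx fonts report bold as True/False/None (None = inherited).
    return bool(font.bold)

def is_underlined(font) -> bool:
    # underline may be True/False/None or an MSO_UNDERLINE enum member.
    return bool(font.underline)

def format_text(text: str, bold: bool = False, italic: bool = False, underline: bool = False) -> str:
    """Wrap a run's text in markdown-style emphasis markers."""
    if bold:
        text = f"**{text}**"
    if italic:
        text = f"*{text}*"
    if underline:
        text = f"<u>{text}</u>"
    return text
```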
```diff
@@ -451,41 +471,20 @@ def python_pptx(
                 source_metadata,
                 base_unified_metadata,
             )
-            if len(text_extraction) > 0:
-                extracted_data.append(text_extraction)
-            accumulated_text = []
-
-        if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
-            text_extraction = _construct_text_metadata(
-                presentation,
-                shape,  # may pass None
-                accumulated_text,
-                keywords,
-                -1,
-                -1,
-                -1,
-                -1,
-                slide_count,
-                text_depth,
-                source_metadata,
-                base_unified_metadata,
         )
-            if len(text_extraction) > 0:
-                extracted_data.append(text_extraction)
-            accumulated_text = []
 
-    # ---------------------------------------------
-    # FINAL STEP: Finalize images (and tables/charts)
-    # ---------------------------------------------
     if extract_images or extract_tables or extract_charts:
-        … (8 removed lines not shown)
+        try:
+            _finalize_images(
+                pending_images,
+                extracted_data,
+                pptx_extractor_config,
+                extract_tables=extract_tables,
+                extract_charts=extract_charts,
+                trace_info=trace_info,
+            )
+        except Exception as e:
+            logger.error("Finalization of images failed: %s", e)
 
     return extracted_data
```
nv_ingest_api/internal/transform/split_text.py

```diff
@@ -118,9 +118,15 @@ def transform_text_split_and_tokenize_internal(
     )
 
     # Filter to documents with text content.
-    … (2 removed lines not shown)
-    )
+    text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
+
+    normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
+    if "source_metadata.source_type" in normalized_meta_df.columns:
+        source_type_condition = normalized_meta_df["source_metadata.source_type"].isin(split_source_types)
+    else:
+        source_type_condition = False
+
+    bool_index = text_type_condition & source_type_condition
     df_filtered: pd.DataFrame = df_transform_ledger.loc[bool_index]
 
     if df_filtered.empty:
```
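The new filter flattens each row's nested `metadata` dict with `pd.json_normalize`, so the dotted column `source_metadata.source_type` can be tested directly. A self-contained toy version (the ledger contents and `split_source_types` are illustrative; the real code compares `document_type` against `ContentTypeEnum.TEXT`):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "document_type": ["text", "image", "text"],
        "metadata": [
            {"source_metadata": {"source_type": "pdf"}},
            {"source_metadata": {"source_type": "pdf"}},
            {"source_metadata": {"source_type": "html"}},
        ],
    }
)
split_source_types = ["pdf"]

text_cond = df["document_type"] == "text"
meta = pd.json_normalize(df["metadata"].tolist())  # yields "source_metadata.source_type"
src_cond = meta["source_metadata.source_type"].isin(split_source_types)
print(df.loc[text_cond & src_cond])  # keeps only the first row
```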
{nv_ingest_api-2025.5.18.dev20250518.dist-info → nv_ingest_api-2025.5.19.dev20250519.dist-info}/RECORD

```diff
@@ -16,7 +16,7 @@ nv_ingest_api/internal/extract/docx/docx_extractor.py,sha256=jjbL12F5dtpbqHRbhL0…
 nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
 nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
-nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=…
+nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=FOZZBD9gRRAr93qgK_L6o9xVBYD-6EE5-xI2-cWKvzo,33713
 nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
 nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
@@ -37,7 +37,7 @@ nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ…
 nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
 nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
 nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py,sha256=…
+nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py,sha256=IZu0c_RHDSJwwclOZD3_tDu5jg4AEEfumbwKB78dUE0,29716
 nv_ingest_api/internal/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/mutate/deduplicate.py,sha256=hmvTTGevpCtlkM_wVZSoc8-Exr6rUJwqLjoEnbPcPzY,3849
 nv_ingest_api/internal/mutate/filter.py,sha256=H-hOTBVP-zLpvQr-FoGIJKxkhtj4l_sZ9V2Fgu3rTEM,5183
@@ -97,7 +97,7 @@ nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9…
 nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/transform/caption_image.py,sha256=RYL_b26zfaRlbHz0XvLw9HwaMlXpNhr7gayjxGzdALQ,8545
 nv_ingest_api/internal/transform/embed_text.py,sha256=F8kg-WXihtuUMwDQUUYjnfGDCdQp1Mkd-jeThOiJT0s,16507
-nv_ingest_api/internal/transform/split_text.py,sha256=…
+nv_ingest_api/internal/transform/split_text.py,sha256=DlVoyHLqZ-6_FiWwZmofPcq7TX8Ta23hIE0St9tw1IY,6822
 nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/control_message/validators.py,sha256=KvvbyheJ5rbzvJbH9JKpMR9VfoI0b0uM6eTAZte1p44,1315
@@ -147,8 +147,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS…
 nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
-nv_ingest_api-2025.5.…
-nv_ingest_api-2025.5.…
-nv_ingest_api-2025.5.…
-nv_ingest_api-2025.5.…
-nv_ingest_api-2025.5.…
+nv_ingest_api-2025.5.19.dev20250519.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.5.19.dev20250519.dist-info/METADATA,sha256=LF2uw9E7zhD2ylp4pRazX1C53VqDPN3FOO4NVrLXGe8,13889
+nv_ingest_api-2025.5.19.dev20250519.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+nv_ingest_api-2025.5.19.dev20250519.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+nv_ingest_api-2025.5.19.dev20250519.dist-info/RECORD,,
```