nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
- nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
- nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
- nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
- nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +214 -188
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
- nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
- nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
- nv_ingest_api/internal/store/image_upload.py +1 -0
- nv_ingest_api/internal/transform/embed_text.py +75 -52
- nv_ingest_api/internal/transform/split_text.py +9 -3
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/exception_handlers/converters.py +1 -1
- nv_ingest_api/util/exception_handlers/decorators.py +309 -51
- nv_ingest_api/util/image_processing/processing.py +1 -1
- nv_ingest_api/util/logging/configuration.py +15 -8
- nv_ingest_api/util/pdf/pdfium.py +2 -2
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
- nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +430 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/RECORD +46 -41
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/WHEEL +1 -1
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py (+3 -3):

@@ -7,7 +7,7 @@ import base64
 import functools
 import io
 import logging
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, Tuple
 
 import pandas as pd
 from pydantic import BaseModel
@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
     task_config: Union[Dict[str, Any], BaseModel],
     extraction_config: DocxExtractorSchema,
     execution_trace_log: Optional[Dict[str, Any]] = None,
-) -> pd.DataFrame:
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
     """
     Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
     each document and replacing the original content with the extracted text.
@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
     else:
         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
 
-    return extracted_df
+    return extracted_df, {}
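The entry point now returns a (DataFrame, trace-dict) pair instead of a bare DataFrame, so callers unpack two values. A minimal sketch of the updated calling convention; `df_ledger`, `task_config`, and `extraction_config` are placeholders for values a caller already has, not objects defined in this diff:

```python
from nv_ingest_api.internal.extract.docx.docx_extractor import (
    extract_primitives_from_docx_internal,
)

# 25.4.2 returned a bare DataFrame; 25.6.1 returns a (DataFrame, dict) tuple,
# where the second element is an empty dict in the code shown above.
extracted_df, execution_trace = extract_primitives_from_docx_internal(
    df_ledger, task_config, extraction_config
)
```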
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py (+142 -86):

@@ -274,59 +274,70 @@ class DocxReader:
         - A list of extracted images from the paragraph.
         """
 
-        [5 removed lines not captured in the source view]
+        try:
+            paragraph_images = []
+            if self.paragraph_format == "text":
+                return paragraph.text.strip(), paragraph_images
+
             font = paragraph.style.font
             default_style = (font.bold, font.italic, font.underline)
 
-            # Iterate over the runs of the paragraph and group them by style, excluding empty runs
             paragraph_text = ""
             group_text = ""
             previous_style = None
 
             for c in paragraph.iter_inner_content():
-                [25 removed lines not captured in the source view]
+                try:
+                    if isinstance(c, Hyperlink):
+                        text = f"[{c.text}]({c.address})"
+                        style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
+                    elif isinstance(c, Run):
+                        text = c.text
+                        style = (c.bold, c.italic, c.underline)
+
+                        # 1. Locate the inline shape which is stored in the <w:drawing> element.
+                        # 2. r:embed in <a.blip> has the relationship id for extracting the file where
+                        # the image is stored as bytes.
+                        # Reference:
+                        # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
+                        inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
+                        for r_id in inline_shapes:
+                            text += self.image_tag.format(self.image_tag_index)
+                            self.image_tag_index += 1
+                            try:
+                                image = paragraph.part.related_parts[r_id].image
+                                paragraph_images.append(image)
+                            except Exception as img_e:
+                                logger.warning(
+                                    "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
+                                    r_id,
+                                    img_e,
+                                )
+                    else:
+                        continue
+
+                    style = tuple(s if s is not None else d for s, d in zip(style, default_style))
+
+                    if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
                         paragraph_text += self.format_text(group_text, *previous_style)
                         group_text = ""
 
-                    [3 removed lines not captured in the source view]
+                    group_text += text
+                    if not self.is_text_empty(text):
+                        previous_style = style
 
-                [3 removed lines not captured in the source view]
+                except Exception as e:
+                    logger.error("format_paragraph: failed to process run: %s", e)
+                    continue
+
+            if group_text and previous_style:
+                paragraph_text += self.format_text(group_text, *previous_style)
+
+            return paragraph_text.strip(), paragraph_images
 
-        [3 removed lines not captured in the source view]
+        except Exception as e:
+            logger.error("format_paragraph: failed for paragraph: %s", e)
+            return "", []
 
     def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
         """
@@ -344,12 +355,23 @@
         - A list of images extracted from the cell.
         """
 
-        newline = "<br>"
-        [5 more removed lines not captured in the source view]
+        try:
+            newline = "<br>" if self.paragraph_format == "markdown" else "\n"
+            texts, images = [], []
+
+            for p in cell.paragraphs:
+                try:
+                    t, imgs = self.format_paragraph(p)
+                    texts.append(t)
+                    images.extend(imgs)
+                except Exception as e:
+                    logger.error("format_cell: failed to format paragraph in cell: %s", e)
+
+            return newline.join(texts), images
+
+        except Exception as e:
+            logger.error("format_cell: failed entirely: %s", e)
+            return "", []
 
     def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
         """
@@ -368,25 +390,50 @@
         - A DataFrame representation of the table's content.
         """
 
-        [17 removed lines not captured in the source view]
+        try:
+            rows_data = []
+            all_images = []
+
+            for row in table.rows:
+                row_texts = []
+                row_images = []
+                for cell in row.cells:
+                    try:
+                        cell_text, cell_imgs = self.format_cell(cell)
+                        row_texts.append(cell_text)
+                        row_images.extend(cell_imgs)
+                    except Exception as e:
+                        logger.error("format_table: failed to process cell: %s", e)
+                        row_texts.append("")  # pad for column alignment
+
+                rows_data.append(row_texts)
+                all_images.extend(row_images)
+
+            if not rows_data or not rows_data[0]:
+                return None, [], pd.DataFrame()
+
+            header = rows_data[0]
+            body = rows_data[1:]
+            df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
+
+            if "markdown" in self.table_format:
+                table_text = df.to_markdown(index=False)
+                if self.table_format == "markdown_light":
+                    table_text = re.sub(r"\s{2,}", " ", table_text)
+                    table_text = re.sub(r"-{2,}", "-", table_text)
+            elif self.table_format == "csv":
+                table_text = df.to_csv(index=False)
+            elif self.table_format == "tag":
+                table_text = self.table_tag.format(self.table_tag_index)
+                self.table_tag_index += 1
+            else:
+                raise ValueError(f"Unknown table format {self.table_format}")
+
+            return table_text, all_images, df
 
-        [1 removed line not captured in the source view]
+        except Exception as e:
+            logger.error("format_table: failed to format table: %s", e)
+            return None, [], pd.DataFrame()
 
     @staticmethod
     def apply_text_style(style: str, text: str, level: int = 0) -> str:
@@ -841,30 +888,39 @@ class DocxReader:
         self._prev_para_image_idx = 0
 
         para_idx = 0
-        [1 removed line not captured in the source view]
         for child in self.document.element.body.iterchildren():
-            [18 removed lines not captured in the source view]
+            try:
+                if isinstance(child, CT_P):
+                    paragraph = Paragraph(child, self.document)
+                    paragraph_text, paragraph_images = self.format_paragraph(paragraph)
+
+                    if extract_text:
+                        try:
+                            self._extract_para_text(
+                                paragraph,
+                                paragraph_text,
+                                base_unified_metadata,
+                                text_depth,
+                                para_idx,
+                            )
+                        except Exception as e:
+                            logger.error("extract_data: _extract_para_text failed: %s", e)
+
+                    if (extract_images or extract_charts or extract_tables) and paragraph_images:
+                        self._pending_images += [
+                            (image, para_idx, "", base_unified_metadata) for image in paragraph_images
+                        ]
+                        self.images.extend(paragraph_images)
+
+                elif isinstance(child, CT_Tbl):
+                    if extract_tables or extract_charts:
+                        try:
+                            self._extract_table_data(child, base_unified_metadata)
+                        except Exception as e:
+                            logger.error("extract_data: _extract_table_data failed: %s", e)
 
-            [2 removed lines not captured in the source view]
-            self._extract_table_data(child, base_unified_metadata)
+            except Exception as e:
+                logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
 
             para_idx += 1
 
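A note on the markdown_light branch above: it only collapses the alignment padding that DataFrame.to_markdown emits. A standalone sketch of that post-processing step, with invented sample data:

```python
import re

import pandas as pd

df = pd.DataFrame({"name": ["alpha", "b"], "value": [1, 22]})
table_text = df.to_markdown(index=False)  # uses the tabulate package under the hood

# Collapse the runs of spaces and dashes that to_markdown inserts for column alignment.
table_text = re.sub(r"\s{2,}", " ", table_text)
table_text = re.sub(r"-{2,}", "-", table_text)
print(table_text)
# | name | value |
# |:-|-:|
# | alpha | 1 |
# | b | 22 |
```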
nv_ingest_api/internal/extract/html/html_extractor.py (new file, +84):

@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+import uuid
+from typing import Optional, Dict, Any, Union, Tuple, List
+
+import pandas as pd
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
+from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+from nv_ingest_api.util.schema.schema_validator import validate_schema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+from markitdown.converters import HtmlConverter
+
+logger = logging.getLogger(__name__)
+
+
+@unified_exception_handler
+def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
+    metadata = row.get("metadata")
+    html_content = row.get("content")
+
+    if html_content:
+        html_converter = HtmlConverter()
+        md_content = html_converter.convert_string(html_content=html_content).text_content
+        metadata["content"] = md_content
+
+    return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
+
+
+def extract_markdown_from_html_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    extraction_config: HtmlExtractorSchema,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> Tuple[pd.DataFrame, Union[Dict, None]]:
+    """
+    Processes a pandas DataFrame containing HTML file content, extracting html as text from
+    each document and converting it to markdown.
+
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        The input DataFrame containing html files as raw text. Expected columns include
+        'source_id' and 'content'.
+    task_config : Union[Dict[str, Any], BaseModel]
+        Configuration instructions for the document processing task. This can be provided as a
+        dictionary or a Pydantic model.
+    extraction_config : Any
+        A configuration object for document extraction that guides the extraction process.
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        An optional dictionary containing trace information for debugging or logging.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the original html content converted to markdown. The resulting
+        DataFrame contains the columns "document_type", "metadata", and "uuid".
+
+    Raises
+    ------
+    Exception
+        If an error occurs during the document extraction process, the exception is logged and
+        re-raised.
+    """
+
+    # Apply the decode_and_extract function to each row in the DataFrame.
+    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
+
+    # Explode any list results and drop missing values.
+    sr_extraction = sr_extraction.explode().dropna()
+
+    # Convert the extraction results to a DataFrame if available.
+    if not sr_extraction.empty:
+        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
+    else:
+        extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+    return extracted_df, {}
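A minimal usage sketch of the new entry point. The ledger below is invented for illustration: in the real pipeline each row carries a full nv-ingest metadata payload, the bare metadata dict here may not pass MetadataSchema validation, and HtmlExtractorSchema() is assumed to be constructible with defaults:

```python
import pandas as pd

from nv_ingest_api.internal.extract.html.html_extractor import (
    extract_markdown_from_html_internal,
)
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema

# One toy HTML document with a minimal metadata dict.
df_ledger = pd.DataFrame(
    {
        "source_id": ["doc-0"],
        "content": ["<h1>Title</h1><p>Hello <b>world</b>.</p>"],
        "metadata": [{"content": ""}],
    }
)

# Same (DataFrame, dict) return shape as the other extractors in this release.
extracted_df, trace = extract_markdown_from_html_internal(
    df_ledger,
    task_config={},
    extraction_config=HtmlExtractorSchema(),
)
```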
nv_ingest_api/internal/extract/image/chart_extractor.py (+3 -3):

@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
 PADDLE_MIN_WIDTH = 32
 PADDLE_MIN_HEIGHT = 32
 
-logger = logging.getLogger(f"
+logger = logging.getLogger(f"ray.{__name__}")
 
 
 def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )
@@ -88,7 +88,7 @@ def _run_chart_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="chart_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
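The stage_name labels introduced above all feed the same client call shape; a sketch of the pattern, where paddle_client, data_paddle, and trace_info stand for objects created elsewhere in this module (the removed stage_name values are truncated in the source view, so their 25.4.2 form is unknown):

```python
paddle_results = paddle_client.infer(
    data=data_paddle,
    model_name="paddle",
    stage_name="chart_extraction",  # 25.6.1 tags each call with its pipeline stage
    max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
    trace_info=trace_info,
)
```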
nv_ingest_api/internal/extract/image/image_extractor.py (+5 -5):

@@ -16,7 +16,7 @@ import pandas as pd
 from pydantic import BaseModel
 
 from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
-from nv_ingest_api.internal.schemas.extract.extract_image_schema import
+from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 
 logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def _decode_and_extract_from_image(
     base64_row: pd.Series,
     task_config: Dict[str, Any],
-    validated_extraction_config:
+    validated_extraction_config: ImageConfigSchema,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> Any:
     """
@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(
 
     logger.debug(
         f"decode_and_extract: Extracting image content using image_extraction_config: "
-        f"{validated_extraction_config
+        f"{validated_extraction_config}"
     )
-    if validated_extraction_config
-        extract_params["image_extraction_config"] = validated_extraction_config
+    if validated_extraction_config is not None:
+        extract_params["image_extraction_config"] = validated_extraction_config
 
     if execution_trace_log is not None:
         extract_params["trace_info"] = execution_trace_log
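The guard above now compares against None explicitly; the removed condition is truncated in the source view, so its exact 25.4.2 form is unknown. A small sketch of the pattern:

```python
extract_params = {}

# validated_extraction_config may legitimately be None when no image
# extraction config was supplied; an explicit None check attaches any
# non-None config object, even one whose fields all hold falsy defaults.
if validated_extraction_config is not None:
    extract_params["image_extraction_config"] = validated_extraction_config
```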
nv_ingest_api/internal/extract/image/image_helpers/common.py (+1 -1):

@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=trace_info,
-        stage_name="
+        stage_name="pdf_extraction",
     )
 
     # Process each result along with its corresponding image.
nv_ingest_api/internal/extract/image/infographic_extractor.py (+1 -1):

@@ -100,7 +100,7 @@ def _update_infographic_metadata(
     paddle_results = paddle_client.infer(
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="infographic_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/image/table_extractor.py (+2 -2):

@@ -81,7 +81,7 @@ def _run_inference(
         yolox_client.infer,
         data=data_yolox,
         model_name="yolox",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=8,
         trace_info=trace_info,
     )
@@ -89,7 +89,7 @@ def _run_inference(
         paddle_client.infer,
         data=data_paddle,
         model_name="paddle",
-        stage_name="
+        stage_name="table_extraction",
         max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
         trace_info=trace_info,
     )
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py (+2 -2):

@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
     inference_results = nemoretriever_parse_client.infer(
         data=data,
         model_name="nemoretriever_parse",
-        stage_name="
+        stage_name="pdf_extraction",
         max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
         execution_trace_log=execution_trace_log,
     )
@@ -476,7 +476,7 @@ def _extract_text_and_bounding_boxes(
 
 def _create_clients(nemoretriever_parse_config):
     model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
-        model_name=nemoretriever_parse_config.
+        model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
     )
     nemoretriever_parse_client = create_inference_client(
         nemoretriever_parse_config.nemoretriever_parse_endpoints,
nv_ingest_api/internal/extract/pdf/engines/pdfium.py (+1 -1):

@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
         model_name="yolox",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
         trace_info=execution_trace_log,
-        stage_name="
+        stage_name="pdf_extraction",
     )
 
     # Process results: iterate over each image's inference output.