nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic; see the registry listing for details.

Files changed (46)
  1. nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
  2. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
  3. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  4. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  5. nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
  6. nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
  7. nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
  8. nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
  9. nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
  10. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
  11. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
  12. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +214 -188
  13. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
  14. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
  15. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
  16. nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
  17. nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
  18. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
  19. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  20. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
  21. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
  22. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
  23. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
  24. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
  25. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
  26. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
  27. nv_ingest_api/internal/store/image_upload.py +1 -0
  28. nv_ingest_api/internal/transform/embed_text.py +75 -52
  29. nv_ingest_api/internal/transform/split_text.py +9 -3
  30. nv_ingest_api/util/__init__.py +3 -0
  31. nv_ingest_api/util/exception_handlers/converters.py +1 -1
  32. nv_ingest_api/util/exception_handlers/decorators.py +309 -51
  33. nv_ingest_api/util/image_processing/processing.py +1 -1
  34. nv_ingest_api/util/logging/configuration.py +15 -8
  35. nv_ingest_api/util/pdf/pdfium.py +2 -2
  36. nv_ingest_api/util/schema/__init__.py +3 -0
  37. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  38. nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
  39. nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
  40. nv_ingest_api/util/system/__init__.py +0 -0
  41. nv_ingest_api/util/system/hardware_info.py +430 -0
  42. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/METADATA +2 -1
  43. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/RECORD +46 -41
  44. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/WHEEL +1 -1
  45. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/licenses/LICENSE +0 -0
  46. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.1.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py

@@ -7,7 +7,7 @@ import base64
  import functools
  import io
  import logging
- from typing import Optional, Dict, Any, Union
+ from typing import Optional, Dict, Any, Union, Tuple

  import pandas as pd
  from pydantic import BaseModel
@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
  task_config: Union[Dict[str, Any], BaseModel],
  extraction_config: DocxExtractorSchema,
  execution_trace_log: Optional[Dict[str, Any]] = None,
- ) -> pd.DataFrame:
+ ) -> Tuple[pd.DataFrame, Union[Dict, None]]:
  """
  Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
  each document and replacing the original content with the extracted text.
@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
  else:
  extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

- return extracted_df
+ return extracted_df, {}

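In 25.6.1 the internal extractor entry points return a (DataFrame, trace-dict) pair instead of a bare DataFrame, so call sites now unpack two values. A minimal sketch of the changed call, assuming the ledger and configuration objects are prepared exactly as they were for 25.4.2 (the variable names are illustrative, not part of the API):

    # Sketch only: df_ledger, task_config, and docx_config are assumed to be
    # prepared the same way as for 25.4.2; only the return shape of the call changed.
    extracted_df, trace = extract_primitives_from_docx_internal(
        df_ledger,
        task_config=task_config,
        extraction_config=docx_config,  # DocxExtractorSchema instance
        execution_trace_log=None,
    )
    # 25.4.2 returned just the DataFrame; 25.6.1 returns it plus a dict
    # (an empty dict in the hunk above).
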
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py

@@ -274,59 +274,70 @@ class DocxReader:
  - A list of extracted images from the paragraph.
  """

- paragraph_images = []
- if self.paragraph_format == "text":
- paragraph_text = paragraph.text
- else:
- # Get the default style of the paragraph, "markdown"
+ try:
+ paragraph_images = []
+ if self.paragraph_format == "text":
+ return paragraph.text.strip(), paragraph_images
+
  font = paragraph.style.font
  default_style = (font.bold, font.italic, font.underline)

- # Iterate over the runs of the paragraph and group them by style, excluding empty runs
  paragraph_text = ""
  group_text = ""
  previous_style = None

  for c in paragraph.iter_inner_content():
- if isinstance(c, Hyperlink):
- text = f"[{c.text}]({c.address})"
- style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
- elif isinstance(c, Run):
- text = c.text
- style = (c.bold, c.italic, c.underline)
- # 1. Locate the inline shape which is stored in the <w:drawing> element.
- # 2. r:embed in <a.blip> has the relationship id for extracting the file where
- # the image is stored as bytes.
- # Reference:
- # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
- inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
- for r_id in inline_shapes:
- text += self.image_tag.format(self.image_tag_index)
- self.image_tag_index += 1
- image = paragraph.part.related_parts[r_id].image
- paragraph_images.append(image)
- else:
- continue
-
- style = tuple([s if s is not None else d for s, d in zip(style, default_style)])
-
- # If the style changes for a non empty text, format the previous group and start a new one
- if (not self.is_text_empty(text)) and (previous_style is not None):
- if style != previous_style:
+ try:
+ if isinstance(c, Hyperlink):
+ text = f"[{c.text}]({c.address})"
+ style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
+ elif isinstance(c, Run):
+ text = c.text
+ style = (c.bold, c.italic, c.underline)
+
+ # 1. Locate the inline shape which is stored in the <w:drawing> element.
+ # 2. r:embed in <a.blip> has the relationship id for extracting the file where
+ # the image is stored as bytes.
+ # Reference:
+ # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
+ inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
+ for r_id in inline_shapes:
+ text += self.image_tag.format(self.image_tag_index)
+ self.image_tag_index += 1
+ try:
+ image = paragraph.part.related_parts[r_id].image
+ paragraph_images.append(image)
+ except Exception as img_e:
+ logger.warning(
+ "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
+ r_id,
+ img_e,
+ )
+ else:
+ continue
+
+ style = tuple(s if s is not None else d for s, d in zip(style, default_style))
+
+ if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
  paragraph_text += self.format_text(group_text, *previous_style)
  group_text = ""

- group_text += text
- if not self.is_text_empty(text):
- previous_style = style
+ group_text += text
+ if not self.is_text_empty(text):
+ previous_style = style

- # Format the last group
- if group_text:
- paragraph_text += self.format_text(group_text, *style)
+ except Exception as e:
+ logger.error("format_paragraph: failed to process run: %s", e)
+ continue
+
+ if group_text and previous_style:
+ paragraph_text += self.format_text(group_text, *previous_style)
+
+ return paragraph_text.strip(), paragraph_images

- # Remove trailing spaces
- paragraph_text = paragraph_text.strip()
- return paragraph_text, paragraph_images
+ except Exception as e:
+ logger.error("format_paragraph: failed for paragraph: %s", e)
+ return "", []

  def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
  """
@@ -344,12 +355,23 @@ class DocxReader:
  - A list of images extracted from the cell.
  """

- if self.paragraph_format == "markdown":
- newline = "<br>"
- else:
- newline = "\n"
- paragraph_texts, paragraph_images = zip(*[self.format_paragraph(p) for p in cell.paragraphs])
- return newline.join(paragraph_texts), paragraph_images
+ try:
+ newline = "<br>" if self.paragraph_format == "markdown" else "\n"
+ texts, images = [], []
+
+ for p in cell.paragraphs:
+ try:
+ t, imgs = self.format_paragraph(p)
+ texts.append(t)
+ images.extend(imgs)
+ except Exception as e:
+ logger.error("format_cell: failed to format paragraph in cell: %s", e)
+
+ return newline.join(texts), images
+
+ except Exception as e:
+ logger.error("format_cell: failed entirely: %s", e)
+ return "", []

  def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
  """
@@ -368,25 +390,50 @@ class DocxReader:
  - A DataFrame representation of the table's content.
  """

- rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
- texts = [[text for text, _ in row] for row in rows]
- table_images = [image for row in rows for _, images in row for image in images]
-
- table = pd.DataFrame(texts[1:], columns=texts[0])
- if "markdown" in self.table_format:
- table_text = table.to_markdown(index=False)
- if self.table_format == "markdown_light":
- table_text = re.sub(r"\s{2,}", " ", table_text)
- table_text = re.sub(r"-{2,}", "-", table_text)
- elif self.table_format == "csv":
- table_text = table.to_csv()
- elif self.table_format == "tag":
- table_text = self.table_tag.format(self.table_tag_index)
- self.table_tag_index += 1
- else:
- raise ValueError(f"Unknown table format {format}")
+ try:
+ rows_data = []
+ all_images = []
+
+ for row in table.rows:
+ row_texts = []
+ row_images = []
+ for cell in row.cells:
+ try:
+ cell_text, cell_imgs = self.format_cell(cell)
+ row_texts.append(cell_text)
+ row_images.extend(cell_imgs)
+ except Exception as e:
+ logger.error("format_table: failed to process cell: %s", e)
+ row_texts.append("") # pad for column alignment
+
+ rows_data.append(row_texts)
+ all_images.extend(row_images)
+
+ if not rows_data or not rows_data[0]:
+ return None, [], pd.DataFrame()
+
+ header = rows_data[0]
+ body = rows_data[1:]
+ df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
+
+ if "markdown" in self.table_format:
+ table_text = df.to_markdown(index=False)
+ if self.table_format == "markdown_light":
+ table_text = re.sub(r"\s{2,}", " ", table_text)
+ table_text = re.sub(r"-{2,}", "-", table_text)
+ elif self.table_format == "csv":
+ table_text = df.to_csv(index=False)
+ elif self.table_format == "tag":
+ table_text = self.table_tag.format(self.table_tag_index)
+ self.table_tag_index += 1
+ else:
+ raise ValueError(f"Unknown table format {self.table_format}")
+
+ return table_text, all_images, df

- return table_text, table_images, table
+ except Exception as e:
+ logger.error("format_table: failed to format table: %s", e)
+ return None, [], pd.DataFrame()

  @staticmethod
  def apply_text_style(style: str, text: str, level: int = 0) -> str:

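The markdown_light branch above condenses the markdown rendering of a table by collapsing runs of whitespace and dashes. A small, self-contained sketch of its effect, using the same two re.sub calls as the new code (the sample DataFrame is illustrative):

    import re
    import pandas as pd

    df = pd.DataFrame({"name": ["alpha", "beta"], "value": [1, 22]})
    table_text = df.to_markdown(index=False)          # standard markdown table
    table_text = re.sub(r"\s{2,}", " ", table_text)   # collapse runs of spaces
    table_text = re.sub(r"-{2,}", "-", table_text)    # collapse separator dashes
    print(table_text)                                 # compact "markdown_light" form
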
@@ -841,30 +888,39 @@ class DocxReader:
  self._prev_para_image_idx = 0

  para_idx = 0
-
  for child in self.document.element.body.iterchildren():
- if isinstance(child, CT_P):
- paragraph = Paragraph(child, self.document)
- paragraph_text, paragraph_images = self.format_paragraph(paragraph)
-
- if extract_text:
- self._extract_para_text(
- paragraph,
- paragraph_text,
- base_unified_metadata,
- text_depth,
- para_idx,
- )
-
- if (extract_charts or extract_images or extract_tables) and paragraph_images:
- self._prev_para_images = paragraph_images
- self._prev_para_image_idx = para_idx
- self._pending_images += [(image, para_idx, "", base_unified_metadata) for image in paragraph_images]
- self.images += paragraph_images
+ try:
+ if isinstance(child, CT_P):
+ paragraph = Paragraph(child, self.document)
+ paragraph_text, paragraph_images = self.format_paragraph(paragraph)
+
+ if extract_text:
+ try:
+ self._extract_para_text(
+ paragraph,
+ paragraph_text,
+ base_unified_metadata,
+ text_depth,
+ para_idx,
+ )
+ except Exception as e:
+ logger.error("extract_data: _extract_para_text failed: %s", e)
+
+ if (extract_images or extract_charts or extract_tables) and paragraph_images:
+ self._pending_images += [
+ (image, para_idx, "", base_unified_metadata) for image in paragraph_images
+ ]
+ self.images.extend(paragraph_images)
+
+ elif isinstance(child, CT_Tbl):
+ if extract_tables or extract_charts:
+ try:
+ self._extract_table_data(child, base_unified_metadata)
+ except Exception as e:
+ logger.error("extract_data: _extract_table_data failed: %s", e)

- elif isinstance(child, CT_Tbl):
- if extract_tables or extract_charts:
- self._extract_table_data(child, base_unified_metadata)
+ except Exception as e:
+ logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)

  para_idx += 1

nv_ingest_api/internal/extract/html/__init__.py

@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/extract/html/html_extractor.py

@@ -0,0 +1,84 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+
+ import logging
+ import uuid
+ from typing import Optional, Dict, Any, Union, Tuple, List
+
+ import pandas as pd
+
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+ from markitdown.converters import HtmlConverter
+
+ logger = logging.getLogger(__name__)
+
+
+ @unified_exception_handler
+ def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
+ metadata = row.get("metadata")
+ html_content = row.get("content")
+
+ if html_content:
+ html_converter = HtmlConverter()
+ md_content = html_converter.convert_string(html_content=html_content).text_content
+ metadata["content"] = md_content
+
+ return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
+
+
+ def extract_markdown_from_html_internal(
+ df_extraction_ledger: pd.DataFrame,
+ task_config: Dict[str, Any],
+ extraction_config: HtmlExtractorSchema,
+ execution_trace_log: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[pd.DataFrame, Union[Dict, None]]:
+ """
+ Processes a pandas DataFrame containing HTML file content, extracting html as text from
+ each document and converting it to markdown.
+
+ Parameters
+ ----------
+ df_extraction_ledger : pd.DataFrame
+ The input DataFrame containing html files as raw text. Expected columns include
+ 'source_id' and 'content'.
+ task_config : Union[Dict[str, Any], BaseModel]
+ Configuration instructions for the document processing task. This can be provided as a
+ dictionary or a Pydantic model.
+ extraction_config : Any
+ A configuration object for document extraction that guides the extraction process.
+ execution_trace_log : Optional[Dict[str, Any]], default=None
+ An optional dictionary containing trace information for debugging or logging.
+
+ Returns
+ -------
+ pd.DataFrame
+ A DataFrame with the original html content converted to markdown. The resulting
+ DataFrame contains the columns "document_type", "metadata", and "uuid".
+
+ Raises
+ ------
+ Exception
+ If an error occurs during the document extraction process, the exception is logged and
+ re-raised.
+ """
+
+ # Apply the decode_and_extract function to each row in the DataFrame.
+ sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
+
+ # Explode any list results and drop missing values.
+ sr_extraction = sr_extraction.explode().dropna()
+
+ # Convert the extraction results to a DataFrame if available.
+ if not sr_extraction.empty:
+ extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
+ else:
+ extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+ return extracted_df, {}

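The new HTML extractor delegates the HTML-to-markdown conversion to markitdown's HtmlConverter, as in _convert_html above. A minimal sketch of that conversion step in isolation (assumes the markitdown package is installed; the call chain mirrors the one in the new module):

    from markitdown.converters import HtmlConverter

    html = "<h1>Title</h1><p>Some <b>bold</b> text and a <a href='https://example.com'>link</a>.</p>"
    # convert_string(...).text_content is the same call chain used by _convert_html
    markdown = HtmlConverter().convert_string(html_content=html).text_content
    print(markdown)
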
nv_ingest_api/internal/extract/image/chart_extractor.py

@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
  PADDLE_MIN_WIDTH = 32
  PADDLE_MIN_HEIGHT = 32

- logger = logging.getLogger(f"morpheus.{__name__}")
+ logger = logging.getLogger(f"ray.{__name__}")


  def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
  yolox_client.infer,
  data=data_yolox,
  model_name="yolox",
- stage_name="chart_data_extraction",
+ stage_name="chart_extraction",
  max_batch_size=8,
  trace_info=trace_info,
  )
@@ -88,7 +88,7 @@ def _run_chart_inference(
  paddle_client.infer,
  data=data_paddle,
  model_name="paddle",
- stage_name="chart_data_extraction",
+ stage_name="chart_extraction",
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
  trace_info=trace_info,
  )

nv_ingest_api/internal/extract/image/image_extractor.py

@@ -16,7 +16,7 @@ import pandas as pd
  from pydantic import BaseModel

  from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
- from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

  logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
  def _decode_and_extract_from_image(
  base64_row: pd.Series,
  task_config: Dict[str, Any],
- validated_extraction_config: ImageExtractorSchema,
+ validated_extraction_config: ImageConfigSchema,
  execution_trace_log: Optional[List[Any]] = None,
  ) -> Any:
  """
@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(

  logger.debug(
  f"decode_and_extract: Extracting image content using image_extraction_config: "
- f"{validated_extraction_config.image_extraction_config}"
+ f"{validated_extraction_config}"
  )
- if validated_extraction_config.image_extraction_config is not None:
- extract_params["image_extraction_config"] = validated_extraction_config.image_extraction_config
+ if validated_extraction_config is not None:
+ extract_params["image_extraction_config"] = validated_extraction_config

  if execution_trace_log is not None:
  extract_params["trace_info"] = execution_trace_log

nv_ingest_api/internal/extract/image/image_helpers/common.py

@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
  model_name="yolox",
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
  trace_info=trace_info,
- stage_name="pdf_content_extractor",
+ stage_name="pdf_extraction",
  )

  # Process each result along with its corresponding image.

nv_ingest_api/internal/extract/image/infographic_extractor.py

@@ -100,7 +100,7 @@ def _update_infographic_metadata(
  paddle_results = paddle_client.infer(
  data=data_paddle,
  model_name="paddle",
- stage_name="infographic_data_extraction",
+ stage_name="infographic_extraction",
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
  trace_info=trace_info,
  )

nv_ingest_api/internal/extract/image/table_extractor.py

@@ -81,7 +81,7 @@ def _run_inference(
  yolox_client.infer,
  data=data_yolox,
  model_name="yolox",
- stage_name="table_data_extraction",
+ stage_name="table_extraction",
  max_batch_size=8,
  trace_info=trace_info,
  )
@@ -89,7 +89,7 @@ def _run_inference(
  paddle_client.infer,
  data=data_paddle,
  model_name="paddle",
- stage_name="table_data_extraction",
+ stage_name="table_extraction",
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
  trace_info=trace_info,
  )

nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
  inference_results = nemoretriever_parse_client.infer(
  data=data,
  model_name="nemoretriever_parse",
- stage_name="pdf_content_extractor",
+ stage_name="pdf_extraction",
  max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
  execution_trace_log=execution_trace_log,
  )
@@ -476,7 +476,7 @@

  def _create_clients(nemoretriever_parse_config):
  model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
- model_name=nemoretriever_parse_config.model_name,
+ model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
  )
  nemoretriever_parse_client = create_inference_client(
  nemoretriever_parse_config.nemoretriever_parse_endpoints,

nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
  model_name="yolox",
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
  trace_info=execution_trace_log,
- stage_name="pdf_content_extractor",
+ stage_name="pdf_extraction",
  )

  # Process results: iterate over each image's inference output.