nv-ingest-api 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (33)
  1. nv_ingest_api/interface/extract.py +18 -18
  2. nv_ingest_api/internal/enums/common.py +6 -0
  3. nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
  4. nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
  5. nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
  6. nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
  7. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +9 -8
  8. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
  9. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +40 -29
  10. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
  11. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +1 -0
  12. nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
  13. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +44 -236
  14. nv_ingest_api/internal/primitives/nim/nim_client.py +61 -18
  15. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
  16. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
  17. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
  18. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
  19. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +2 -2
  20. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  21. nv_ingest_api/internal/transform/embed_text.py +103 -12
  22. nv_ingest_api/internal/transform/split_text.py +13 -8
  23. nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
  24. nv_ingest_api/util/image_processing/transforms.py +351 -87
  25. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
  26. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
  27. nv_ingest_api/util/metadata/aggregators.py +4 -1
  28. nv_ingest_api/util/pdf/pdfium.py +6 -14
  29. {nv_ingest_api-25.7.6.dev20250706.dist-info → nv_ingest_api-25.8.0rc1.dist-info}/METADATA +2 -1
  30. {nv_ingest_api-25.7.6.dev20250706.dist-info → nv_ingest_api-25.8.0rc1.dist-info}/RECORD +33 -33
  31. {nv_ingest_api-25.7.6.dev20250706.dist-info → nv_ingest_api-25.8.0rc1.dist-info}/WHEEL +0 -0
  32. {nv_ingest_api-25.7.6.dev20250706.dist-info → nv_ingest_api-25.8.0rc1.dist-info}/licenses/LICENSE +0 -0
  33. {nv_ingest_api-25.7.6.dev20250706.dist-info → nv_ingest_api-25.8.0rc1.dist-info}/top_level.txt +0 -0
@@ -15,10 +15,11 @@ import pandas as pd
15
15
 
16
16
  from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
17
17
  from nv_ingest_api.internal.enums.common import TableFormatEnum
18
- from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
18
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
19
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
19
20
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
20
- from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
21
- from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
21
+ from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
22
+ from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
22
23
  from nv_ingest_api.internal.primitives.nim import NimClient
23
24
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
24
25
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.n
60
61
  def _run_inference(
61
62
  enable_yolox: bool,
62
63
  yolox_client: Any,
63
- paddle_client: Any,
64
+ ocr_client: Any,
65
+ ocr_model_name: str,
64
66
  valid_arrays: List[np.ndarray],
65
67
  valid_images: List[str],
66
68
  trace_info: Optional[Dict] = None,
@@ -68,32 +70,45 @@ def _run_inference(
68
70
  """
69
71
  Run inference concurrently for YOLOX (if enabled) and Paddle.
70
72
 
71
- Returns a tuple of (yolox_results, paddle_results).
73
+ Returns a tuple of (yolox_results, ocr_results).
72
74
  """
73
- data_paddle = {"base64_images": valid_images}
75
+ data_ocr = {"base64_images": valid_images}
74
76
  if enable_yolox:
75
77
  data_yolox = {"images": valid_arrays}
76
-
77
- with ThreadPoolExecutor(max_workers=2) as executor:
78
- future_yolox = None
79
- if enable_yolox:
80
- future_yolox = executor.submit(
81
- yolox_client.infer,
82
- data=data_yolox,
83
- model_name="yolox",
84
- stage_name="table_extraction",
85
- max_batch_size=8,
86
- trace_info=trace_info,
87
- )
88
- future_paddle = executor.submit(
89
- paddle_client.infer,
90
- data=data_paddle,
91
- model_name="paddle",
78
+ future_yolox_kwargs = dict(
79
+ data=data_yolox,
80
+ model_name="yolox_ensemble",
92
81
  stage_name="table_extraction",
93
- max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
82
+ max_batch_size=8,
83
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
84
+ dtypes=["BYTES", "FP32"],
85
+ output_names=["OUTPUT"],
94
86
  trace_info=trace_info,
95
87
  )
96
88
 
89
+ future_ocr_kwargs = dict(
90
+ data=data_ocr,
91
+ stage_name="table_extraction",
92
+ max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
93
+ trace_info=trace_info,
94
+ )
95
+ if ocr_model_name == "paddle":
96
+ future_ocr_kwargs.update(
97
+ model_name="paddle",
98
+ )
99
+ else:
100
+ future_ocr_kwargs.update(
101
+ model_name="scene_text",
102
+ input_names=["input", "merge_levels"],
103
+ dtypes=["FP32", "BYTES"],
104
+ merge_level="word",
105
+ )
106
+
107
+ with ThreadPoolExecutor(max_workers=2) as executor:
108
+ future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
109
+ future_yolox = None
110
+ if enable_yolox:
111
+ future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
97
112
  if enable_yolox:
98
113
  try:
99
114
  yolox_results = future_yolox.result()
@@ -104,17 +119,17 @@ def _run_inference(
104
119
  yolox_results = [None] * len(valid_images)
105
120
 
106
121
  try:
107
- paddle_results = future_paddle.result()
122
+ ocr_results = future_ocr.result()
108
123
  except Exception as e:
109
- logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
124
+ logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
110
125
  raise
111
126
 
112
- return yolox_results, paddle_results
127
+ return yolox_results, ocr_results
113
128
 
114
129
 
115
130
  def _validate_inference_results(
116
131
  yolox_results: Any,
117
- paddle_results: Any,
132
+ ocr_results: Any,
118
133
  valid_arrays: List[Any],
119
134
  valid_images: List[str],
120
135
  ) -> Tuple[List[Any], List[Any]]:
@@ -123,46 +138,47 @@ def _validate_inference_results(
123
138
 
124
139
  If not, default values are assigned. Raises a ValueError if the lengths do not match.
125
140
  """
126
- if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
141
+ if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
127
142
  logger.warning(
128
- "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
143
+ "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
129
144
  "Proceeding with available results.",
130
145
  type(yolox_results).__name__,
131
- type(paddle_results).__name__,
146
+ type(ocr_results).__name__,
132
147
  )
133
148
  if not isinstance(yolox_results, list):
134
149
  yolox_results = [None] * len(valid_arrays)
135
- if not isinstance(paddle_results, list):
136
- paddle_results = [(None, None)] * len(valid_images)
150
+ if not isinstance(ocr_results, list):
151
+ ocr_results = [(None, None)] * len(valid_images)
137
152
 
138
153
  if len(yolox_results) != len(valid_arrays):
139
154
  raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
140
- if len(paddle_results) != len(valid_images):
141
- raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
155
+ if len(ocr_results) != len(valid_images):
156
+ raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
142
157
 
143
- return yolox_results, paddle_results
158
+ return yolox_results, ocr_results
144
159
 
145
160
 
146
161
  def _update_table_metadata(
147
162
  base64_images: List[str],
148
163
  yolox_client: Any,
149
- paddle_client: Any,
164
+ ocr_client: Any,
165
+ ocr_model_name: str,
150
166
  worker_pool_size: int = 8, # Not currently used
151
167
  enable_yolox: bool = False,
152
168
  trace_info: Optional[Dict] = None,
153
169
  ) -> List[Tuple[str, Any, Any, Any]]:
154
170
  """
155
171
  Given a list of base64-encoded images, this function filters out images that do not meet
156
- the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
172
+ the minimum size requirements and then calls the OCR model via ocr_client.infer
157
173
  to extract table data.
158
174
 
159
175
  For each base64-encoded image, the result is a tuple:
160
- (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
176
+ (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)
161
177
 
162
178
  Images that do not meet the minimum size are skipped (resulting in placeholders).
163
- The paddle_client is expected to handle any necessary batching and concurrency.
179
+ The ocr_client is expected to handle any necessary batching and concurrency.
164
180
  """
165
- logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
181
+ logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")
166
182
 
167
183
  # Initialize the results list with default placeholders.
168
184
  results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
@@ -174,24 +190,23 @@ def _update_table_metadata(
174
190
  return results
175
191
 
176
192
  # Run inference concurrently.
177
- yolox_results, paddle_results = _run_inference(
193
+ yolox_results, ocr_results = _run_inference(
178
194
  enable_yolox=enable_yolox,
179
195
  yolox_client=yolox_client,
180
- paddle_client=paddle_client,
196
+ ocr_client=ocr_client,
197
+ ocr_model_name=ocr_model_name,
181
198
  valid_arrays=valid_arrays,
182
199
  valid_images=valid_images,
183
200
  trace_info=trace_info,
184
201
  )
185
202
 
186
203
  # Validate that the inference results have the expected structure.
187
- yolox_results, paddle_results = _validate_inference_results(
188
- yolox_results, paddle_results, valid_arrays, valid_images
189
- )
204
+ yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)
190
205
 
191
206
  # Combine results with the original order.
192
- for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
207
+ for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
193
208
  original_index = valid_indices[idx]
194
- results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
209
+ results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
195
210
 
196
211
  return results
197
212
 
@@ -199,14 +214,14 @@ def _update_table_metadata(
199
214
  def _create_clients(
200
215
  yolox_endpoints: Tuple[str, str],
201
216
  yolox_protocol: str,
202
- paddle_endpoints: Tuple[str, str],
203
- paddle_protocol: str,
217
+ ocr_endpoints: Tuple[str, str],
218
+ ocr_protocol: str,
204
219
  auth_token: str,
205
220
  ) -> Tuple[NimClient, NimClient]:
206
221
  yolox_model_interface = YoloxTableStructureModelInterface()
207
- paddle_model_interface = PaddleOCRModelInterface()
222
+ ocr_model_interface = OCRModelInterface()
208
223
 
209
- logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
224
+ logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")
210
225
 
211
226
  yolox_client = create_inference_client(
212
227
  endpoints=yolox_endpoints,
@@ -215,14 +230,14 @@ def _create_clients(
215
230
  infer_protocol=yolox_protocol,
216
231
  )
217
232
 
218
- paddle_client = create_inference_client(
219
- endpoints=paddle_endpoints,
220
- model_interface=paddle_model_interface,
233
+ ocr_client = create_inference_client(
234
+ endpoints=ocr_endpoints,
235
+ model_interface=ocr_model_interface,
221
236
  auth_token=auth_token,
222
- infer_protocol=paddle_protocol,
237
+ infer_protocol=ocr_protocol,
223
238
  )
224
239
 
225
- return yolox_client, paddle_client
240
+ return yolox_client, ocr_client
226
241
 
227
242
 
228
243
  def extract_table_data_from_image_internal(
@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
262
277
  return df_extraction_ledger, execution_trace_log
263
278
 
264
279
  endpoint_config = extraction_config.endpoint_config
265
- yolox_client, paddle_client = _create_clients(
280
+ yolox_client, ocr_client = _create_clients(
266
281
  endpoint_config.yolox_endpoints,
267
282
  endpoint_config.yolox_infer_protocol,
268
- endpoint_config.paddle_endpoints,
269
- endpoint_config.paddle_infer_protocol,
283
+ endpoint_config.ocr_endpoints,
284
+ endpoint_config.ocr_infer_protocol,
270
285
  endpoint_config.auth_token,
271
286
  )
272
287
 
288
+ # Get the grpc endpoint to determine the model if needed
289
+ ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
290
+ ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
291
+
273
292
  try:
274
293
  # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
275
294
  def meets_criteria(row):
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
309
328
  bulk_results = _update_table_metadata(
310
329
  base64_images=base64_images,
311
330
  yolox_client=yolox_client,
312
- paddle_client=paddle_client,
331
+ ocr_client=ocr_client,
332
+ ocr_model_name=ocr_model_name,
313
333
  worker_pool_size=endpoint_config.workers_per_progress_engine,
314
334
  enable_yolox=enable_yolox,
315
335
  trace_info=execution_trace_log,
@@ -317,15 +337,15 @@ def extract_table_data_from_image_internal(
317
337
 
318
338
  # 4) Write the results (bounding_boxes, text_predictions) back
319
339
  for row_id, idx in enumerate(valid_indices):
320
- # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
340
+ # unpack (base64_image, (yolox_predictions, ocr_bounding boxes, ocr_text_predictions))
321
341
  _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
322
342
 
323
343
  if table_content_format == TableFormatEnum.SIMPLE:
324
344
  table_content = " ".join(text_predictions)
325
345
  elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
326
- table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
346
+ table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
327
347
  elif table_content_format == TableFormatEnum.MARKDOWN:
328
- table_content = join_yolox_table_structure_and_paddle_output(
348
+ table_content = join_yolox_table_structure_and_ocr_output(
329
349
  cell_predictions, bounding_boxes, text_predictions
330
350
  )
331
351
  else:
@@ -341,4 +361,4 @@ def extract_table_data_from_image_internal(
341
361
  raise
342
362
  finally:
343
363
  yolox_client.close()
344
- paddle_client.close()
364
+ ocr_client.close()
@@ -40,6 +40,7 @@ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadat
40
40
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
41
41
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
42
42
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
43
+ YOLOX_PAGE_IMAGE_FORMAT,
43
44
  )
44
45
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemoRetrieverParseConfigSchema
45
46
  from nv_ingest_api.util.metadata.aggregators import (
@@ -100,7 +101,7 @@ def nemoretriever_parse_extractor(
100
101
  - text_depth : str, optional (default is "page")
101
102
  - extract_tables_method : str, optional (default is "yolox")
102
103
  - identify_nearby_objects : bool, optional (default is True)
103
- - paddle_output_format : str, optional (default is "pseudo_markdown")
104
+ - table_output_format : str, optional (default is "pseudo_markdown")
104
105
  - pdfium_config : dict, optional (configuration for PDFium)
105
106
  - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
106
107
  - metadata_column : str, optional (default is "metadata")
@@ -145,14 +146,14 @@ def nemoretriever_parse_extractor(
145
146
  # Flag for identifying nearby objects.
146
147
  identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
147
148
 
148
- # Get and validate paddle_output_format.
149
- paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
149
+ # Get and validate table_output_format.
150
+ table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
150
151
  try:
151
- paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
152
+ table_output_format = TableFormatEnum[table_output_format_str.upper()]
152
153
  except KeyError:
153
154
  valid_options = [e.name.lower() for e in TableFormatEnum]
154
155
  raise ValueError(
155
- f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
156
+ f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
156
157
  )
157
158
 
158
159
  # Process nemoretriever_parse configuration.
@@ -253,7 +254,7 @@ def nemoretriever_parse_extractor(
253
254
  extract_tables,
254
255
  extract_charts,
255
256
  extract_infographics,
256
- paddle_output_format,
257
+ table_output_format,
257
258
  nemoretriever_parse_config.yolox_endpoints,
258
259
  nemoretriever_parse_config.yolox_infer_protocol,
259
260
  nemoretriever_parse_config.auth_token,
@@ -287,7 +288,7 @@ def nemoretriever_parse_extractor(
287
288
  extract_tables,
288
289
  extract_charts,
289
290
  extract_infographics,
290
- paddle_output_format,
291
+ table_output_format,
291
292
  nemoretriever_parse_config.yolox_endpoints,
292
293
  nemoretriever_parse_config.yolox_infer_protocol,
293
294
  nemoretriever_parse_config.auth_token,
@@ -355,7 +356,7 @@ def nemoretriever_parse_extractor(
355
356
  img_numpy = crop_image(page_image, transformed_bbox)
356
357
 
357
358
  if img_numpy is not None:
358
- base64_img = numpy_to_base64(img_numpy)
359
+ base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
359
360
  image = Base64Image(
360
361
  image=base64_img,
361
362
  bbox=transformed_bbox,
@@ -4,20 +4,21 @@
4
4
  # Copyright (c) 2024, NVIDIA CORPORATION.
5
5
 
6
6
  import base64
7
+ import inspect
7
8
  import io
8
-
9
- import pandas as pd
10
- from typing import Any, Dict, List, Optional
11
9
  import logging
10
+ from typing import Any
11
+ from typing import Dict
12
+ from typing import List
13
+ from typing import Optional
12
14
 
13
- from nv_ingest_api.internal.extract.pdf.engines import (
14
- adobe_extractor,
15
- llama_parse_extractor,
16
- nemoretriever_parse_extractor,
17
- pdfium_extractor,
18
- tika_extractor,
19
- unstructured_io_extractor,
20
- )
15
+ import pandas as pd
16
+ from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
17
+ from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
18
+ from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
19
+ from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
20
+ from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
21
+ from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
21
22
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
22
23
 
23
24
  # Import extraction functions for different engines.
@@ -43,6 +44,7 @@ def _work_extract_pdf(
43
44
  extract_infographics: bool,
44
45
  extract_tables: bool,
45
46
  extract_charts: bool,
47
+ extract_page_as_image: bool,
46
48
  extractor_config: dict,
47
49
  execution_trace_log=None,
48
50
  ) -> Any:
@@ -52,17 +54,25 @@ def _work_extract_pdf(
52
54
 
53
55
  extract_method = extractor_config["extract_method"]
54
56
  extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
55
- return extractor_fn(
56
- pdf_stream,
57
- extract_text,
58
- extract_images,
59
- extract_infographics,
60
- extract_tables,
61
- extract_charts,
62
- extractor_config,
63
- execution_trace_log,
57
+
58
+ extractor_fn_args = dict(
59
+ pdf_stream=pdf_stream,
60
+ extract_text=extract_text,
61
+ extract_images=extract_images,
62
+ extract_infographics=extract_infographics,
63
+ extract_tables=extract_tables,
64
+ extract_charts=extract_charts,
65
+ extractor_config=extractor_config,
66
+ execution_trace_log=execution_trace_log,
64
67
  )
65
68
 
69
+ if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
70
+ extractor_fn_args["extract_page_as_image"] = extract_page_as_image
71
+ elif extract_page_as_image:
72
+ logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
73
+
74
+ return extractor_fn(**extractor_fn_args)
75
+
66
76
 
67
77
  @unified_exception_handler
68
78
  def _orchestrate_row_extraction(
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
97
107
  extract_tables = params.pop("extract_tables", False)
98
108
  extract_charts = params.pop("extract_charts", False)
99
109
  extract_infographics = params.pop("extract_infographics", False)
110
+ extract_page_as_image = params.pop("extract_page_as_image", False)
100
111
  extract_method = params.get("extract_method", "pdfium")
101
112
  except KeyError as e:
102
113
  raise ValueError(f"Missing required extraction flag: {e}")
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
137
148
  extract_text=extract_text,
138
149
  extract_images=extract_images,
139
150
  extract_infographics=extract_infographics,
151
+ extract_page_as_image=extract_page_as_image,
140
152
  extract_tables=extract_tables,
141
153
  extract_charts=extract_charts,
142
154
  extractor_config=extractor_config,
@@ -24,16 +24,18 @@ import numpy as np
24
24
  import pandas as pd
25
25
  import pypdfium2 as libpdfium
26
26
 
27
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
27
28
  from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
28
29
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
29
30
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
30
31
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
31
- get_yolox_model_name,
32
32
  YoloxPageElementsModelInterface,
33
+ YOLOX_PAGE_IMAGE_FORMAT,
33
34
  )
34
35
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
35
36
  from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
36
37
  from nv_ingest_api.util.metadata.aggregators import (
38
+ construct_image_metadata_from_base64,
37
39
  construct_image_metadata_from_pdf_image,
38
40
  extract_pdf_metadata,
39
41
  construct_text_metadata,
@@ -46,6 +48,7 @@ from nv_ingest_api.util.pdf.pdfium import (
46
48
  extract_image_like_objects_from_pdfium_page,
47
49
  )
48
50
  from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
51
+ from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
49
52
  from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
50
53
 
51
54
  logger = logging.getLogger(__name__)
@@ -54,7 +57,6 @@ logger = logging.getLogger(__name__)
54
57
  def _extract_page_elements_using_image_ensemble(
55
58
  pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
56
59
  yolox_client,
57
- yolox_model_name: str = "yolox",
58
60
  execution_trace_log: Optional[List] = None,
59
61
  ) -> List[Tuple[int, object]]:
60
62
  """
@@ -68,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
68
70
  and optional padding offset information.
69
71
  yolox_client : object
70
72
  A pre-configured client instance for the YOLOX inference service.
71
- yolox_model_name : str, default="yolox"
72
- The name of the YOLOX model to use for inference.
73
73
  execution_trace_log : Optional[List], default=None
74
74
  List for accumulating execution trace information.
75
75
 
@@ -102,8 +102,11 @@ def _extract_page_elements_using_image_ensemble(
102
102
  # Perform inference using the NimClient.
103
103
  inference_results = yolox_client.infer(
104
104
  data,
105
- model_name="yolox",
105
+ model_name="yolox_ensemble",
106
106
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
107
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
108
+ dtypes=["BYTES", "FP32"],
109
+ output_names=["OUTPUT"],
107
110
  trace_info=execution_trace_log,
108
111
  stage_name="pdf_extraction",
109
112
  )
@@ -186,7 +189,7 @@ def _extract_page_element_images(
186
189
  if cropped is None:
187
190
  continue
188
191
 
189
- base64_img = numpy_to_base64(cropped)
192
+ base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)
190
193
 
191
194
  bbox_in_orig_coord = (
192
195
  int(w1) - pad_width,
@@ -263,7 +266,7 @@ def _extract_page_elements(
263
266
  extract_tables: bool,
264
267
  extract_charts: bool,
265
268
  extract_infographics: bool,
266
- paddle_output_format: str,
269
+ table_output_format: str,
267
270
  yolox_endpoints: Tuple[Optional[str], Optional[str]],
268
271
  yolox_infer_protocol: str = "http",
269
272
  auth_token: Optional[str] = None,
@@ -292,7 +295,7 @@ def _extract_page_elements(
292
295
  Flag indicating whether to extract charts.
293
296
  extract_infographics : bool
294
297
  Flag indicating whether to extract infographics.
295
- paddle_output_format : str
298
+ table_output_format : str
296
299
  Format to use for table content.
297
300
  yolox_endpoints : Tuple[Optional[str], Optional[str]]
298
301
  A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
@@ -313,19 +316,7 @@ def _extract_page_elements(
313
316
 
314
317
  try:
315
318
  # Default model name
316
- yolox_model_name = "yolox"
317
-
318
- # Get the HTTP endpoint to determine the model name if needed
319
- yolox_http_endpoint = yolox_endpoints[1]
320
- if yolox_http_endpoint:
321
- try:
322
- yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
323
- except Exception as e:
324
- logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
325
-
326
- # Create the model interface
327
- model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
328
-
319
+ model_interface = YoloxPageElementsModelInterface()
329
320
  # Create the inference client
330
321
  yolox_client = create_inference_client(
331
322
  yolox_endpoints,
@@ -336,7 +327,7 @@ def _extract_page_elements(
336
327
 
337
328
  # Extract page elements using the client
338
329
  page_element_results = _extract_page_elements_using_image_ensemble(
339
- pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
330
+ pages, yolox_client, execution_trace_log=execution_trace_log
340
331
  )
341
332
 
342
333
  # Process each extracted element based on extraction flags
@@ -351,7 +342,7 @@ def _extract_page_elements(
351
342
 
352
343
  # Set content format for tables
353
344
  if page_element.type_string == "table":
354
- page_element.content_format = paddle_output_format
345
+ page_element.content_format = table_output_format
355
346
 
356
347
  # Construct metadata for the page element
357
348
  page_element_meta = construct_page_element_metadata(
@@ -384,6 +375,7 @@ def pdfium_extractor(
384
375
  extract_infographics: bool,
385
376
  extract_tables: bool,
386
377
  extract_charts: bool,
378
+ extract_page_as_image: bool,
387
379
  extractor_config: dict,
388
380
  execution_trace_log: Optional[List[Any]] = None,
389
381
  ) -> pd.DataFrame:
@@ -407,13 +399,13 @@ def pdfium_extractor(
407
399
  f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
408
400
  )
409
401
 
410
- # Validate and extract paddle_output_format
411
- paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
402
+ # Validate and extract table_output_format
403
+ table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
412
404
  try:
413
- paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
405
+ table_output_format = TableFormatEnum[table_output_format_str.upper()]
414
406
  except KeyError:
415
407
  raise ValueError(
416
- f"Invalid paddle_output_format: {paddle_output_format_str}. "
408
+ f"Invalid table_output_format: {table_output_format_str}. "
417
409
  f"Valid options: {list(TableFormatEnum.__members__.keys())}"
418
410
  )
419
411
 
@@ -524,6 +516,24 @@ def pdfium_extractor(
524
516
  )
525
517
  extracted_data.extend(image_data)
526
518
 
519
+ # Full page image extraction
520
+ if extract_page_as_image:
521
+ page_text = _extract_page_text(page)
522
+ image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
523
+ base64_image = numpy_to_base64(image[0])
524
+ if len(base64_image) > 2**24 - 1:
525
+ base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
526
+ image_meta = construct_image_metadata_from_base64(
527
+ base64_image,
528
+ page_idx,
529
+ page_count,
530
+ source_metadata,
531
+ base_unified_metadata,
532
+ subtype=ContentTypeEnum.PAGE_IMAGE,
533
+ text=page_text,
534
+ )
535
+ extracted_data.append(image_meta)
536
+
527
537
  # If we want tables or charts, rasterize the page and store it
528
538
  if extract_tables or extract_charts or extract_infographics:
529
539
  image, padding_offsets = pdfium_pages_to_numpy(
@@ -545,7 +555,7 @@ def pdfium_extractor(
545
555
  extract_tables,
546
556
  extract_charts,
547
557
  extract_infographics,
548
- paddle_output_format,
558
+ table_output_format,
549
559
  pdfium_config.yolox_endpoints,
550
560
  pdfium_config.yolox_infer_protocol,
551
561
  pdfium_config.auth_token,
@@ -567,13 +577,14 @@ def pdfium_extractor(
567
577
  extract_tables,
568
578
  extract_charts,
569
579
  extract_infographics,
570
- paddle_output_format,
580
+ table_output_format,
571
581
  pdfium_config.yolox_endpoints,
572
582
  pdfium_config.yolox_infer_protocol,
573
583
  pdfium_config.auth_token,
574
584
  execution_trace_log=execution_trace_log,
575
585
  )
576
586
  futures.append(future)
587
+
577
588
  pages_for_tables.clear()
578
589
 
579
590
  # Wait for all asynchronous jobs to complete.