nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.

Files changed (24)
  1. nv_ingest_api/interface/extract.py +18 -18
  2. nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
  3. nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
  4. nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
  5. nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
  6. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
  7. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
  8. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
  9. nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
  10. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
  11. nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
  12. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
  13. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
  14. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
  15. nv_ingest_api/internal/transform/split_text.py +13 -8
  16. nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
  17. nv_ingest_api/util/image_processing/transforms.py +16 -5
  18. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
  19. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
  20. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
  21. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
  22. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
  23. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
  24. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/image/table_extractor.py

@@ -15,10 +15,11 @@ import pandas as pd
 
  from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
  from nv_ingest_api.internal.enums.common import TableFormatEnum
- from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
- from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
- from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+ from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
+ from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
  from nv_ingest_api.internal.primitives.nim import NimClient
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
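The PaddleOCR-specific interface module is renamed in this release (model_interface/paddle.py becomes model_interface/ocr.py, per the file list above). A minimal sketch of the corresponding import change for code built on this package; nothing beyond the names visible in this hunk is assumed:

    # Old import (2025.7.16):
    # from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface

    # New imports (2025.7.18): a generic OCR interface plus a helper that resolves the backing model.
    from nv_ingest_api.internal.primitives.nim.model_interface.ocr import (
        OCRModelInterface,
        get_ocr_model_name,
    )

    ocr_model_interface = OCRModelInterface()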
@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.n
  def _run_inference(
  enable_yolox: bool,
  yolox_client: Any,
- paddle_client: Any,
+ ocr_client: Any,
+ ocr_model_name: str,
  valid_arrays: List[np.ndarray],
  valid_images: List[str],
  trace_info: Optional[Dict] = None,
@@ -68,32 +70,45 @@ def _run_inference(
  """
  Run inference concurrently for YOLOX (if enabled) and Paddle.
 
- Returns a tuple of (yolox_results, paddle_results).
+ Returns a tuple of (yolox_results, ocr_results).
  """
- data_paddle = {"base64_images": valid_images}
+ data_ocr = {"base64_images": valid_images}
  if enable_yolox:
  data_yolox = {"images": valid_arrays}
-
- with ThreadPoolExecutor(max_workers=2) as executor:
- future_yolox = None
- if enable_yolox:
- future_yolox = executor.submit(
- yolox_client.infer,
- data=data_yolox,
- model_name="yolox",
- stage_name="table_extraction",
- max_batch_size=8,
- trace_info=trace_info,
- )
- future_paddle = executor.submit(
- paddle_client.infer,
- data=data_paddle,
- model_name="paddle",
+ future_yolox_kwargs = dict(
+ data=data_yolox,
+ model_name="yolox_ensemble",
  stage_name="table_extraction",
- max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
+ max_batch_size=8,
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
+ dtypes=["BYTES", "FP32"],
+ output_names=["OUTPUT"],
  trace_info=trace_info,
  )
 
+ future_ocr_kwargs = dict(
+ data=data_ocr,
+ stage_name="table_extraction",
+ max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
+ trace_info=trace_info,
+ )
+ if ocr_model_name == "paddle":
+ future_ocr_kwargs.update(
+ model_name="paddle",
+ )
+ else:
+ future_ocr_kwargs.update(
+ model_name="scene_text",
+ input_names=["input", "merge_levels"],
+ dtypes=["FP32", "BYTES"],
+ merge_level="word",
+ )
+
+ with ThreadPoolExecutor(max_workers=2) as executor:
+ future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
+ future_yolox = None
+ if enable_yolox:
+ future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
  if enable_yolox:
  try:
  yolox_results = future_yolox.result()
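The reworked _run_inference assembles the OCR request as a kwargs dict and branches on the resolved model name: the legacy "paddle" model keeps its old call shape, while the other branch (apparently the NemoRetriever-OCR "scene_text" model) passes explicit Triton tensor names and a merge level. A condensed sketch of that branching, assuming ocr_client, ocr_model_name, valid_images, and trace_info exist as in the hunk above:

    future_ocr_kwargs = dict(
        data={"base64_images": valid_images},
        stage_name="table_extraction",
        max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
        trace_info=trace_info,
    )
    if ocr_model_name == "paddle":
        future_ocr_kwargs.update(model_name="paddle")
    else:
        future_ocr_kwargs.update(
            model_name="scene_text",
            input_names=["input", "merge_levels"],
            dtypes=["FP32", "BYTES"],
            merge_level="word",
        )
    # In the diff this call is submitted to a ThreadPoolExecutor; a direct call is equivalent for a single batch.
    ocr_results = ocr_client.infer(**future_ocr_kwargs)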
@@ -104,17 +119,17 @@ def _run_inference(
  yolox_results = [None] * len(valid_images)
 
  try:
- paddle_results = future_paddle.result()
+ ocr_results = future_ocr.result()
  except Exception as e:
- logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+ logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
  raise
 
- return yolox_results, paddle_results
+ return yolox_results, ocr_results
 
 
  def _validate_inference_results(
  yolox_results: Any,
- paddle_results: Any,
+ ocr_results: Any,
  valid_arrays: List[Any],
  valid_images: List[str],
  ) -> Tuple[List[Any], List[Any]]:
@@ -123,46 +138,47 @@ def _validate_inference_results(
 
  If not, default values are assigned. Raises a ValueError if the lengths do not match.
  """
- if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+ if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
  logger.warning(
- "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+ "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
  "Proceeding with available results.",
  type(yolox_results).__name__,
- type(paddle_results).__name__,
+ type(ocr_results).__name__,
  )
  if not isinstance(yolox_results, list):
  yolox_results = [None] * len(valid_arrays)
- if not isinstance(paddle_results, list):
- paddle_results = [(None, None)] * len(valid_images)
+ if not isinstance(ocr_results, list):
+ ocr_results = [(None, None)] * len(valid_images)
 
  if len(yolox_results) != len(valid_arrays):
  raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
- if len(paddle_results) != len(valid_images):
- raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+ if len(ocr_results) != len(valid_images):
+ raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
 
- return yolox_results, paddle_results
+ return yolox_results, ocr_results
 
 
  def _update_table_metadata(
  base64_images: List[str],
  yolox_client: Any,
- paddle_client: Any,
+ ocr_client: Any,
+ ocr_model_name: str,
  worker_pool_size: int = 8, # Not currently used
  enable_yolox: bool = False,
  trace_info: Optional[Dict] = None,
  ) -> List[Tuple[str, Any, Any, Any]]:
  """
  Given a list of base64-encoded images, this function filters out images that do not meet
- the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
+ the minimum size requirements and then calls the OCR model via ocr_client.infer
  to extract table data.
 
  For each base64-encoded image, the result is a tuple:
- (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+ (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)
 
  Images that do not meet the minimum size are skipped (resulting in placeholders).
- The paddle_client is expected to handle any necessary batching and concurrency.
+ The ocr_client is expected to handle any necessary batching and concurrency.
  """
- logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+ logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")
 
  # Initialize the results list with default placeholders.
  results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
@@ -174,24 +190,23 @@ def _update_table_metadata(
  return results
 
  # Run inference concurrently.
- yolox_results, paddle_results = _run_inference(
+ yolox_results, ocr_results = _run_inference(
  enable_yolox=enable_yolox,
  yolox_client=yolox_client,
- paddle_client=paddle_client,
+ ocr_client=ocr_client,
+ ocr_model_name=ocr_model_name,
  valid_arrays=valid_arrays,
  valid_images=valid_images,
  trace_info=trace_info,
  )
 
  # Validate that the inference results have the expected structure.
- yolox_results, paddle_results = _validate_inference_results(
- yolox_results, paddle_results, valid_arrays, valid_images
- )
+ yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)
 
  # Combine results with the original order.
- for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+ for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
  original_index = valid_indices[idx]
- results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+ results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
 
  return results
 
@@ -199,14 +214,14 @@ def _update_table_metadata(
  def _create_clients(
  yolox_endpoints: Tuple[str, str],
  yolox_protocol: str,
- paddle_endpoints: Tuple[str, str],
- paddle_protocol: str,
+ ocr_endpoints: Tuple[str, str],
+ ocr_protocol: str,
  auth_token: str,
  ) -> Tuple[NimClient, NimClient]:
  yolox_model_interface = YoloxTableStructureModelInterface()
- paddle_model_interface = PaddleOCRModelInterface()
+ ocr_model_interface = OCRModelInterface()
 
- logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+ logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")
 
  yolox_client = create_inference_client(
  endpoints=yolox_endpoints,
@@ -215,14 +230,14 @@ def _create_clients(
  infer_protocol=yolox_protocol,
  )
 
- paddle_client = create_inference_client(
- endpoints=paddle_endpoints,
- model_interface=paddle_model_interface,
+ ocr_client = create_inference_client(
+ endpoints=ocr_endpoints,
+ model_interface=ocr_model_interface,
  auth_token=auth_token,
- infer_protocol=paddle_protocol,
+ infer_protocol=ocr_protocol,
  )
 
- return yolox_client, paddle_client
+ return yolox_client, ocr_client
 
 
  def extract_table_data_from_image_internal(
@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
  return df_extraction_ledger, execution_trace_log
 
  endpoint_config = extraction_config.endpoint_config
- yolox_client, paddle_client = _create_clients(
+ yolox_client, ocr_client = _create_clients(
  endpoint_config.yolox_endpoints,
  endpoint_config.yolox_infer_protocol,
- endpoint_config.paddle_endpoints,
- endpoint_config.paddle_infer_protocol,
+ endpoint_config.ocr_endpoints,
+ endpoint_config.ocr_infer_protocol,
  endpoint_config.auth_token,
  )
 
+ # Get the grpc endpoint to determine the model if needed
+ ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
+ ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
+
  try:
  # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
  def meets_criteria(row):
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
  bulk_results = _update_table_metadata(
  base64_images=base64_images,
  yolox_client=yolox_client,
- paddle_client=paddle_client,
+ ocr_client=ocr_client,
+ ocr_model_name=ocr_model_name,
  worker_pool_size=endpoint_config.workers_per_progress_engine,
  enable_yolox=enable_yolox,
  trace_info=execution_trace_log,
@@ -317,15 +337,15 @@
 
  # 4) Write the results (bounding_boxes, text_predictions) back
  for row_id, idx in enumerate(valid_indices):
- # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
+ # unpack (base64_image, (yolox_predictions, ocr_bounding boxes, ocr_text_predictions))
  _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
 
  if table_content_format == TableFormatEnum.SIMPLE:
  table_content = " ".join(text_predictions)
  elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
- table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+ table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
  elif table_content_format == TableFormatEnum.MARKDOWN:
- table_content = join_yolox_table_structure_and_paddle_output(
+ table_content = join_yolox_table_structure_and_ocr_output(
  cell_predictions, bounding_boxes, text_predictions
  )
  else:
@@ -341,4 +361,4 @@
  raise
  finally:
  yolox_client.close()
- paddle_client.close()
+ ocr_client.close()
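Taken together, the table extractor now wires up an OCR client in place of the Paddle client and resolves the model name from the gRPC endpoint before running the bulk update. A minimal sketch assembled from the hunks above; base64_images, enable_yolox, and execution_trace_log are assumed to be prepared by the surrounding code:

    yolox_client, ocr_client = _create_clients(
        endpoint_config.yolox_endpoints,
        endpoint_config.yolox_infer_protocol,
        endpoint_config.ocr_endpoints,       # replaces endpoint_config.paddle_endpoints
        endpoint_config.ocr_infer_protocol,  # replaces endpoint_config.paddle_infer_protocol
        endpoint_config.auth_token,
    )

    # The gRPC endpoint (index 0) is inspected to decide which OCR model backs the service.
    ocr_model_name = get_ocr_model_name(endpoint_config.ocr_endpoints[0])

    try:
        bulk_results = _update_table_metadata(
            base64_images=base64_images,
            yolox_client=yolox_client,
            ocr_client=ocr_client,
            ocr_model_name=ocr_model_name,
            worker_pool_size=endpoint_config.workers_per_progress_engine,
            enable_yolox=enable_yolox,
            trace_info=execution_trace_log,
        )
    finally:
        yolox_client.close()
        ocr_client.close()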
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

@@ -101,7 +101,7 @@ def nemoretriever_parse_extractor(
  - text_depth : str, optional (default is "page")
  - extract_tables_method : str, optional (default is "yolox")
  - identify_nearby_objects : bool, optional (default is True)
- - paddle_output_format : str, optional (default is "pseudo_markdown")
+ - table_output_format : str, optional (default is "pseudo_markdown")
  - pdfium_config : dict, optional (configuration for PDFium)
  - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
  - metadata_column : str, optional (default is "metadata")
@@ -146,14 +146,14 @@ def nemoretriever_parse_extractor(
  # Flag for identifying nearby objects.
  identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
 
- # Get and validate paddle_output_format.
- paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+ # Get and validate table_output_format.
+ table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
  try:
- paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+ table_output_format = TableFormatEnum[table_output_format_str.upper()]
  except KeyError:
  valid_options = [e.name.lower() for e in TableFormatEnum]
  raise ValueError(
- f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
+ f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
  )
 
  # Process nemoretriever_parse configuration.
@@ -254,10 +254,13 @@ def nemoretriever_parse_extractor(
  extract_tables,
  extract_charts,
  extract_infographics,
- paddle_output_format,
+ table_output_format,
  nemoretriever_parse_config.yolox_endpoints,
  nemoretriever_parse_config.yolox_infer_protocol,
  nemoretriever_parse_config.auth_token,
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
+ dtypes=["BYTES", "FP32"],
+ output_names=["OUTPUT"],
  execution_trace_log=execution_trace_log,
  )
  futures.append(future_yolox)
@@ -288,7 +291,7 @@ def nemoretriever_parse_extractor(
  extract_tables,
  extract_charts,
  extract_infographics,
- paddle_output_format,
+ table_output_format,
  nemoretriever_parse_config.yolox_endpoints,
  nemoretriever_parse_config.yolox_infer_protocol,
  nemoretriever_parse_config.auth_token,
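Both PDF engines now read the table format from a table_output_format key instead of paddle_output_format, with the same TableFormatEnum validation as before. A small sketch of the new configuration surface, using a hypothetical extractor_config:

    from nv_ingest_api.internal.enums.common import TableFormatEnum

    extractor_config = {"table_output_format": "pseudo_markdown"}  # previously keyed as "paddle_output_format"

    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
    try:
        table_output_format = TableFormatEnum[table_output_format_str.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TableFormatEnum]
        raise ValueError(
            f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
        )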
nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -29,9 +29,8 @@ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
- YOLOX_PAGE_IMAGE_FORMAT,
- get_yolox_model_name,
  YoloxPageElementsModelInterface,
+ YOLOX_PAGE_IMAGE_FORMAT,
  )
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
  from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
@@ -58,7 +57,6 @@ logger = logging.getLogger(__name__)
  def _extract_page_elements_using_image_ensemble(
  pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
  yolox_client,
- yolox_model_name: str = "yolox",
  execution_trace_log: Optional[List] = None,
  ) -> List[Tuple[int, object]]:
  """
@@ -72,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
  and optional padding offset information.
  yolox_client : object
  A pre-configured client instance for the YOLOX inference service.
- yolox_model_name : str, default="yolox"
- The name of the YOLOX model to use for inference.
  execution_trace_log : Optional[List], default=None
  List for accumulating execution trace information.
 
@@ -106,8 +102,11 @@
  # Perform inference using the NimClient.
  inference_results = yolox_client.infer(
  data,
- model_name="yolox",
+ model_name="yolox_ensemble",
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
+ dtypes=["BYTES", "FP32"],
+ output_names=["OUTPUT"],
  trace_info=execution_trace_log,
  stage_name="pdf_extraction",
  )
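Page-element detection now targets the ensemble deployment of the page-elements model: the model name changes from "yolox" to "yolox_ensemble" and the Triton input/output tensor names are passed explicitly. A sketch of the updated call shape, assuming yolox_client, data, and execution_trace_log are set up as in the surrounding code:

    inference_results = yolox_client.infer(
        data,
        model_name="yolox_ensemble",          # was "yolox" in 2025.7.16
        max_batch_size=YOLOX_MAX_BATCH_SIZE,
        input_names=["INPUT_IMAGES", "THRESHOLDS"],
        dtypes=["BYTES", "FP32"],
        output_names=["OUTPUT"],
        trace_info=execution_trace_log,
        stage_name="pdf_extraction",
    )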
@@ -267,7 +266,7 @@ def _extract_page_elements(
  extract_tables: bool,
  extract_charts: bool,
  extract_infographics: bool,
- paddle_output_format: str,
+ table_output_format: str,
  yolox_endpoints: Tuple[Optional[str], Optional[str]],
  yolox_infer_protocol: str = "http",
  auth_token: Optional[str] = None,
@@ -296,7 +295,7 @@ def _extract_page_elements(
  Flag indicating whether to extract charts.
  extract_infographics : bool
  Flag indicating whether to extract infographics.
- paddle_output_format : str
+ table_output_format : str
  Format to use for table content.
  yolox_endpoints : Tuple[Optional[str], Optional[str]]
  A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
@@ -317,19 +316,7 @@ def _extract_page_elements(
 
  try:
  # Default model name
- yolox_model_name = "yolox"
-
- # Get the HTTP endpoint to determine the model name if needed
- yolox_http_endpoint = yolox_endpoints[1]
- if yolox_http_endpoint:
- try:
- yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
- except Exception as e:
- logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
-
- # Create the model interface
- model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
-
+ model_interface = YoloxPageElementsModelInterface()
  # Create the inference client
  yolox_client = create_inference_client(
  yolox_endpoints,
@@ -340,7 +327,7 @@ def _extract_page_elements(
 
  # Extract page elements using the client
  page_element_results = _extract_page_elements_using_image_ensemble(
- pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
+ pages, yolox_client, execution_trace_log=execution_trace_log
  )
 
  # Process each extracted element based on extraction flags
@@ -355,7 +342,7 @@ def _extract_page_elements(
 
  # Set content format for tables
  if page_element.type_string == "table":
- page_element.content_format = paddle_output_format
+ page_element.content_format = table_output_format
 
  # Construct metadata for the page element
  page_element_meta = construct_page_element_metadata(
@@ -412,13 +399,13 @@ def pdfium_extractor(
  f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
  )
 
- # Validate and extract paddle_output_format
- paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+ # Validate and extract table_output_format
+ table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
  try:
- paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+ table_output_format = TableFormatEnum[table_output_format_str.upper()]
  except KeyError:
  raise ValueError(
- f"Invalid paddle_output_format: {paddle_output_format_str}. "
+ f"Invalid table_output_format: {table_output_format_str}. "
  f"Valid options: {list(TableFormatEnum.__members__.keys())}"
  )
 
@@ -568,7 +555,7 @@ def pdfium_extractor(
  extract_tables,
  extract_charts,
  extract_infographics,
- paddle_output_format,
+ table_output_format,
  pdfium_config.yolox_endpoints,
  pdfium_config.yolox_infer_protocol,
  pdfium_config.auth_token,
@@ -590,7 +577,7 @@
  extract_tables,
  extract_charts,
  extract_infographics,
- paddle_output_format,
+ table_output_format,
  pdfium_config.yolox_endpoints,
  pdfium_config.yolox_infer_protocol,
  pdfium_config.auth_token,
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py

@@ -3,6 +3,7 @@
  # SPDX-License-Identifier: Apache-2.0
 
  import logging
+ from typing import Optional
 
  import backoff
  import cv2
@@ -13,6 +14,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.decorators import mul
  from nv_ingest_api.util.image_processing.transforms import pad_image, normalize_image
  from nv_ingest_api.util.string_processing import generate_url, remove_url_endpoints
 
+ cv2.setNumThreads(1)
  logger = logging.getLogger(__name__)
 
 
@@ -81,6 +83,63 @@ def preprocess_image_for_paddle(array: np.ndarray, image_max_dimension: int = 96
  return transposed, metadata
 
 
+ def preprocess_image_for_ocr(
+ array: np.ndarray,
+ target_height: Optional[int] = None,
+ target_width: Optional[int] = None,
+ pad_how: str = "bottom_right",
+ ) -> np.ndarray:
+ """
+ Preprocesses an input image to be suitable for use with NemoRetriever-OCR.
+
+ This function is intended for preprocessing images to be passed as input to NemoRetriever-OCR using GRPC.
+ It is not necessary when using the HTTP endpoint.
+
+ Parameters:
+ ----------
+ array : np.ndarray
+ The input image array of shape (height, width, channels). It should have pixel values in the range [0, 255].
+
+ Returns:
+ -------
+ np.ndarray
+ A preprocessed image with the shape (channels, height, width).
+ """
+ height, width = array.shape[:2]
+
+ if target_height is None:
+ target_height = height
+
+ if target_width is None:
+ target_width = width
+
+ padded, (pad_width, pad_height) = pad_image(
+ array,
+ target_height=target_height,
+ target_width=target_width,
+ background_color=255,
+ dtype=np.float32,
+ how=pad_how,
+ )
+
+ padded = padded / 255.0
+
+ # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
+ transposed = padded.transpose((2, 0, 1))
+
+ # Metadata can used for inverting transformations on the resulting bounding boxes.
+ metadata = {
+ "original_height": height,
+ "original_width": width,
+ "new_height": target_height,
+ "new_width": target_width,
+ "pad_height": pad_height,
+ "pad_width": pad_width,
+ }
+
+ return transposed, metadata
+
+
  def is_ready(http_endpoint: str, ready_endpoint: str) -> bool:
  """
  Check if the server at the given endpoint is ready.
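The new preprocess_image_for_ocr helper pads an HWC image, scales pixel values to [0, 1], and returns a channels-first array together with padding metadata (note that it returns a tuple even though the annotation and docstring above mention only the array). A usage sketch with a synthetic image; the target sizes are placeholders, not values taken from the package:

    import numpy as np

    from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_ocr

    # Synthetic 600x400 RGB crop with pixel values in [0, 255].
    image = np.random.randint(0, 256, size=(600, 400, 3), dtype=np.uint8)

    # Pad to an illustrative fixed grid for the gRPC path; HTTP callers can skip this step.
    tensor, metadata = preprocess_image_for_ocr(image, target_height=1024, target_width=1024)

    print(tensor.shape)  # (3, 1024, 1024): channels-first, as the gRPC NIM expects
    print(metadata)      # original/new sizes plus pad offsets, for mapping boxes back to the source image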