nv-ingest-api 2025.9.22.dev20250922__py3-none-any.whl → 2025.9.25.dev20250925__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.


@@ -17,7 +17,8 @@ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExt
  from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
  from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_graphic_elements_and_ocr_output
  from nv_ingest_api.util.image_processing.table_and_chart import process_yolox_graphic_elements
- from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import PaddleOCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import NemoRetrieverOCRModelInterface
  from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
  from nv_ingest_api.internal.primitives.nim import NimClient
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxGraphicElementsModelInterface
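Throughout this release, the single OCRModelInterface is replaced by two model-specific interfaces, PaddleOCRModelInterface and NemoRetrieverOCRModelInterface, selected by OCR model name. A minimal sketch of the selection rule (the helper below is illustrative, not part of the package):

from nv_ingest_api.internal.primitives.nim.model_interface.ocr import (
    NemoRetrieverOCRModelInterface,
    PaddleOCRModelInterface,
)


def pick_ocr_interface(ocr_model_name: str):
    # "scene_text_ensemble" routes to NemoRetriever-OCR; any other name
    # (e.g. "paddle") keeps the PaddleOCR interface, mirroring the conditional
    # used by the new _create_ocr_client helpers later in this diff.
    if ocr_model_name == "scene_text_ensemble":
        return NemoRetrieverOCRModelInterface()
    return PaddleOCRModelInterface()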
@@ -89,19 +90,12 @@ def _run_chart_inference(
      future_ocr_kwargs = dict(
          data=data_ocr,
          stage_name="chart_extraction",
-         max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          trace_info=trace_info,
      )
      if ocr_model_name == "paddle":
          future_ocr_kwargs.update(
              model_name="paddle",
-         )
-     elif ocr_model_name == "scene_text":
-         future_ocr_kwargs.update(
-             model_name=ocr_model_name,
-             input_names=["input", "merge_levels"],
-             dtypes=["FP32", "BYTES"],
-             merge_level="paragraph",
+             max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          )
      elif ocr_model_name == "scene_text_ensemble":
          future_ocr_kwargs.update(
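Two changes in this hunk: the standalone scene_text branch, with its explicit Triton input names, dtypes, and merge level, is removed, and the protocol-dependent max_batch_size cap moves from the shared kwargs into the paddle branch only. A sketch of the kwargs each remaining path now builds, assuming the rest of _run_chart_inference is unchanged:

def build_chart_ocr_kwargs(ocr_model_name: str, protocol: str, data, trace_info) -> dict:
    # Paraphrase of the logic above, not the actual function body.
    kwargs = dict(data=data, stage_name="chart_extraction", trace_info=trace_info)
    if ocr_model_name == "paddle":
        # Only the PaddleOCR path keeps the per-request cap (1 over gRPC, 2 over HTTP).
        kwargs.update(model_name="paddle", max_batch_size=1 if protocol == "grpc" else 2)
    # "scene_text_ensemble" adds its own kwargs in the elif branch that follows and
    # presumably relies on the client-side dynamic batching enabled later in this diff.
    return kwargs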
@@ -134,7 +128,10 @@ def _run_chart_inference(


  def _validate_chart_inference_results(
-     yolox_results: Any, ocr_results: Any, valid_arrays: List[Any], valid_images: List[str]
+     yolox_results: Any,
+     ocr_results: Any,
+     valid_arrays: List[Any],
+     valid_images: List[str],
  ) -> Tuple[List[Any], List[Any]]:
      """
      Ensure inference results are lists and have expected lengths.
@@ -216,17 +213,12 @@ def _update_chart_metadata(
      return _merge_chart_results(base64_images, valid_indices, yolox_results, ocr_results, results)


- def _create_clients(
+ def _create_yolox_client(
      yolox_endpoints: Tuple[str, str],
      yolox_protocol: str,
-     ocr_endpoints: Tuple[str, str],
-     ocr_protocol: str,
      auth_token: str,
- ) -> Tuple[NimClient, NimClient]:
+ ) -> NimClient:
      yolox_model_interface = YoloxGraphicElementsModelInterface()
-     ocr_model_interface = OCRModelInterface()
-
-     logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")

      yolox_client = create_inference_client(
          endpoints=yolox_endpoints,
@@ -235,14 +227,29 @@ def _create_clients(
          infer_protocol=yolox_protocol,
      )

+     return yolox_client
+
+
+ def _create_ocr_client(
+     ocr_endpoints: Tuple[str, str],
+     ocr_protocol: str,
+     ocr_model_name: str,
+     auth_token: str,
+ ) -> NimClient:
+     ocr_model_interface = (
+         NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
+     )
+
      ocr_client = create_inference_client(
          endpoints=ocr_endpoints,
          model_interface=ocr_model_interface,
          auth_token=auth_token,
          infer_protocol=ocr_protocol,
+         enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
+         dynamic_batch_memory_budget_mb=32,
      )

-     return yolox_client, ocr_client
+     return ocr_client


  def extract_chart_data_from_image_internal(
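The combined _create_clients is split into _create_yolox_client and _create_ocr_client, and the OCR client now opts into client-side dynamic batching for the ensemble model only. The two keyword names come straight from the diff; their semantics are inferred from the names, so treat this summary as a sketch:

def ocr_batching_flags(ocr_model_name: str) -> dict:
    # The flags the new _create_ocr_client passes to create_inference_client.
    return {
        "enable_dynamic_batching": ocr_model_name == "scene_text_ensemble",
        "dynamic_batch_memory_budget_mb": 32,  # fixed 32 MB budget in this release
    }


assert ocr_batching_flags("paddle")["enable_dynamic_batching"] is False
assert ocr_batching_flags("scene_text_ensemble")["enable_dynamic_batching"] is True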
@@ -285,13 +292,6 @@ def extract_chart_data_from_image_internal(
          return df_extraction_ledger, execution_trace_log

      endpoint_config = extraction_config.endpoint_config
-     yolox_client, ocr_client = _create_clients(
-         endpoint_config.yolox_endpoints,
-         endpoint_config.yolox_infer_protocol,
-         endpoint_config.ocr_endpoints,
-         endpoint_config.ocr_infer_protocol,
-         endpoint_config.auth_token,
-     )

      # Get the grpc endpoint to determine the model if needed
      ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
@@ -334,6 +334,19 @@ def extract_chart_data_from_image_internal(
              base64_images.append(meta["content"])  # guaranteed by meets_criteria

          # 3) Call our bulk _update_metadata to get all results.
+         yolox_client = _create_yolox_client(
+             endpoint_config.yolox_endpoints,
+             endpoint_config.yolox_infer_protocol,
+             endpoint_config.auth_token,
+         )
+
+         ocr_client = _create_ocr_client(
+             endpoint_config.ocr_endpoints,
+             endpoint_config.ocr_infer_protocol,
+             ocr_model_name,
+             endpoint_config.auth_token,
+         )
+
          bulk_results = _update_chart_metadata(
              base64_images=base64_images,
              yolox_client=yolox_client,
@@ -356,13 +369,3 @@ def extract_chart_data_from_image_internal(
          logger.error("Error occurred while extracting chart data.", exc_info=True)

          raise
-
-     finally:
-         try:
-             if ocr_client is not None:
-                 ocr_client.close()
-             if yolox_client is not None:
-                 yolox_client.close()
-
-         except Exception as close_err:
-             logger.error(f"Error closing clients: {close_err}", exc_info=True)
@@ -242,10 +242,6 @@ def extract_page_elements_from_images(
          logger.exception(f"Unhandled error during table/chart extraction: {str(e)}")
          raise

-     finally:
-         if yolox_client:
-             yolox_client.close()
-
      logger.debug(f"Extracted {len(page_elements)} tables and charts from image.")
      return page_elements

@@ -12,11 +12,10 @@ from typing import Tuple
  import pandas as pd

  from nv_ingest_api.internal.primitives.nim import NimClient
- from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import PaddleOCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import NemoRetrieverOCRModelInterface
  from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
- from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import (
-     InfographicExtractorSchema,
- )
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
  from nv_ingest_api.util.nim import create_inference_client
  from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes
@@ -101,19 +100,12 @@ def _update_infographic_metadata(

      infer_kwargs = dict(
          stage_name="infographic_extraction",
-         max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          trace_info=trace_info,
      )
      if ocr_model_name == "paddle":
          infer_kwargs.update(
              model_name="paddle",
-         )
-     elif ocr_model_name == "scene_text":
-         infer_kwargs.update(
-             model_name=ocr_model_name,
-             input_names=["input", "merge_levels"],
-             dtypes=["FP32", "BYTES"],
-             merge_level="paragraph",
+             max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          )
      elif ocr_model_name == "scene_text_ensemble":
          infer_kwargs.update(
@@ -144,25 +136,32 @@ def _update_infographic_metadata(
          # Each ocr_res is expected to be a tuple (text_predictions, bounding_boxes, conf_scores).
          ocr_res = reorder_boxes(*ocr_res)

-         results[original_index] = (base64_images[original_index], ocr_res[0], ocr_res[1])
+         results[original_index] = (
+             base64_images[original_index],
+             ocr_res[0],
+             ocr_res[1],
+         )

      return results


- def _create_clients(
+ def _create_ocr_client(
      ocr_endpoints: Tuple[str, str],
      ocr_protocol: str,
+     ocr_model_name: str,
      auth_token: str,
  ) -> NimClient:
-     ocr_model_interface = OCRModelInterface()
-
-     logger.debug(f"Inference protocols: ocr={ocr_protocol}")
+     ocr_model_interface = (
+         NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
+     )

      ocr_client = create_inference_client(
          endpoints=ocr_endpoints,
          model_interface=ocr_model_interface,
          auth_token=auth_token,
          infer_protocol=ocr_protocol,
+         enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
+         dynamic_batch_memory_budget_mb=32,
      )

      return ocr_client
@@ -239,11 +238,6 @@ def extract_infographic_data_from_image_internal(
          return df_extraction_ledger, execution_trace_log

      endpoint_config = extraction_config.endpoint_config
-     ocr_client = _create_clients(
-         endpoint_config.ocr_endpoints,
-         endpoint_config.ocr_infer_protocol,
-         endpoint_config.auth_token,
-     )

      # Get the grpc endpoint to determine the model if needed
      ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
@@ -262,6 +256,13 @@ def extract_infographic_data_from_image_internal(
          base64_images = [df_extraction_ledger.at[idx, "metadata"]["content"] for idx in valid_indices]

          # Call bulk update to extract infographic data.
+         ocr_client = _create_ocr_client(
+             endpoint_config.ocr_endpoints,
+             endpoint_config.ocr_infer_protocol,
+             ocr_model_name,
+             endpoint_config.auth_token,
+         )
+
          bulk_results = _update_infographic_metadata(
              base64_images=base64_images,
              ocr_client=ocr_client,
@@ -283,6 +284,3 @@ def extract_infographic_data_from_image_internal(
          err_msg = "Error occurred while extracting infographic data."
          logger.exception(err_msg)
          raise
-
-     finally:
-         ocr_client.close()
@@ -15,12 +15,13 @@ import pandas as pd

  from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
  from nv_ingest_api.internal.enums.common import TableFormatEnum
- from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import PaddleOCRModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import NemoRetrieverOCRModelInterface
  from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
+ from nv_ingest_api.internal.primitives.nim import NimClient
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
  from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
  from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
- from nv_ingest_api.internal.primitives.nim import NimClient
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
  from nv_ingest_api.util.nim import create_inference_client
@@ -31,7 +32,9 @@ PADDLE_MIN_WIDTH = 32
  PADDLE_MIN_HEIGHT = 32


- def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.ndarray], List[int]]:
+ def _filter_valid_images(
+     base64_images: List[str],
+ ) -> Tuple[List[str], List[np.ndarray], List[int]]:
      """
      Filter base64-encoded images by their dimensions.

@@ -89,19 +92,12 @@ def _run_inference(
      future_ocr_kwargs = dict(
          data=data_ocr,
          stage_name="table_extraction",
-         max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          trace_info=trace_info,
      )
      if ocr_model_name == "paddle":
          future_ocr_kwargs.update(
              model_name="paddle",
-         )
-     elif ocr_model_name == "scene_text":
-         future_ocr_kwargs.update(
-             model_name=ocr_model_name,
-             input_names=["input", "merge_levels"],
-             dtypes=["FP32", "BYTES"],
-             merge_level="word",
+             max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
          )
      elif ocr_model_name == "scene_text_ensemble":
          future_ocr_kwargs.update(
@@ -216,22 +212,22 @@ def _update_table_metadata(
      # Combine results with the original order.
      for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
          original_index = valid_indices[idx]
-         results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
+         results[original_index] = (
+             base64_images[original_index],
+             yolox_res,
+             ocr_res[0],
+             ocr_res[1],
+         )

      return results


- def _create_clients(
+ def _create_yolox_client(
      yolox_endpoints: Tuple[str, str],
      yolox_protocol: str,
-     ocr_endpoints: Tuple[str, str],
-     ocr_protocol: str,
      auth_token: str,
- ) -> Tuple[NimClient, NimClient]:
+ ) -> NimClient:
      yolox_model_interface = YoloxTableStructureModelInterface()
-     ocr_model_interface = OCRModelInterface()
-
-     logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")

      yolox_client = create_inference_client(
          endpoints=yolox_endpoints,
@@ -240,14 +236,29 @@ def _create_clients(
          infer_protocol=yolox_protocol,
      )

+     return yolox_client
+
+
+ def _create_ocr_client(
+     ocr_endpoints: Tuple[str, str],
+     ocr_protocol: str,
+     ocr_model_name: str,
+     auth_token: str,
+ ) -> NimClient:
+     ocr_model_interface = (
+         NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
+     )
+
      ocr_client = create_inference_client(
          endpoints=ocr_endpoints,
          model_interface=ocr_model_interface,
          auth_token=auth_token,
          infer_protocol=ocr_protocol,
+         enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
+         dynamic_batch_memory_budget_mb=32,
      )

-     return yolox_client, ocr_client
+     return ocr_client


  def extract_table_data_from_image_internal(
@@ -287,13 +298,6 @@ def extract_table_data_from_image_internal(
          return df_extraction_ledger, execution_trace_log

      endpoint_config = extraction_config.endpoint_config
-     yolox_client, ocr_client = _create_clients(
-         endpoint_config.yolox_endpoints,
-         endpoint_config.yolox_infer_protocol,
-         endpoint_config.ocr_endpoints,
-         endpoint_config.ocr_infer_protocol,
-         endpoint_config.auth_token,
-     )

      # Get the grpc endpoint to determine the model if needed
      ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
@@ -335,6 +339,18 @@ def extract_table_data_from_image_internal(
          )
          enable_yolox = True if table_content_format in (TableFormatEnum.MARKDOWN,) else False

+         yolox_client = _create_yolox_client(
+             endpoint_config.yolox_endpoints,
+             endpoint_config.yolox_infer_protocol,
+             endpoint_config.auth_token,
+         )
+         ocr_client = _create_ocr_client(
+             endpoint_config.ocr_endpoints,
+             endpoint_config.ocr_infer_protocol,
+             ocr_model_name,
+             endpoint_config.auth_token,
+         )
+
          bulk_results = _update_table_metadata(
              base64_images=base64_images,
              yolox_client=yolox_client,
@@ -369,6 +385,3 @@ def extract_table_data_from_image_internal(
      except Exception:
          logger.exception("Error occurred while extracting table data.", exc_info=True)
          raise
-     finally:
-         yolox_client.close()
-         ocr_client.close()
@@ -357,13 +357,6 @@ def _extract_page_elements(
      except Exception as e:
          logger.exception(f"Error in page element extraction: {str(e)}")
          raise
-     finally:
-         # Ensure client is closed properly
-         if yolox_client:
-             try:
-                 yolox_client.close()
-             except Exception as e:
-                 logger.warning(f"Error closing YOLOX client: {str(e)}")

      return extracted_page_elements

@@ -3,6 +3,7 @@
  # SPDX-License-Identifier: Apache-2.0

  from .nim_client import NimClient
+ from .nim_client import get_nim_client_manager
  from .nim_model_interface import ModelInterface

- __all__ = ["NimClient", "ModelInterface"]
+ __all__ = ["NimClient", "ModelInterface", "get_nim_client_manager"]
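This new export pairs with the removal of the per-call `finally: client.close()` blocks in the chart, table, infographic, and page-element extractors above: client lifetime now appears to be managed centrally rather than by each extractor. Only the name is visible in the diff, so everything past the import below is an assumption:

from nv_ingest_api.internal.primitives.nim import get_nim_client_manager

# Hypothetical usage: assumed to return a shared, process-wide manager that owns
# NimClient instances; the diff does not show its signature or methods.
manager = get_nim_client_manager()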
@@ -88,6 +88,8 @@ def preprocess_image_for_ocr(
      target_height: Optional[int] = None,
      target_width: Optional[int] = None,
      pad_how: str = "bottom_right",
+     normalize: bool = False,
+     channel_first: bool = False,
  ) -> np.ndarray:
      """
      Preprocesses an input image to be suitable for use with NemoRetriever-OCR.
@@ -122,10 +124,12 @@
          how=pad_how,
      )

-     padded = padded / 255.0
+     if normalize:
+         padded = padded / 255.0

-     # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
-     transposed = padded.transpose((2, 0, 1))
+     if channel_first:
+         # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
+         padded = padded.transpose((2, 0, 1))

      # Metadata can used for inverting transformations on the resulting bounding boxes.
      metadata = {
@@ -137,7 +141,7 @@
          "pad_width": pad_width,
      }

-     return transposed, metadata
+     return padded, metadata


  def is_ready(http_endpoint: str, ready_endpoint: str) -> bool:
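`preprocess_image_for_ocr` no longer normalizes pixel values to [0, 1] or transposes to channel-first unconditionally; both behaviors are now opt-in flags defaulting to False, and the function returns the padded array plus the resize/pad metadata. A minimal usage sketch, assuming an HWC NumPy image and that the function is importable from `nv_ingest_api.util.nim` (the diff does not name the defining module); passing both flags restores the previous behavior:

import numpy as np

from nv_ingest_api.util.nim import preprocess_image_for_ocr  # import path assumed

image = np.zeros((64, 128, 3), dtype=np.uint8)  # placeholder HWC image
padded, metadata = preprocess_image_for_ocr(image, normalize=True, channel_first=True)
# normalize=True scales pixels by 1/255; channel_first=True transposes HWC -> CHW,
# as the NemoRetriever-OCR NIM requires over gRPC. `metadata` records the resize
# and padding parameters for mapping predicted boxes back onto the original image.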