nv-ingest-api 2025.7.14.dev20250714__py3-none-any.whl → 2025.7.15.dev20250715__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -40,6 +40,7 @@ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadat
40
40
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
41
41
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
42
42
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
43
+ YOLOX_PAGE_IMAGE_FORMAT,
43
44
  )
44
45
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemoRetrieverParseConfigSchema
45
46
  from nv_ingest_api.util.metadata.aggregators import (
@@ -355,7 +356,7 @@ def nemoretriever_parse_extractor(
355
356
  img_numpy = crop_image(page_image, transformed_bbox)
356
357
 
357
358
  if img_numpy is not None:
358
- base64_img = numpy_to_base64(img_numpy)
359
+ base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
359
360
  image = Base64Image(
360
361
  image=base64_img,
361
362
  bbox=transformed_bbox,
@@ -28,6 +28,7 @@ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH
28
28
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
29
29
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
30
30
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
31
+ YOLOX_PAGE_IMAGE_FORMAT,
31
32
  get_yolox_model_name,
32
33
  YoloxPageElementsModelInterface,
33
34
  )
@@ -186,7 +187,7 @@ def _extract_page_element_images(
186
187
  if cropped is None:
187
188
  continue
188
189
 
189
- base64_img = numpy_to_base64(cropped)
190
+ base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)
190
191
 
191
192
  bbox_in_orig_coord = (
192
193
  int(w1) - pad_width,
@@ -120,6 +120,7 @@ class NemoRetrieverParseModelInterface(ModelInterface):
120
120
  logger.debug("Formatting input for HTTP NemoRetrieverParse model")
121
121
  # Prepare payload for HTTP request
122
122
 
123
+ ## TODO: Ask @Edward Kim if we want to switch to JPEG/PNG here
123
124
  if "images" in data:
124
125
  base64_list = [numpy_to_base64(img) for img in data["images"]]
125
126
  else:
@@ -2,9 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
-
6
- import base64
7
- import io
5
+ import os
8
6
  import logging
9
7
  import warnings
10
8
  from math import log
@@ -20,11 +18,11 @@ import packaging
20
18
  import pandas as pd
21
19
  import torch
22
20
  import torchvision
23
- from PIL import Image
24
21
 
25
22
  from nv_ingest_api.internal.primitives.nim import ModelInterface
26
23
  from nv_ingest_api.internal.primitives.nim.model_interface.helpers import get_model_name
27
24
  from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
25
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64
28
26
 
29
27
  logger = logging.getLogger(__name__)
30
28
 
@@ -35,6 +33,7 @@ YOLOX_PAGE_MIN_SCORE = 0.1
35
33
  YOLOX_PAGE_NIM_MAX_IMAGE_SIZE = 512_000
36
34
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT = 1024
37
35
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH = 1024
36
+ YOLOX_PAGE_IMAGE_FORMAT = os.getenv("YOLOX_PAGE_IMAGE_FORMAT", "PNG")
38
37
 
39
38
  # yolox-page-elements-v1 contants
40
39
  YOLOX_PAGE_V1_NUM_CLASSES = 4
@@ -239,15 +238,11 @@ class YoloxModelInterfaceBase(ModelInterface):
239
238
  # Convert to uint8 if needed.
240
239
  if image.dtype != np.uint8:
241
240
  image = (image * 255).astype(np.uint8)
242
- # Convert the numpy array to a PIL Image.
243
- image_pil = Image.fromarray(image)
244
- original_size = image_pil.size
245
-
246
- # Save the image to a buffer and encode to base64.
247
- buffered = io.BytesIO()
248
- image_pil.save(buffered, format="PNG")
249
- image_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
250
241
 
242
+ # Get original size directly from numpy array (width, height)
243
+ original_size = (image.shape[1], image.shape[0])
244
+ # Convert numpy array directly to base64 using OpenCV
245
+ image_b64 = numpy_to_base64(image, format=YOLOX_PAGE_IMAGE_FORMAT)
251
246
  # Scale the image if necessary.
252
247
  scaled_image_b64, new_size = scale_image_to_encoding_size(
253
248
  image_b64, max_base64_size=self.nim_max_image_size
@@ -2,29 +2,52 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- import base64
6
- import io
7
5
  import logging
8
- from io import BytesIO
9
6
  from math import ceil
10
7
  from math import floor
11
8
  from typing import Optional
12
9
  from typing import Tuple
13
10
 
11
+ import cv2
14
12
  import numpy as np
13
+ from io import BytesIO
15
14
  from PIL import Image
16
- from PIL import UnidentifiedImageError
17
15
 
18
16
  from nv_ingest_api.util.converters import bytetools
19
17
 
18
+ # Configure OpenCV to use a single thread for image processing
19
+ cv2.setNumThreads(1)
20
20
  DEFAULT_MAX_WIDTH = 1024
21
21
  DEFAULT_MAX_HEIGHT = 1280
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
25
25
 
26
+ def _resize_image_opencv(
27
+ array: np.ndarray, target_size: Tuple[int, int], interpolation=cv2.INTER_LANCZOS4
28
+ ) -> np.ndarray:
29
+ """
30
+ Resizes a NumPy array representing an image using OpenCV.
31
+
32
+ Parameters
33
+ ----------
34
+ array : np.ndarray
35
+ The input image as a NumPy array.
36
+ target_size : Tuple[int, int]
37
+ The target size as (width, height).
38
+ interpolation : int, optional
39
+ OpenCV interpolation method. Defaults to cv2.INTER_LANCZOS4.
40
+
41
+ Returns
42
+ -------
43
+ np.ndarray
44
+ The resized image as a NumPy array.
45
+ """
46
+ return cv2.resize(array, target_size, interpolation=interpolation)
47
+
48
+
26
49
  def scale_image_to_encoding_size(
27
- base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9
50
+ base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9, format: str = "PNG", **kwargs
28
51
  ) -> Tuple[str, Tuple[int, int]]:
29
52
  """
30
53
  Decodes a base64-encoded image, resizes it if needed, and re-encodes it as base64.
@@ -38,12 +61,19 @@ def scale_image_to_encoding_size(
38
61
  Maximum allowable size for the base64-encoded image, by default 180,000 characters.
39
62
  initial_reduction : float, optional
40
63
  Initial reduction step for resizing, by default 0.9.
64
+ format : str, optional
65
+ The image format to use for encoding. Supported formats are "PNG" and "JPEG".
66
+ Defaults to "PNG".
67
+ **kwargs
68
+ Additional keyword arguments passed to the format-specific encoding function.
69
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).
70
+ For PNG: compression (int, default=3) - PNG compression level (0-9).
41
71
 
42
72
  Returns
43
73
  -------
44
74
  Tuple[str, Tuple[int, int]]
45
75
  A tuple containing:
46
- - Base64-encoded PNG image string, resized if necessary.
76
+ - Base64-encoded image string in the specified format, resized if necessary.
47
77
  - The new size as a tuple (width, height).
48
78
 
49
79
  Raises
@@ -52,12 +82,11 @@ def scale_image_to_encoding_size(
52
82
  If the image cannot be resized below the specified max_base64_size.
53
83
  """
54
84
  try:
55
- # Decode the base64 image and open it as a PIL image
56
- image_data = base64.b64decode(base64_image)
57
- img = Image.open(io.BytesIO(image_data)).convert("RGB")
85
+ # Decode the base64 image using OpenCV (returns RGB format)
86
+ img_array = base64_to_numpy(base64_image)
58
87
 
59
- # Initial image size
60
- original_size = img.size
88
+ # Initial image size (height, width, channels) -> (width, height)
89
+ original_size = (img_array.shape[1], img_array.shape[0])
61
90
 
62
91
  # Check initial size
63
92
  if len(base64_image) <= max_base64_size:
@@ -66,23 +95,24 @@ def scale_image_to_encoding_size(
66
95
  # Initial reduction step
67
96
  reduction_step = initial_reduction
68
97
  new_size = original_size
98
+ current_img = img_array.copy()
99
+ original_width, original_height = original_size
100
+
69
101
  while len(base64_image) > max_base64_size:
70
- width, height = img.size
71
- new_size = (int(width * reduction_step), int(height * reduction_step))
102
+ new_size = (int(original_width * reduction_step), int(original_height * reduction_step))
103
+ if new_size[0] < 1 or new_size[1] < 1:
104
+ raise ValueError("Image cannot be resized further without becoming too small.")
105
+
106
+ # Resize the image using OpenCV
107
+ current_img = _resize_image_opencv(img_array, new_size)
72
108
 
73
- img_resized = img.resize(new_size, Image.LANCZOS)
74
- buffered = io.BytesIO()
75
- img_resized.save(buffered, format="PNG")
76
- base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
109
+ # Re-encode as base64 using the specified format
110
+ base64_image = numpy_to_base64(current_img, format=format, **kwargs)
77
111
 
78
112
  # Adjust the reduction step if necessary
79
113
  if len(base64_image) > max_base64_size:
80
114
  reduction_step *= 0.95 # Reduce size further if needed
81
115
 
82
- # Safety check
83
- if new_size[0] < 1 or new_size[1] < 1:
84
- raise Exception("Image cannot be resized further without becoming too small.")
85
-
86
116
  return base64_image, new_size
87
117
 
88
118
  except Exception as e:
@@ -90,36 +120,84 @@ def scale_image_to_encoding_size(
90
120
  raise
91
121
 
92
122
 
93
- def ensure_base64_is_png(base64_image: str) -> str:
123
+ def _detect_base64_image_format(base64_string: str) -> Optional[str]:
94
124
  """
95
- Ensures the given base64-encoded image is in PNG format. Converts to PNG if necessary.
125
+ Detects the format of a base64-encoded image using Pillow.
96
126
 
97
127
  Parameters
98
128
  ----------
99
- base64_image : str
129
+ base64_string : str
100
130
  Base64-encoded image string.
101
131
 
102
132
  Returns
103
133
  -------
104
- str
105
- Base64-encoded PNG image string.
134
+ The detected format ("PNG", "JPEG", "UNKNOWN")
106
135
  """
107
136
  try:
108
- # Decode the base64 string and load the image
109
- image_data = base64.b64decode(base64_image)
110
- image = Image.open(io.BytesIO(image_data))
137
+ image_bytes = bytetools.bytesfrombase64(base64_string)
138
+ except Exception as e:
139
+ logger.error(f"Invalid base64 string: {e}")
140
+ raise ValueError(f"Invalid base64 string: {e}") from e
141
+
142
+ try:
143
+ with Image.open(BytesIO(image_bytes)) as img:
144
+ return img.format.upper()
145
+ except ImportError:
146
+ raise ImportError("Pillow library not available")
147
+ except Exception as e:
148
+ logger.error(f"Error detecting image format: {e}")
149
+ return "UNKNOWN"
111
150
 
112
- # Check if the image is already in PNG format
113
- if image.format != "PNG":
114
- # Convert the image to PNG
115
- buffered = io.BytesIO()
116
- image.convert("RGB").save(buffered, format="PNG")
117
- base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
118
151
 
152
+ def ensure_base64_format(base64_image: str, target_format: str = "PNG", **kwargs) -> str:
153
+ """
154
+ Ensures the given base64-encoded image is in the specified format. Converts if necessary.
155
+ Skips conversion if the image is already in the target format.
156
+
157
+ Parameters
158
+ ----------
159
+ base64_image : str
160
+ Base64-encoded image string.
161
+ target_format : str, optional
162
+ The target image format. Supported formats are "PNG" and "JPEG". Defaults to "PNG".
163
+ **kwargs
164
+ Additional keyword arguments passed to the format-specific encoding function.
165
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).
166
+ For PNG: compression (int, default=3) - PNG compression level (0-9).
167
+
168
+ Returns
169
+ -------
170
+ str
171
+ Base64-encoded image string in the specified format.
172
+
173
+ Raises
174
+ ------
175
+ ValueError
176
+ If there is an error during format conversion.
177
+ """
178
+ target_format = target_format.upper()
179
+ if target_format == "JPG":
180
+ target_format = "JPEG"
181
+
182
+ current_format = _detect_base64_image_format(base64_image)
183
+ if current_format == "UNKNOWN":
184
+ raise ValueError(
185
+ f"Unable to decode image from base64 string: {base64_image}, because current format could not be detected."
186
+ )
187
+ if current_format == target_format:
188
+ logger.debug(f"Image already in {target_format} format, skipping conversion")
119
189
  return base64_image
190
+
191
+ try:
192
+ # Decode the base64 image using OpenCV (returns RGB format)
193
+ img_array = base64_to_numpy(base64_image)
194
+ # Re-encode in the target format
195
+ return numpy_to_base64(img_array, format=target_format, **kwargs)
196
+ except ImportError as e:
197
+ raise e
120
198
  except Exception as e:
121
- logger.error(f"Error ensuring PNG format: {e}")
122
- return None
199
+ logger.error(f"Error converting image to {target_format} format: {e}")
200
+ raise ValueError(f"Failed to convert image to {target_format} format: {e}") from e
123
201
 
124
202
 
125
203
  def pad_image(
@@ -302,66 +380,193 @@ def normalize_image(
302
380
  return output_array
303
381
 
304
382
 
305
- def numpy_to_base64(array: np.ndarray) -> str:
383
+ def _preprocess_numpy_array(array: np.ndarray) -> np.ndarray:
384
+ """
385
+ Preprocesses a NumPy array for image encoding by ensuring proper format and data type.
386
+ Also handles color space conversion for OpenCV encoding.
387
+
388
+ Parameters
389
+ ----------
390
+ array : np.ndarray
391
+ The input image as a NumPy array.
392
+
393
+ Returns
394
+ -------
395
+ np.ndarray
396
+ The preprocessed array in uint8 format, ready for OpenCV encoding (BGR color order for color images).
397
+
398
+ Raises
399
+ ------
400
+ ValueError
401
+ If the input array cannot be converted into a valid image format.
402
+ """
403
+ # Check if the array is valid and can be converted to an image
404
+ try:
405
+ # If the array represents a grayscale image, drop the redundant axis in
406
+ # (h, w, 1). cv2 expects (h, w) for grayscale.
407
+ if array.ndim == 3 and array.shape[2] == 1:
408
+ array = np.squeeze(array, axis=2)
409
+
410
+ # Ensure uint8 data type
411
+ processed_array = array.astype(np.uint8)
412
+
413
+ # OpenCV uses BGR color order, so convert RGB to BGR if needed
414
+ if processed_array.ndim == 3 and processed_array.shape[2] == 3:
415
+ # Assume input is RGB and convert to BGR for OpenCV
416
+ processed_array = cv2.cvtColor(processed_array, cv2.COLOR_RGB2BGR)
417
+
418
+ return processed_array
419
+ except Exception as e:
420
+ raise ValueError(f"Failed to preprocess NumPy array for image encoding: {e}")
421
+
422
+
423
+ def _encode_opencv_jpeg(array: np.ndarray, *, quality: int = 100) -> bytes:
424
+ """NumPy array -> JPEG bytes using OpenCV."""
425
+ ok, buf = cv2.imencode(".jpg", array, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
426
+ if not ok:
427
+ raise RuntimeError("cv2.imencode failed")
428
+ return buf.tobytes()
429
+
430
+
431
+ def _encode_opencv_png(array: np.ndarray, *, compression: int = 6) -> bytes:
432
+ """NumPy array -> PNG bytes using OpenCV"""
433
+ encode_params = [
434
+ cv2.IMWRITE_PNG_COMPRESSION,
435
+ compression,
436
+ cv2.IMWRITE_PNG_STRATEGY,
437
+ cv2.IMWRITE_PNG_STRATEGY_DEFAULT,
438
+ ]
439
+ ok, buf = cv2.imencode(".png", array, encode_params)
440
+ if not ok:
441
+ raise RuntimeError("cv2.imencode(.png) failed")
442
+ return buf.tobytes()
443
+
444
+
445
+ def numpy_to_base64_png(array: np.ndarray) -> str:
446
+ """
447
+ Converts a preprocessed NumPy array representing an image to a base64-encoded PNG string using OpenCV.
448
+
449
+ Parameters
450
+ ----------
451
+ array : np.ndarray
452
+ The preprocessed input image as a NumPy array. Must have a shape compatible with image data.
453
+
454
+ Returns
455
+ -------
456
+ str
457
+ The base64-encoded PNG string representation of the input NumPy array.
458
+
459
+ Raises
460
+ ------
461
+ RuntimeError
462
+ If there is an issue during the image conversion or base64 encoding process.
463
+ """
464
+ try:
465
+ # Encode to PNG bytes using OpenCV
466
+ png_bytes = _encode_opencv_png(array)
467
+
468
+ # Convert to base64
469
+ base64_img = bytetools.base64frombytes(png_bytes)
470
+ except Exception as e:
471
+ raise RuntimeError(f"Failed to encode image to base64 PNG: {e}")
472
+
473
+ return base64_img
474
+
475
+
476
+ def numpy_to_base64_jpeg(array: np.ndarray, quality: int = 100) -> str:
477
+ """
478
+ Converts a preprocessed NumPy array representing an image to a base64-encoded JPEG string using OpenCV.
479
+
480
+ Parameters
481
+ ----------
482
+ array : np.ndarray
483
+ The preprocessed input image as a NumPy array. Must have a shape compatible with image data.
484
+ quality : int, optional
485
+ JPEG quality (1-100), by default 100. Higher values mean better quality but larger file size.
486
+
487
+ Returns
488
+ -------
489
+ str
490
+ The base64-encoded JPEG string representation of the input NumPy array.
491
+
492
+ Raises
493
+ ------
494
+ RuntimeError
495
+ If there is an issue during the image conversion or base64 encoding process.
496
+ """
497
+ try:
498
+ # Encode to JPEG bytes using OpenCV
499
+ jpeg_bytes = _encode_opencv_jpeg(array, quality=quality)
500
+
501
+ # Convert to base64
502
+ base64_img = bytetools.base64frombytes(jpeg_bytes)
503
+ except Exception as e:
504
+ raise RuntimeError(f"Failed to encode image to base64 JPEG: {e}")
505
+
506
+ return base64_img
507
+
508
+
509
+ def numpy_to_base64(array: np.ndarray, format: str = "PNG", **kwargs) -> str:
306
510
  """
307
511
  Converts a NumPy array representing an image to a base64-encoded string.
308
512
 
309
- The function takes a NumPy array, converts it to a PIL image, and then encodes
310
- the image as a PNG in a base64 string format. The input array is expected to be in
311
- a format that can be converted to a valid image, such as having a shape of (H, W, C)
312
- where C is the number of channels (e.g., 3 for RGB).
513
+ The function takes a NumPy array, preprocesses it, and then encodes
514
+ the image in the specified format as a base64 string. The input array is expected
515
+ to be in a format that can be converted to a valid image, such as having a shape
516
+ of (H, W, C) where C is the number of channels (e.g., 3 for RGB).
313
517
 
314
518
  Parameters
315
519
  ----------
316
520
  array : np.ndarray
317
521
  The input image as a NumPy array. Must have a shape compatible with image data.
522
+ format : str, optional
523
+ The image format to use for encoding. Supported formats are "PNG" and "JPEG".
524
+ Defaults to "PNG".
525
+ **kwargs
526
+ Additional keyword arguments passed to the format-specific encoding function.
527
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).
318
528
 
319
529
  Returns
320
530
  -------
321
531
  str
322
- The base64-encoded string representation of the input NumPy array as a PNG image.
532
+ The base64-encoded string representation of the input NumPy array in the specified format.
323
533
 
324
534
  Raises
325
535
  ------
326
536
  ValueError
327
- If the input array cannot be converted into a valid image format.
537
+ If the input array cannot be converted into a valid image format, or if an
538
+ unsupported format is specified.
328
539
  RuntimeError
329
540
  If there is an issue during the image conversion or base64 encoding process.
330
541
 
331
542
  Examples
332
543
  --------
333
544
  >>> array = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
334
- >>> encoded_str = numpy_to_base64(array)
545
+ >>> encoded_str = numpy_to_base64(array, format="PNG")
335
546
  >>> isinstance(encoded_str, str)
336
547
  True
548
+ >>> encoded_str_jpeg = numpy_to_base64(array, format="JPEG", quality=90)
549
+ >>> isinstance(encoded_str_jpeg, str)
550
+ True
337
551
  """
338
- # If the array represents a grayscale image, drop the redundant axis in
339
- # (h, w, 1). PIL.Image.fromarray() expects an array of form (h, w) if it's
340
- # a grayscale image.
341
- if array.ndim == 3 and array.shape[2] == 1:
342
- array = np.squeeze(array, axis=2)
552
+ # Centralized preprocessing of the numpy array
553
+ processed_array = _preprocess_numpy_array(array)
343
554
 
344
- # Check if the array is valid and can be converted to an image
345
- try:
346
- # Convert the NumPy array to a PIL image
347
- pil_image = Image.fromarray(array.astype(np.uint8))
348
- except Exception as e:
349
- raise ValueError(f"Failed to convert NumPy array to image: {e}")
555
+ format = format.upper()
350
556
 
351
- try:
352
- # Convert the PIL image to a base64-encoded string
353
- with BytesIO() as buffer:
354
- pil_image.save(buffer, format="PNG")
355
- base64_img = bytetools.base64frombytes(buffer.getvalue())
356
- except Exception as e:
357
- raise RuntimeError(f"Failed to encode image to base64: {e}")
358
-
359
- return base64_img
557
+ if format == "PNG":
558
+ return numpy_to_base64_png(processed_array)
559
+ elif format == "JPEG" or format == "JPG":
560
+ quality = kwargs.get("quality", 100)
561
+ return numpy_to_base64_jpeg(processed_array, quality=quality)
562
+ else:
563
+ raise ValueError(f"Unsupported format: {format}. Supported formats are 'PNG' and 'JPEG'.")
360
564
 
361
565
 
362
566
  def base64_to_numpy(base64_string: str) -> np.ndarray:
363
567
  """
364
- Convert a base64-encoded image string to a NumPy array.
568
+ Convert a base64-encoded image string to a NumPy array using OpenCV.
569
+ Returns images in RGB format for consistency.
365
570
 
366
571
  Parameters
367
572
  ----------
@@ -371,37 +576,82 @@ def base64_to_numpy(base64_string: str) -> np.ndarray:
371
576
  Returns
372
577
  -------
373
578
  numpy.ndarray
374
- NumPy array representation of the decoded image.
579
+ NumPy array representation of the decoded image in RGB format (for color images).
580
+ Grayscale images are returned as-is.
375
581
 
376
582
  Raises
377
583
  ------
378
584
  ValueError
379
585
  If the base64 string is invalid or cannot be decoded into an image.
380
- ImportError
381
- If required libraries are not installed.
382
586
 
383
587
  Examples
384
588
  --------
385
589
  >>> base64_str = '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBD...'
386
590
  >>> img_array = base64_to_numpy(base64_str)
591
+ >>> # img_array is now in RGB format (for color images)
387
592
  """
388
593
  try:
389
- # Decode the base64 string
390
- image_data = base64.b64decode(base64_string)
391
- except (base64.binascii.Error, ValueError) as e:
594
+ # Decode the base64 string to bytes using bytetools
595
+ image_bytes = bytetools.bytesfrombase64(base64_string)
596
+ except Exception as e:
392
597
  raise ValueError("Invalid base64 string") from e
393
598
 
599
+ # Create numpy buffer from bytes and decode using OpenCV
600
+ buf = np.frombuffer(image_bytes, dtype=np.uint8)
394
601
  try:
395
- # Convert the bytes into a BytesIO object
396
- image_bytes = BytesIO(image_data)
397
-
398
- # Open the image using PIL
399
- image = Image.open(image_bytes)
400
- image.load()
401
- except UnidentifiedImageError as e:
602
+ img = cv2.imdecode(buf, cv2.IMREAD_UNCHANGED)
603
+ if img is None:
604
+ raise ValueError("OpenCV failed to decode image")
605
+
606
+ # Convert BGR to RGB for consistent processing (OpenCV loads as BGR)
607
+ # Only convert if it's a 3-channel color image
608
+ if img.ndim == 3 and img.shape[2] == 3:
609
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
610
+ except ImportError:
611
+ raise
612
+ except Exception as e:
402
613
  raise ValueError("Unable to decode image from base64 string") from e
403
614
 
404
- # Convert the image to a NumPy array
405
- image_array = np.array(image)
615
+ # Convert to numpy array
616
+ img = np.array(img)
617
+ # Assert that 3-channel images are in RGB format after conversion
618
+ assert img.ndim <= 3, f"Image has unexpected number of dimensions: {img.ndim}"
619
+ assert img.ndim != 3 or img.shape[2] == 3, f"3-channel image should have 3 channels, got: {img.shape[2]}"
620
+
621
+ return img
622
+
623
+
624
+ def scale_numpy_image(
625
+ img_arr: np.ndarray, scale_tuple: Optional[Tuple[int, int]] = None, interpolation=Image.LANCZOS
626
+ ) -> np.ndarray:
627
+ """
628
+ Scales a NumPy image array using OpenCV with aspect ratio preservation.
406
629
 
407
- return image_array
630
+ This function provides OpenCV-based image scaling that mimics PIL's thumbnail behavior
631
+ by maintaining aspect ratio and scaling to fit within the specified dimensions.
632
+
633
+ Parameters
634
+ ----------
635
+ img_arr : np.ndarray
636
+ The input image as a NumPy array.
637
+ scale_tuple : Optional[Tuple[int, int]], optional
638
+ A tuple (width, height) to resize the image to. If provided, the image
639
+ will be resized to fit within these dimensions while maintaining aspect ratio
640
+ (similar to PIL's thumbnail method). Defaults to None.
641
+ interpolation : int, optional
642
+ OpenCV interpolation method. Defaults to cv2.INTER_LANCZOS4.
643
+
644
+ Returns
645
+ -------
646
+ np.ndarray
647
+ A NumPy array representing the scaled image data.
648
+ """
649
+ # Apply scaling using OpenCV if specified
650
+ # Using PIL for scaling as CV2 seems to lead to different results
651
+ # TODO: Remove when we move to YOLOX Ensemble Models
652
+ if scale_tuple:
653
+ image = Image.fromarray(img_arr)
654
+ image.thumbnail(scale_tuple, interpolation)
655
+ img_arr = np.array(image)
656
+ # Ensure we return a copy
657
+ return img_arr.copy()
@@ -7,7 +7,6 @@ from typing import List, Any
7
7
  from typing import Optional
8
8
  from typing import Tuple
9
9
 
10
- import PIL
11
10
  import numpy as np
12
11
  import pypdfium2 as pdfium
13
12
  import pypdfium2.raw as pdfium_c
@@ -20,8 +19,9 @@ from nv_ingest_api.util.image_processing.clustering import (
20
19
  combine_groups_into_bboxes,
21
20
  remove_superset_bboxes,
22
21
  )
23
- from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image
22
+ from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image, scale_numpy_image
24
23
  from nv_ingest_api.util.metadata.aggregators import Base64Image
24
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YOLOX_PAGE_IMAGE_FORMAT
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
@@ -176,18 +176,10 @@ def pdfium_pages_to_numpy(
176
176
  for idx, page in enumerate(pages):
177
177
  # Render the page as a bitmap with the specified scale and rotation
178
178
  page_bitmap = page.render(scale=scale, rotation=rotation)
179
-
180
- # Convert the bitmap to a PIL image
181
- pil_image = page_bitmap.to_pil()
182
-
179
+ img_arr = convert_bitmap_to_corrected_numpy(page_bitmap)
183
180
  # Apply scaling using the thumbnail approach if specified
184
181
  if scale_tuple:
185
- pil_image.thumbnail(scale_tuple, PIL.Image.LANCZOS)
186
-
187
- # Convert the PIL image to a NumPy array and force a full copy,
188
- # ensuring the returned array is entirely independent of the original buffer.
189
- img_arr = np.array(pil_image).copy()
190
-
182
+ img_arr = scale_numpy_image(img_arr, scale_tuple)
191
183
  # Apply padding if specified
192
184
  if padding_tuple:
193
185
  img_arr, (pad_width, pad_height) = pad_image(
@@ -250,7 +242,7 @@ def extract_simple_images_from_pdfium_page(page, max_depth):
250
242
  try:
251
243
  # Attempt to retrieve the image bitmap
252
244
  image_numpy: np.ndarray = pdfium_try_get_bitmap_as_numpy(obj) # noqa
253
- image_base64: str = numpy_to_base64(image_numpy)
245
+ image_base64: str = numpy_to_base64(image_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
254
246
  image_bbox = obj.get_pos()
255
247
  image_size = obj.get_size()
256
248
  if image_size[0] < 10 and image_size[1] < 10:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.7.14.dev20250714
3
+ Version: 2025.7.15.dev20250715
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -217,6 +217,7 @@ Requires-Dist: backoff==2.2.1
217
217
  Requires-Dist: pandas>=2.0
218
218
  Requires-Dist: pydantic>2.0.0
219
219
  Requires-Dist: pydantic-settings>2.0.0
220
+ Requires-Dist: tritonclient
220
221
  Dynamic: license-file
221
222
 
222
223
  # nv-ingest-api
@@ -31,8 +31,8 @@ nv_ingest_api/internal/extract/pdf/pdf_extractor.py,sha256=CxtWaD6mql9MEqSdk2CfS
31
31
  nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIRiozIRw70Kybw3A72-lcKFeoTI,582
32
32
  nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
33
33
  nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
34
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=Uqj1NH7yWga9P6_vCzgny1WKALfF--UdAaGHUF8K_aQ,22926
35
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=fDbrZwJ-lgeHYOq107WXehzdSvyF8zEDza_9UkDm5aE,22360
34
+ nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=XNYz4S2tMFBv0KFzXNERrVs-1raxJ_iIIXpBGlJFcD0,22987
35
+ nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=vtdBue1EEQJsHcBuX3NdPutbLfyKPIzily6JOK6yV0w,22421
36
36
  nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
37
37
  nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
38
38
  nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ2CZs167juvEZ-uV6qXWQjR08hhIu8otk2MWj4,4931
@@ -55,12 +55,12 @@ nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1E
55
55
  nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubkHs4WjnexM6rI0wkjWCsrVNEbA4Wjk2oKL9OYCU,1499
56
56
  nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
57
57
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=x35a9AyTYxpESQflLo_YnhVOKblQKVen6vGGFaXmNiE,9927
58
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=MFWPqMTXs_MZG3ripRR21o7f_mVeoE46Q10yvJ8KNr0,7023
58
+ nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
59
59
  nv_ingest_api/internal/primitives/nim/model_interface/paddle.py,sha256=rSUPwl5XOrqneoS6aKhatVjrNBg_LhP3nwUWS_aTwz0,17950
60
60
  nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=5PqD2JuHY2rwd-6SSB4axr2Dd79vm95sAEkcmI3U7ME,12977
61
61
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
62
62
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=qJ382PU1ZrIM-SR3cqIhtY_W2rmHec2HIa2aUB2SvaU,6031
63
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=uYXqdvqgkyS4Yfr9ZoikRDX4e94OV3ch3Xhv3JVg-3s,49581
63
+ nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=nsfDQgeupBe9Tdf3S5sfNpYcObEwVlzCZdfg1ObAW88,49584
64
64
  nv_ingest_api/internal/primitives/tracing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
65
  nv_ingest_api/internal/primitives/tracing/latency.py,sha256=5kVTeYRbRdTlT_aI4MeS20N_S7mqCcLqZR6YHtxhXkY,2215
66
66
  nv_ingest_api/internal/primitives/tracing/logging.py,sha256=SSzIgS7afLH-e1C7VagYDmkkA6rTXmQ-bmtLjoEguhg,3851
@@ -123,7 +123,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
123
123
  nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
124
124
  nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
125
125
  nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
126
- nv_ingest_api/util/image_processing/transforms.py,sha256=Kz9hrizV314Hy7cRCYK9ZmhmBbVUOZ_z0HEpzZYcslQ,14081
126
+ nv_ingest_api/util/image_processing/transforms.py,sha256=CJVGQgUvHk_mzihR8ZZrvwJUBgUYcgFAKzXyRTmKdCE,23371
127
127
  nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
128
128
  nv_ingest_api/util/imports/callable_signatures.py,sha256=e2bJB1pmkN4Ee-Bf-VggOSBaQ4RXofWF5eKkWXgIj2U,1855
129
129
  nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=7GByV_-8z2X0tnVoabCxVioxOP3sYMros3ZllVAW-wY,4343
@@ -140,7 +140,7 @@ nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ
140
140
  nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
141
141
  nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
142
142
  nv_ingest_api/util/pdf/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
143
- nv_ingest_api/util/pdf/pdfium.py,sha256=Ch9Gh5jRLcBr3stjCckqWwTUL-T0sI50PlQnZHo_9NA,15761
143
+ nv_ingest_api/util/pdf/pdfium.py,sha256=qTiTlSaiCk_rxm_eoQBoAFKq_5OQrioHVSbPbGDxVkE,15668
144
144
  nv_ingest_api/util/schema/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
145
145
  nv_ingest_api/util/schema/schema_validator.py,sha256=H0yZ_i_HZaiBRUCGmTBfRB9-hURhVqyd10aS_ynM1_0,321
146
146
  nv_ingest_api/util/service_clients/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -153,8 +153,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
153
153
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
154
154
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
156
- nv_ingest_api-2025.7.14.dev20250714.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
157
- nv_ingest_api-2025.7.14.dev20250714.dist-info/METADATA,sha256=ZSDiSF9iqAtQvebMJ1Xp4Y_Uee8FqaZwEshVsywq_5I,13919
158
- nv_ingest_api-2025.7.14.dev20250714.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
- nv_ingest_api-2025.7.14.dev20250714.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
160
- nv_ingest_api-2025.7.14.dev20250714.dist-info/RECORD,,
156
+ nv_ingest_api-2025.7.15.dev20250715.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
157
+ nv_ingest_api-2025.7.15.dev20250715.dist-info/METADATA,sha256=OWZyeCR9DZ23SdT0RcMdodCkxR508CZZaVczdM3qXPE,13947
158
+ nv_ingest_api-2025.7.15.dev20250715.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
+ nv_ingest_api-2025.7.15.dev20250715.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
160
+ nv_ingest_api-2025.7.15.dev20250715.dist-info/RECORD,,