nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,850 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from math import ceil
7
+ from math import floor
8
+ from typing import Optional
9
+ from typing import Tuple
10
+
11
+ import cv2
12
+ import numpy as np
13
+ from io import BytesIO
14
+ from PIL import Image
15
+
16
+ from nv_ingest_api.util.converters import bytetools
17
+
18
+ # Configure OpenCV to use a single thread for image processing
19
+ cv2.setNumThreads(1)
20
+ DEFAULT_MAX_WIDTH = 1024
21
+ DEFAULT_MAX_HEIGHT = 1280
22
+
23
+ # Workaround for PIL.Image.DecompressionBombError
24
+ Image.MAX_IMAGE_PIXELS = None
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def _resize_image_opencv(
    array: np.ndarray, target_size: Tuple[int, int], interpolation=cv2.INTER_LANCZOS4
) -> np.ndarray:
    """
    Resample an image array to a new size by delegating to ``cv2.resize``.

    Parameters
    ----------
    array : np.ndarray
        The image to resize.
    target_size : Tuple[int, int]
        Requested output size, ordered (width, height).
    interpolation : int, optional
        OpenCV interpolation flag handed to ``cv2.resize``. Defaults to
        ``cv2.INTER_LANCZOS4``.

    Returns
    -------
    np.ndarray
        The resized image produced by OpenCV.
    """
    resized = cv2.resize(array, target_size, interpolation=interpolation)
    return resized
50
+
51
+
52
def rgba_to_rgb_white_bg(rgba_image):
    """
    Convert an RGBA image to RGB by alpha-blending it over a white background.

    Parameters
    ----------
    rgba_image : numpy.ndarray
        Input image with shape (height, width, 4); channels are Red, Green,
        Blue, Alpha. Two value scales are accepted:

        - integer dtypes (e.g. uint8): all channels are on the [0, 255] scale;
        - floating dtypes: if any alpha value exceeds 1.0 the alpha channel is
          taken to be on the [0, 255] scale, otherwise alpha is on the [0, 1]
          scale. When alpha is on the [0, 1] scale and no colour value exceeds
          1.0, the colour channels are treated as [0, 1] as well and are
          rescaled to [0, 255].

    Returns
    -------
    numpy.ndarray
        RGB image with shape (height, width, 3) and dtype uint8.

    Raises
    ------
    ValueError
        If the input is not a 3-dimensional array with exactly 4 channels.

    Notes
    -----
    The blending formula is::

        RGB_out = RGB_in * alpha + 255 * (1 - alpha)

    Fully opaque pixels keep their colour, fully transparent pixels become
    white, and everything in between is mixed proportionally. The blended
    values are rounded to the nearest integer and clipped to [0, 255] before
    the uint8 conversion, so floating-point error cannot turn 255 into 254.

    Examples
    --------
    >>> import numpy as np
    >>> rgba = np.array([[[255, 0, 0, 255], [0, 0, 0, 0]]], dtype=np.uint8)
    >>> rgba_to_rgb_white_bg(rgba).tolist()
    [[[255, 0, 0], [255, 255, 255]]]
    """
    rgba = np.asarray(rgba_image)
    if rgba.ndim != 3 or rgba.shape[2] != 4:
        raise ValueError(f"Expected an RGBA array of shape (height, width, 4), got shape {rgba.shape}")

    # Nothing to blend; also avoids calling max() on a zero-size array.
    if rgba.size == 0:
        return np.empty(rgba.shape[:2] + (3,), dtype=np.uint8)

    # astype() copies, so the in-place scaling below never touches the caller's data.
    rgb = rgba[:, :, :3].astype(np.float32)
    alpha = rgba[:, :, 3:4].astype(np.float32)

    if np.issubdtype(rgba.dtype, np.integer) or alpha.max() > 1.0:
        # Integer images are always on the 0-255 scale, even when every alpha
        # value happens to be 0 or 1; float images are only rescaled when the
        # data itself shows a 0-255 alpha channel.
        alpha /= 255.0
    elif rgb.max() <= 1.0:
        # Float image entirely on the [0, 1] scale: bring colours up to the
        # 0-255 scale of the white background they are blended with.
        rgb *= 255.0

    blended = rgb * alpha + 255.0 * (1.0 - alpha)

    # Round (not truncate) and clip so the uint8 cast is exact and cannot wrap.
    return np.clip(np.rint(blended), 0, 255).astype(np.uint8)
112
+
113
+
114
def scale_image_to_encoding_size(
    base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9, format: str = "PNG", **kwargs
) -> Tuple[str, Tuple[int, int]]:
    """
    Decode a base64 image, re-encode it in ``format`` and downscale it until the
    encoded string fits within ``max_base64_size`` characters.

    Parameters
    ----------
    base64_image : str
        Base64-encoded image string.
    max_base64_size : int, optional
        Maximum allowed length of the returned base64 string, by default 180,000.
    initial_reduction : float, optional
        Scale factor applied to the original dimensions on the first resize
        attempt, by default 0.9. Each further attempt multiplies the factor by 0.95.
    format : str, optional
        Image format used for re-encoding; passed to ``numpy_to_base64``.
        Defaults to "PNG".
    **kwargs
        Additional keyword arguments forwarded unchanged to ``numpy_to_base64``
        (format-specific encoder options such as JPEG quality or PNG compression).

    Returns
    -------
    Tuple[str, Tuple[int, int]]
        - The base64-encoded image in the requested format, resized if necessary.
        - The size of the returned image as (width, height).

    Raises
    ------
    ValueError
        If the image would have to shrink below 1 pixel in either dimension
        to satisfy ``max_base64_size``.
    Exception
        Any error raised while decoding, resizing or encoding is logged and re-raised.
    """
    try:
        img_array = base64_to_numpy(base64_image)

        # Array shape is (height, width[, channels]); sizes here are (width, height).
        original_width, original_height = img_array.shape[1], img_array.shape[0]
        new_size = (original_width, original_height)

        if len(base64_image) <= max_base64_size:
            # The input fits, but re-encoding (possibly into another format) can
            # grow it, so the limit is checked again on the re-encoded string
            # instead of returning it blindly.
            base64_image = numpy_to_base64(img_array, format=format, **kwargs)
            if len(base64_image) <= max_base64_size:
                return base64_image, new_size

        reduction_step = initial_reduction
        while len(base64_image) > max_base64_size:
            new_size = (int(original_width * reduction_step), int(original_height * reduction_step))
            if new_size[0] < 1 or new_size[1] < 1:
                raise ValueError("Image cannot be resized further without becoming too small.")

            # Always resize from the original pixels so quality loss does not compound.
            resized = _resize_image_opencv(img_array, new_size)
            base64_image = numpy_to_base64(resized, format=format, **kwargs)

            # Still too large: shrink the scale factor for the next attempt.
            if len(base64_image) > max_base64_size:
                reduction_step *= 0.95

        return base64_image, new_size

    except Exception as e:
        logger.error(f"Error resizing the image: {e}")
        raise
186
+
187
+
188
def _detect_base64_image_format(base64_string: str) -> Optional[str]:
    """
    Identify the image format of a base64-encoded image by opening it with Pillow.

    Parameters
    ----------
    base64_string : str
        Base64-encoded image string.

    Returns
    -------
    str
        The upper-cased format name reported by Pillow (for example "PNG" or
        "JPEG"), or "UNKNOWN" when the decoded bytes cannot be opened or
        identified.

    Raises
    ------
    ValueError
        If the input cannot be base64-decoded.
    """
    try:
        raw_bytes = bytetools.bytesfrombase64(base64_string)
    except Exception as err:
        logger.error(f"Invalid base64 string: {err}")
        raise ValueError(f"Invalid base64 string: {err}") from err

    try:
        with Image.open(BytesIO(raw_bytes)) as opened:
            detected = opened.format.upper()
    except ImportError:
        raise ImportError("Pillow library not available")
    except Exception as err:
        # Any failure to open or identify the image is reported as "UNKNOWN".
        logger.error(f"Error detecting image format: {err}")
        return "UNKNOWN"

    return detected
215
+
216
+
217
def ensure_base64_format(base64_image: str, target_format: str = "PNG", **kwargs) -> str:
    """
    Ensure a base64-encoded image is in the requested format, converting only if needed.

    The current format is detected first; when it already matches the target the
    input string is returned unchanged.

    Parameters
    ----------
    base64_image : str
        Base64-encoded image string.
    target_format : str, optional
        The target image format. Case-insensitive; "JPG" is treated as "JPEG".
        Defaults to "PNG".
    **kwargs
        Additional keyword arguments forwarded unchanged to ``numpy_to_base64``
        (format-specific encoder options such as JPEG quality or PNG compression).

    Returns
    -------
    str
        Base64-encoded image string in the specified format.

    Raises
    ------
    ValueError
        If the current format cannot be detected, or if decoding/re-encoding fails.
    ImportError
        Propagated unchanged if raised during conversion.
    """
    # Quick format normalization
    target_format = target_format.upper().strip()
    if target_format == "JPG":
        target_format = "JPEG"

    current_format = _detect_base64_image_format(base64_image)
    if current_format == "UNKNOWN":
        # Report only the length and a short prefix: the full payload can be
        # megabytes long and would flood logs and tracebacks.
        raise ValueError(
            f"Unable to decode image from base64 string (length={len(base64_image)}, "
            f"prefix={base64_image[:32]!r}), because current format could not be detected."
        )
    if current_format == target_format:
        logger.debug(f"Image already in {target_format} format, skipping conversion")
        return base64_image

    try:
        img_array = base64_to_numpy(base64_image)
        return numpy_to_base64(img_array, format=target_format, **kwargs)
    except ImportError:
        # Missing optional dependencies are not a conversion error; surface them as-is.
        raise
    except Exception as e:
        logger.error(f"Error converting image to {target_format} format: {e}")
        raise ValueError(f"Failed to convert image to {target_format} format: {e}") from e
267
+
268
+
269
def pad_image(
    array: np.ndarray,
    target_width: int = DEFAULT_MAX_WIDTH,
    target_height: int = DEFAULT_MAX_HEIGHT,
    background_color: int = 255,
    dtype=np.uint8,
    how: str = "center",
) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Pads a NumPy array representing an image to the specified target dimensions.

    If a target dimension is smaller than the image, no padding is applied in that
    dimension and the image keeps its original size there. Otherwise the image is
    placed on a canvas of the target size filled with ``background_color``.

    Parameters
    ----------
    array : np.ndarray
        The input image, shape (H, W, C) or (H, W).
    target_width : int, optional
        The desired width of the padded image. Defaults to DEFAULT_MAX_WIDTH.
    target_height : int, optional
        The desired height of the padded image. Defaults to DEFAULT_MAX_HEIGHT.
    background_color : int, optional
        Fill value for the padded area. Defaults to 255 (white for uint8 images).
    dtype : data-type, optional
        dtype of the returned canvas. Defaults to np.uint8.
    how : str, optional
        "center" places the image in the middle of the canvas; "bottom_right"
        keeps the image at the top-left corner so all padding is added to the
        bottom and right. Defaults to "center".

    Returns
    -------
    padded_array : np.ndarray
        The padded image, shape (max(H, target_height), max(W, target_width)[, C]).
    padding_offsets : Tuple[int, int]
        (pad_width, pad_height): the horizontal and vertical offset of the image's
        top-left corner inside the canvas. Always (0, 0) for "bottom_right".

    Raises
    ------
    ValueError
        If ``how`` is neither "center" nor "bottom_right".

    Examples
    --------
    >>> image = np.random.randint(0, 255, (600, 800, 3), dtype=np.uint8)
    >>> padded_image, offsets = pad_image(image, target_width=1000, target_height=1000)
    >>> padded_image.shape
    (1000, 1000, 3)
    >>> offsets
    (100, 200)
    """
    # Reject unknown modes up front instead of failing later with an
    # UnboundLocalError on the offsets.
    if how not in ("center", "bottom_right"):
        raise ValueError(f"Unsupported padding mode: {how!r}. Expected 'center' or 'bottom_right'.")

    height, width = array.shape[:2]

    # Final canvas size (equal to the original in any dimension where the target is smaller).
    final_height = max(height, target_height)
    final_width = max(width, target_width)

    # array.shape[2:] is empty for 2-D input, so grayscale images are supported too.
    canvas = np.full((final_height, final_width) + tuple(array.shape[2:]), background_color, dtype=dtype)

    if how == "center":
        pad_height = max((target_height - height) // 2, 0)
        pad_width = max((target_width - width) // 2, 0)
    else:  # "bottom_right": image stays anchored at the top-left corner
        pad_height, pad_width = 0, 0

    canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203

    return canvas, (pad_width, pad_height)
339
+
340
+
341
def check_numpy_image_size(image: np.ndarray, min_height: int, min_width: int) -> bool:
    """
    Report whether an image meets a minimum height and width.

    Parameters:
        image (np.ndarray): The image array; its first two axes are read as (height, width).
        min_height (int): The minimum height required.
        min_width (int): The minimum width required.

    Returns:
        bool: True if height >= min_height and width >= min_width, False otherwise.

    Raises:
        ValueError: If the array has fewer than 2 dimensions.
    """
    if image.ndim >= 2:
        rows, cols = image.shape[0], image.shape[1]
        return not (rows < min_height or cols < min_width)

    raise ValueError("The input array does not have sufficient dimensions for an image.")
359
+
360
+
361
def crop_image(
    array: np.array, bbox: Tuple[int, int, int, int], min_width: int = 1, min_height: int = 1
) -> Optional[np.ndarray]:
    """
    Cut a rectangular region out of an image array.

    The bounding box is expanded outward to whole pixels (floor on the low
    edges, ceil on the high edges) and clamped to the image bounds before
    cropping.

    Parameters
    ----------
    array : np.array
        The image as a NumPy array; the first axis is rows, the second is columns.
    bbox : Tuple[int, int, int, int]
        The region to keep, given as (w1, h1, w2, h2): left column, top row,
        right column, bottom row.
    min_width : int, optional
        Smallest acceptable crop width; narrower crops yield None. Default is 1.
    min_height : int, optional
        Smallest acceptable crop height; shorter crops yield None. Default is 1.

    Returns
    -------
    Optional[np.ndarray]
        The cropped region, or None when the clamped box is smaller than the
        requested minimum width or height.
    """
    left, top, right, bottom = bbox
    image_height, image_width = array.shape[0], array.shape[1]

    # Snap to pixel boundaries and keep the box inside the image.
    top = max(floor(top), 0)
    bottom = min(ceil(bottom), image_height)
    left = max(floor(left), 0)
    right = min(ceil(right), image_width)

    too_narrow = right - left < min_width
    too_short = bottom - top < min_height
    if too_narrow or too_short:
        return None

    return array[top:bottom, left:right]
398
+
399
+
400
def normalize_image(
    array: np.ndarray,
    r_mean: float = 0.485,
    g_mean: float = 0.456,
    b_mean: float = 0.406,
    r_std: float = 0.229,
    g_std: float = 0.224,
    b_std: float = 0.225,
) -> np.ndarray:
    """
    Normalizes an RGB image by applying a mean and standard deviation to each channel.

    Parameters:
    ----------
    array : np.ndarray
        The input image array, shape (height, width, 3) for RGB images, or
        (height, width) / (height, width, 1) for grayscale images. Grayscale
        input is converted to RGB by repeating the grayscale values across all
        three channels.
    r_mean : float, optional
        The mean to be subtracted from the red channel (default is 0.485).
    g_mean : float, optional
        The mean to be subtracted from the green channel (default is 0.456).
    b_mean : float, optional
        The mean to be subtracted from the blue channel (default is 0.406).
    r_std : float, optional
        The standard deviation to divide the red channel by (default is 0.229).
    g_std : float, optional
        The standard deviation to divide the green channel by (default is 0.224).
    b_std : float, optional
        The standard deviation to divide the blue channel by (default is 0.225).

    Returns:
    -------
    np.ndarray
        A float32 array of shape (height, width, 3) holding
        ``(array / 255 - mean) / std`` per channel.

    Notes:
    -----
    The input pixel values should be in the range [0, 255]; they are scaled to
    [0, 1] before the mean/std normalization is applied.
    """
    # Grayscale -> RGB: replicate the single channel so R == G == B, as the
    # contract above states (the extra channels must not be filled with a constant).
    if array.ndim == 2:
        array = array[:, :, np.newaxis]
    if array.shape[2] == 1:
        array = np.repeat(array, 3, axis=2)

    # Shape (1, 1, 3) so the statistics broadcast over every pixel.
    mean = np.array([r_mean, g_mean, b_mean]).reshape((1, 1, 3)).astype(np.float32)
    std = np.array([r_std, g_std, b_std]).reshape((1, 1, 3)).astype(np.float32)

    return (array.astype("float32") / 255.0 - mean) / std
458
+
459
+
460
def _preprocess_numpy_array(array: np.ndarray) -> np.ndarray:
    """
    Prepare an image array for OpenCV encoding.

    A trailing single-channel axis is dropped, the data is cast to uint8, and
    three-channel images are reordered from RGB to BGR.

    Parameters
    ----------
    array : np.ndarray
        The input image as a NumPy array. Three-channel input is assumed to be
        in RGB order.

    Returns
    -------
    np.ndarray
        The uint8 array: (h, w) for single-channel input, BGR for three-channel
        input, otherwise the cast array with its channel order untouched.

    Raises
    ------
    ValueError
        If any step of the conversion fails.
    """
    try:
        # cv2 expects grayscale as (h, w), so collapse (h, w, 1).
        has_singleton_channel = array.ndim == 3 and array.shape[2] == 1
        if has_singleton_channel:
            array = np.squeeze(array, axis=2)

        as_uint8 = array.astype(np.uint8)

        # Only three-channel images need the RGB -> BGR swap for OpenCV.
        if as_uint8.ndim != 3 or as_uint8.shape[2] != 3:
            return as_uint8

        return cv2.cvtColor(as_uint8, cv2.COLOR_RGB2BGR)
    except Exception as e:
        raise ValueError(f"Failed to preprocess NumPy array for image encoding: {e}")
498
+
499
+
500
def _encode_opencv_jpeg(array: np.ndarray, *, quality: int = 100) -> bytes:
    """
    Encode an image array as JPEG bytes with OpenCV.

    ``quality`` is passed as the ``cv2.IMWRITE_JPEG_QUALITY`` setting. Raises
    RuntimeError when ``cv2.imencode`` reports failure.
    """
    jpeg_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    success, encoded = cv2.imencode(".jpg", array, jpeg_params)
    if success:
        return encoded.tobytes()
    raise RuntimeError("cv2.imencode failed")
506
+
507
+
508
def _encode_opencv_png(array: np.ndarray, *, compression: int = 6) -> bytes:
    """
    Encode an image array as PNG bytes with OpenCV.

    ``compression`` is passed as the ``cv2.IMWRITE_PNG_COMPRESSION`` setting and
    the default PNG strategy is requested explicitly. Raises RuntimeError when
    ``cv2.imencode`` reports failure.
    """
    png_params = [cv2.IMWRITE_PNG_COMPRESSION, compression]
    png_params += [cv2.IMWRITE_PNG_STRATEGY, cv2.IMWRITE_PNG_STRATEGY_DEFAULT]
    success, encoded = cv2.imencode(".png", array, png_params)
    if success:
        return encoded.tobytes()
    raise RuntimeError("cv2.imencode(.png) failed")
520
+
521
+
522
def numpy_to_base64_png(array: np.ndarray) -> str:
    """
    Encode an image array as PNG with OpenCV and return it as a base64 string.

    Parameters
    ----------
    array : np.ndarray
        The image to encode. It is handed to the PNG encoder as-is, so it must
        already be in a form OpenCV can encode (presumably the output of
        ``_preprocess_numpy_array`` - confirm against callers).

    Returns
    -------
    str
        The base64-encoded PNG representation of the image.

    Raises
    ------
    RuntimeError
        If PNG encoding or the base64 conversion fails.
    """
    try:
        encoded = bytetools.base64frombytes(_encode_opencv_png(array))
    except Exception as e:
        raise RuntimeError(f"Failed to encode image to base64 PNG: {e}")

    return encoded
551
+
552
+
553
def numpy_to_base64_jpeg(array: np.ndarray, quality: int = 100) -> str:
    """
    Converts a preprocessed NumPy array representing an image to a base64-encoded JPEG string using OpenCV.

    Parameters
    ----------
    array : np.ndarray
        The preprocessed input image as a NumPy array. It is passed straight to
        ``_encode_opencv_jpeg`` without further conversion.
    quality : int, optional
        JPEG quality (1-100), by default 100. Higher values mean better quality but larger file size.

    Returns
    -------
    str
        The base64-encoded JPEG string representation of the input NumPy array.

    Raises
    ------
    RuntimeError
        If JPEG encoding or base64 conversion fails for any reason. The original
        exception is attached as ``__cause__``.
    """
    try:
        # Encode to JPEG bytes using OpenCV
        jpeg_bytes = _encode_opencv_jpeg(array, quality=quality)

        # Convert to base64
        base64_img = bytetools.base64frombytes(jpeg_bytes)
    except Exception as e:
        # Chain explicitly so the underlying failure is recorded as the cause,
        # matching the error-handling style used by base64_to_numpy.
        raise RuntimeError(f"Failed to encode image to base64 JPEG: {e}") from e

    return base64_img
584
+
585
+
586
def numpy_to_base64(array: np.ndarray, format: str = "PNG", **kwargs) -> str:
    """
    Preprocess an image array and return it as a base64-encoded PNG or JPEG string.

    The array is first run through ``_preprocess_numpy_array``; the result is then
    dispatched to the PNG or JPEG encoder according to ``format``.

    Parameters
    ----------
    array : np.ndarray
        The input image, e.g. shape (H, W, C) with C == 3 for RGB.
    format : str, optional
        Target encoding. Matched case-insensitively after stripping whitespace;
        "JPG" is accepted as an alias for "JPEG". Defaults to "PNG".
    **kwargs
        Only ``quality`` (int, default 100) is read, and only for JPEG output.
        Other keys are ignored.

    Returns
    -------
    str
        The base64-encoded image in the requested format.

    Raises
    ------
    ValueError
        If preprocessing rejects the array, or if ``format`` is neither PNG nor JPEG.
        The format check runs after preprocessing.
    RuntimeError
        If encoding or base64 conversion fails.

    Examples
    --------
    >>> array = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
    >>> isinstance(numpy_to_base64(array, format="PNG"), str)
    True
    >>> isinstance(numpy_to_base64(array, format="jpg", quality=90), str)
    True
    """
    prepared = _preprocess_numpy_array(array)

    # Normalize the requested format, folding the "JPG" alias into "JPEG".
    fmt = format.upper().strip()
    if fmt == "JPG":
        fmt = "JPEG"

    if fmt == "JPEG":
        return numpy_to_base64_jpeg(prepared, quality=kwargs.get("quality", 100))
    if fmt == "PNG":
        return numpy_to_base64_png(prepared)

    raise ValueError(f"Unsupported format: {fmt}. Supported formats are 'PNG' and 'JPEG'.")
644
+
645
+
646
def base64_to_numpy(base64_string: str) -> np.ndarray:
    """
    Convert a base64-encoded image string to a NumPy array using OpenCV.
    Returns images in RGB format for consistency.

    Parameters
    ----------
    base64_string : str
        Base64-encoded string representing an image.

    Returns
    -------
    numpy.ndarray
        NumPy array representation of the decoded image in RGB format (for color images).
        Four-channel images are first reduced to three channels via
        ``rgba_to_rgb_white_bg``. Grayscale (2-D) images are returned as-is.

    Raises
    ------
    ValueError
        If the base64 string is invalid or cannot be decoded into an image.

    Examples
    --------
    >>> base64_str = '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBD...'
    >>> img_array = base64_to_numpy(base64_str)
    >>> # img_array is now in RGB format (for color images)
    """
    try:
        # Decode the base64 string to bytes using bytetools
        image_bytes = bytetools.bytesfrombase64(base64_string)
    except Exception as e:
        raise ValueError("Invalid base64 string") from e

    # Create numpy buffer from bytes and decode using OpenCV
    buf = np.frombuffer(image_bytes, dtype=np.uint8)
    try:
        img = cv2.imdecode(buf, cv2.IMREAD_UNCHANGED)
        if img is None:
            raise ValueError("OpenCV failed to decode image")

        # Convert 4 channel to 3 channel if necessary.
        # Check ndim first: with IMREAD_UNCHANGED a grayscale image decodes to a
        # 2-D (H, W) array, and indexing shape[2] on it would raise IndexError,
        # turning every valid grayscale image into a decode failure.
        # NOTE(review): OpenCV yields BGRA here; rgba_to_rgb_white_bg presumably
        # only relies on alpha being the last channel - confirm.
        if img.ndim == 3 and img.shape[2] == 4:
            img = rgba_to_rgb_white_bg(img)

        # Convert BGR to RGB for consistent processing (OpenCV loads as BGR)
        # Only convert if it's a 3-channel color image
        if img.ndim == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except ImportError:
        raise
    except Exception as e:
        raise ValueError("Unable to decode image from base64 string") from e

    # Convert to numpy array
    img = np.array(img)
    # Assert that 3-channel images are in RGB format after conversion
    assert img.ndim <= 3, f"Image has unexpected number of dimensions: {img.ndim}"
    assert img.ndim != 3 or img.shape[2] == 3, f"3-channel image should have 3 channels, got: {img.shape[2]}"

    return img
706
+
707
+
708
def scale_numpy_image(
    img_arr: np.ndarray, scale_tuple: Optional[Tuple[int, int]] = None, interpolation=Image.LANCZOS
) -> np.ndarray:
    """
    Shrink a NumPy image to fit within a bounding size using PIL's ``thumbnail``.

    Parameters
    ----------
    img_arr : np.ndarray
        The input image as a NumPy array.
    scale_tuple : Optional[Tuple[int, int]], optional
        Bounding size handed to ``Image.thumbnail``, which keeps the aspect ratio.
        When falsy (the default ``None``), no resizing is performed.
    interpolation : optional
        Resampling filter forwarded to ``Image.thumbnail``. Defaults to ``Image.LANCZOS``.

    Returns
    -------
    np.ndarray
        A copy of the (possibly resized) image data; the input array is never
        returned directly.
    """
    if not scale_tuple:
        # Nothing to resize; still hand back a copy rather than the caller's array.
        return img_arr.copy()

    # Using PIL for scaling as CV2 seems to lead to different results
    # TODO: Remove when we move to YOLOX Ensemble Models
    pil_image = Image.fromarray(img_arr)
    pil_image.thumbnail(scale_tuple, interpolation)  # resizes pil_image in place
    resized = np.array(pil_image)
    return resized.copy()
742
+
743
+
744
def base64_to_disk(base64_string: str, output_path: str) -> bool:
    """
    Decode base64 image data and write the raw bytes to a file.

    The decoded bytes are written exactly as decoded; no image decoding or
    re-encoding takes place, so the original image format is preserved.

    Parameters
    ----------
    base64_string : str
        Base64-encoded image data. If it contains a comma (as in a data URL such as
        "data:image/jpeg;base64,..."), only the segment after the first comma is decoded.
    output_path : str
        Path of the file to write (opened in binary write mode).

    Returns
    -------
    bool
        True if the bytes were written. False if the input is empty or blank,
        decodes to no bytes, or any exception occurs (the exception is logged).

    Examples
    --------
    >>> success = base64_to_disk(image_b64, "/path/to/output.jpeg")
    >>> if success:
    ...     print("Image saved successfully")
    """
    try:
        # Reject missing or whitespace-only input up front.
        if not (base64_string and base64_string.strip()):
            return False

        # Strip data URL prefix if present (e.g., "data:image/jpeg;base64,")
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]

        raw = bytetools.bytesfrombase64(base64_string)

        # An empty payload means there is nothing worth writing.
        if not raw:
            return False

        with open(output_path, "wb") as out_file:
            out_file.write(raw)
    except Exception as e:
        logger.error(f"Failed to write base64 image to disk: {e}")
        return False

    return True
793
+
794
+
795
def save_image_to_disk(base64_content: str, output_path: str, target_format: str = "auto", **kwargs) -> bool:
    """
    Save a base64-encoded image to disk, optionally converting its format first.

    With ``target_format="auto"`` the payload is written as-is via ``base64_to_disk``.
    Any other value routes the payload through ``ensure_base64_format`` before writing.

    Parameters
    ----------
    base64_content : str
        Base64-encoded image data.
    output_path : str
        Path where the image should be saved.
    target_format : str, optional
        Target format, e.g. "PNG", "JPEG" or "auto". Matched case-insensitively after
        stripping whitespace; "jpg" is treated as "jpeg". Default is "auto"
        (no conversion).
    **kwargs
        Forwarded unchanged to ``ensure_base64_format`` when a conversion is requested
        (e.g. ``quality`` for JPEG). Ignored for "auto".

    Returns
    -------
    bool
        True if the image was written, False otherwise. Exceptions are logged and
        reported as False rather than propagated.

    Examples
    --------
    >>> # Preserve original format (fastest)
    >>> success = save_image_to_disk(image_b64, "/path/to/output.jpeg", "auto")
    >>>
    >>> # Convert to JPEG with specific quality
    >>> success = save_image_to_disk(image_b64, "/path/to/output.jpeg", "JPEG", quality=85)
    """
    try:
        # Normalize the requested format, folding the "jpg" alias into "jpeg".
        fmt = target_format.lower().strip()
        if fmt == "jpg":
            fmt = "jpeg"

        if fmt == "auto":
            # Preserve original format - no conversion needed
            payload = base64_content
        else:
            payload = ensure_base64_format(base64_content, fmt, **kwargs)

        return base64_to_disk(payload, output_path)
    except Exception as e:
        logger.error(f"Failed to save image to disk: {e}")
        return False