nv-ingest-api 2025.7.14.dev20250714__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.

@@ -52,6 +52,8 @@ class ContentDescriptionEnum(str, Enum):
  Description for image extracted from PDF document.
  PDF_INFOGRAPHIC : str
  Description for structured infographic extracted from PDF document.
+ PDF_PAGE_IMAGE : str
+ Description for a full-page image rendered from a PDF document.
  PDF_TABLE : str
  Description for structured table extracted from PDF document.
  PDF_TEXT : str

@@ -70,6 +72,7 @@ class ContentDescriptionEnum(str, Enum):
  PDF_CHART: str = "Structured chart extracted from PDF document."
  PDF_IMAGE: str = "Image extracted from PDF document."
  PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
+ PDF_PAGE_IMAGE: str = "Full-page image rendered from a PDF document."
  PDF_TABLE: str = "Structured table extracted from PDF document."
  PDF_TEXT: str = "Unstructured text from PDF document."
  PPTX_IMAGE: str = "Image extracted from PPTX presentation."

@@ -94,6 +97,8 @@ class ContentTypeEnum(str, Enum):
  Represents image content.
  INFO_MSG : str
  Represents an informational message.
+ PAGE_IMAGE : str
+ Represents a full-page image rendered from a document.
  STRUCTURED : str
  Represents structured content.
  TEXT : str

@@ -111,6 +116,7 @@ class ContentTypeEnum(str, Enum):
  INFOGRAPHIC: str = "infographic"
  INFO_MSG: str = "info_message"
  NONE: str = "none"
+ PAGE_IMAGE: str = "page_image"
  STRUCTURED: str = "structured"
  TABLE: str = "table"
  TEXT: str = "text"
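
Illustrative note (not part of the diff): the new members give full-page renders their own content type alongside the existing image and structured types. Because ContentTypeEnum subclasses str, a member compares equal to its literal value:

    from nv_ingest_api.internal.enums.common import ContentTypeEnum

    # str-backed enum: the member and its literal value are interchangeable.
    assert ContentTypeEnum.PAGE_IMAGE == "page_image"
    assert ContentTypeEnum.PAGE_IMAGE.value == "page_image"
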
@@ -40,6 +40,7 @@ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadat
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
+ YOLOX_PAGE_IMAGE_FORMAT,
  )
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemoRetrieverParseConfigSchema
  from nv_ingest_api.util.metadata.aggregators import (

@@ -355,7 +356,7 @@ def nemoretriever_parse_extractor(
  img_numpy = crop_image(page_image, transformed_bbox)

  if img_numpy is not None:
- base64_img = numpy_to_base64(img_numpy)
+ base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
  image = Base64Image(
  image=base64_img,
  bbox=transformed_bbox,
@@ -4,20 +4,21 @@
  # Copyright (c) 2024, NVIDIA CORPORATION.

  import base64
+ import inspect
  import io
-
- import pandas as pd
- from typing import Any, Dict, List, Optional
  import logging
+ from typing import Any
+ from typing import Dict
+ from typing import List
+ from typing import Optional

- from nv_ingest_api.internal.extract.pdf.engines import (
- adobe_extractor,
- llama_parse_extractor,
- nemoretriever_parse_extractor,
- pdfium_extractor,
- tika_extractor,
- unstructured_io_extractor,
- )
+ import pandas as pd
+ from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
+ from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
+ from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
+ from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
+ from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
+ from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

  # Import extraction functions for different engines.

@@ -43,6 +44,7 @@ def _work_extract_pdf(
  extract_infographics: bool,
  extract_tables: bool,
  extract_charts: bool,
+ extract_page_as_image: bool,
  extractor_config: dict,
  execution_trace_log=None,
  ) -> Any:

@@ -52,17 +54,25 @@

  extract_method = extractor_config["extract_method"]
  extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
- return extractor_fn(
- pdf_stream,
- extract_text,
- extract_images,
- extract_infographics,
- extract_tables,
- extract_charts,
- extractor_config,
- execution_trace_log,
+
+ extractor_fn_args = dict(
+ pdf_stream=pdf_stream,
+ extract_text=extract_text,
+ extract_images=extract_images,
+ extract_infographics=extract_infographics,
+ extract_tables=extract_tables,
+ extract_charts=extract_charts,
+ extractor_config=extractor_config,
+ execution_trace_log=execution_trace_log,
  )

+ if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
+ extractor_fn_args["extract_page_as_image"] = extract_page_as_image
+ elif extract_page_as_image:
+ logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
+
+ return extractor_fn(**extractor_fn_args)
+

  @unified_exception_handler
  def _orchestrate_row_extraction(
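
Illustrative sketch (not part of the package): the rewritten dispatch builds the keyword arguments once and forwards extract_page_as_image only to engines whose signature declares it, logging a warning otherwise. The same pattern in isolation, with made-up extractor functions:

    import inspect
    import logging

    logger = logging.getLogger(__name__)

    def new_style_extractor(pdf_stream, extract_page_as_image=False, **_):  # hypothetical engine
        return f"page_image={extract_page_as_image}"

    def old_style_extractor(pdf_stream, **_):  # hypothetical engine without the new flag
        return "no page-image support"

    def dispatch(extractor_fn, pdf_stream, extract_page_as_image):
        kwargs = {"pdf_stream": pdf_stream}
        # Forward the flag only if the callee's signature accepts it.
        if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
            kwargs["extract_page_as_image"] = extract_page_as_image
        elif extract_page_as_image:
            logger.warning("extract_page_as_image requested but unsupported by this engine")
        return extractor_fn(**kwargs)

    print(dispatch(new_style_extractor, b"%PDF-", True))  # page_image=True
    print(dispatch(old_style_extractor, b"%PDF-", True))  # no page-image support (warning logged)
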
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
  extract_tables = params.pop("extract_tables", False)
  extract_charts = params.pop("extract_charts", False)
  extract_infographics = params.pop("extract_infographics", False)
+ extract_page_as_image = params.pop("extract_page_as_image", False)
  extract_method = params.get("extract_method", "pdfium")
  except KeyError as e:
  raise ValueError(f"Missing required extraction flag: {e}")

@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
  extract_text=extract_text,
  extract_images=extract_images,
  extract_infographics=extract_infographics,
+ extract_page_as_image=extract_page_as_image,
  extract_tables=extract_tables,
  extract_charts=extract_charts,
  extractor_config=extractor_config,
@@ -24,16 +24,19 @@ import numpy as np
  import pandas as pd
  import pypdfium2 as libpdfium

+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
  from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
+ YOLOX_PAGE_IMAGE_FORMAT,
  get_yolox_model_name,
  YoloxPageElementsModelInterface,
  )
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
  from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
  from nv_ingest_api.util.metadata.aggregators import (
+ construct_image_metadata_from_base64,
  construct_image_metadata_from_pdf_image,
  extract_pdf_metadata,
  construct_text_metadata,

@@ -46,6 +49,7 @@ from nv_ingest_api.util.pdf.pdfium import (
  extract_image_like_objects_from_pdfium_page,
  )
  from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
+ from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
  from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image

  logger = logging.getLogger(__name__)

@@ -186,7 +190,7 @@ def _extract_page_element_images(
  if cropped is None:
  continue

- base64_img = numpy_to_base64(cropped)
+ base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)

  bbox_in_orig_coord = (
  int(w1) - pad_width,

@@ -384,6 +388,7 @@ def pdfium_extractor(
  extract_infographics: bool,
  extract_tables: bool,
  extract_charts: bool,
+ extract_page_as_image: bool,
  extractor_config: dict,
  execution_trace_log: Optional[List[Any]] = None,
  ) -> pd.DataFrame:

@@ -524,6 +529,24 @@
  )
  extracted_data.extend(image_data)

+ # Full page image extraction
+ if extract_page_as_image:
+ page_text = _extract_page_text(page)
+ image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
+ base64_image = numpy_to_base64(image[0])
+ if len(base64_image) > 2**24 - 1:
+ base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
+ image_meta = construct_image_metadata_from_base64(
+ base64_image,
+ page_idx,
+ page_count,
+ source_metadata,
+ base_unified_metadata,
+ subtype=ContentTypeEnum.PAGE_IMAGE,
+ text=page_text,
+ )
+ extracted_data.append(image_meta)
+
  # If we want tables or charts, rasterize the page and store it
  if extract_tables or extract_charts or extract_infographics:
  image, padding_offsets = pdfium_pages_to_numpy(
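
Illustrative note (not part of the diff): the page is rendered at up to 16384 px per side and the base64 payload is only downscaled when it exceeds 2**24 - 1 characters. Rough arithmetic on that cap (an interpretation of the constant above, not a documented limit):

    MAX_B64_CHARS = 2**24 - 1                    # 16_777_215 characters
    approx_raw_bytes = MAX_B64_CHARS * 3 // 4    # base64 expands bytes by roughly 4/3
    print(MAX_B64_CHARS, approx_raw_bytes)       # 16777215 12582911
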
@@ -574,6 +597,7 @@ def pdfium_extractor(
  execution_trace_log=execution_trace_log,
  )
  futures.append(future)
+
  pages_for_tables.clear()

  # Wait for all asynchronous jobs to complete.
@@ -120,6 +120,7 @@ class NemoRetrieverParseModelInterface(ModelInterface):
  logger.debug("Formatting input for HTTP NemoRetrieverParse model")
  # Prepare payload for HTTP request

+ ## TODO: Ask @Edward Kim if we want to switch to JPEG/PNG here
  if "images" in data:
  base64_list = [numpy_to_base64(img) for img in data["images"]]
  else:
@@ -2,9 +2,7 @@
  # All rights reserved.
  # SPDX-License-Identifier: Apache-2.0

-
- import base64
- import io
+ import os
  import logging
  import warnings
  from math import log

@@ -20,11 +18,11 @@ import packaging
  import pandas as pd
  import torch
  import torchvision
- from PIL import Image

  from nv_ingest_api.internal.primitives.nim import ModelInterface
  from nv_ingest_api.internal.primitives.nim.model_interface.helpers import get_model_name
  from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64

  logger = logging.getLogger(__name__)

@@ -35,6 +33,7 @@ YOLOX_PAGE_MIN_SCORE = 0.1
  YOLOX_PAGE_NIM_MAX_IMAGE_SIZE = 512_000
  YOLOX_PAGE_IMAGE_PREPROC_HEIGHT = 1024
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH = 1024
+ YOLOX_PAGE_IMAGE_FORMAT = os.getenv("YOLOX_PAGE_IMAGE_FORMAT", "PNG")

  # yolox-page-elements-v1 contants
  YOLOX_PAGE_V1_NUM_CLASSES = 4
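
Illustrative sketch (not part of the package): YOLOX_PAGE_IMAGE_FORMAT is read from the environment at import time with a PNG default, so switching extracted images to JPEG is an environment-level choice. Assuming the module path used elsewhere in this diff:

    import os

    # Must be set before the yolox module is first imported, since the constant
    # is evaluated at import time.
    os.environ["YOLOX_PAGE_IMAGE_FORMAT"] = "JPEG"

    from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YOLOX_PAGE_IMAGE_FORMAT
    print(YOLOX_PAGE_IMAGE_FORMAT)  # "JPEG"
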
@@ -239,15 +238,11 @@ class YoloxModelInterfaceBase(ModelInterface):
  # Convert to uint8 if needed.
  if image.dtype != np.uint8:
  image = (image * 255).astype(np.uint8)
- # Convert the numpy array to a PIL Image.
- image_pil = Image.fromarray(image)
- original_size = image_pil.size
-
- # Save the image to a buffer and encode to base64.
- buffered = io.BytesIO()
- image_pil.save(buffered, format="PNG")
- image_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

+ # Get original size directly from numpy array (width, height)
+ original_size = (image.shape[1], image.shape[0])
+ # Convert numpy array directly to base64 using OpenCV
+ image_b64 = numpy_to_base64(image, format=YOLOX_PAGE_IMAGE_FORMAT)
  # Scale the image if necessary.
  scaled_image_b64, new_size = scale_image_to_encoding_size(
  image_b64, max_base64_size=self.nim_max_image_size
@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
  model_name: Optional[str] = None
  api_key: Optional[str] = None
  filter_errors: bool = False
+ text_elements_modality: Optional[str] = None
+ image_elements_modality: Optional[str] = None
+ structured_elements_modality: Optional[str] = None
+ audio_elements_modality: Optional[str] = None


  class IngestTaskVdbUploadSchema(BaseModelNoExt):

@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
  validated_task_properties = expected_schema_cls(**task_properties)
  values["type"] = task_type # ensure type is now always the enum
  values["task_properties"] = validated_task_properties
+
  return values

  @field_validator("type", mode="before")
@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
  input_type: str = Field(default="passage")
  raise_on_failure: bool = Field(default=False)
  truncate: str = Field(default="END")
+ text_elements_modality: str = Field(default="text")
+ image_elements_modality: str = Field(default="text")
+ structured_elements_modality: str = Field(default="text")
+ audio_elements_modality: str = Field(default="text")

  model_config = ConfigDict(extra="forbid")
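
Illustrative note (not part of the diff): the four new fields let an embed task pick an input representation per element type; the defaults keep today's text-only behavior, and the "text", "image", and "text_image" values are inferred from the modality branches added later in this diff. A hypothetical task-properties payload:

    # Hypothetical "embed" task properties; keys mirror the new schema fields.
    embed_task_properties = {
        "text_elements_modality": "text",               # plain text stays text
        "structured_elements_modality": "text_image",   # tables/charts: extracted text plus rendered image
        "image_elements_modality": "image",             # images: data-URI of the image content
        "audio_elements_modality": "text",              # transcripts stay text
    }
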
@@ -4,6 +4,7 @@

  import logging
  from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
  from typing import Any, Dict, Tuple, Optional, Iterable, List

  import pandas as pd

@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
  logger = logging.getLogger(__name__)


+ MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
+
+
  # ------------------------------------------------------------------------------
  # Asynchronous Embedding Requests
  # ------------------------------------------------------------------------------

@@ -33,6 +37,7 @@ def _make_async_request(
  input_type: str,
  truncate: str,
  filter_errors: bool,
+ modalities: Optional[List[str]] = None,
  ) -> list:
  """
  Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.

@@ -74,11 +79,18 @@ def _make_async_request(
  base_url=embedding_nim_endpoint,
  )

+ extra_body = {
+ "input_type": input_type,
+ "truncate": truncate,
+ }
+ if modalities:
+ extra_body["modality"] = modalities
+
  resp = client.embeddings.create(
  input=prompts,
  model=embedding_model,
  encoding_format=encoding_format,
- extra_body={"input_type": input_type, "truncate": truncate},
+ extra_body=extra_body,
  )

  response["embedding"] = resp.data
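
Illustrative sketch (not part of the package): the request remains an OpenAI-compatible embeddings call; the per-item modality list simply rides along in extra_body. A minimal reproduction of the payload shape, with the endpoint, API key, and inputs as placeholder assumptions:

    from openai import OpenAI  # same client used in the hunk above

    client = OpenAI(api_key="<api-key>", base_url="http://localhost:8000/v1")  # placeholder endpoint

    prompts = ["first passage", "data:image/png;base64,...."]  # one text and one image item
    resp = client.embeddings.create(
        input=prompts,
        model="llama-3.2-nemoretriever-1b-vlm-embed-v1",
        encoding_format="float",
        # Non-standard parameters are passed through verbatim to the NIM service.
        extra_body={"input_type": "passage", "truncate": "END", "modality": ["text", "image"]},
    )
    print(len(resp.data))
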
@@ -110,6 +122,7 @@ def _async_request_handler(
  input_type: str,
  truncate: str,
  filter_errors: bool,
+ modalities: Optional[List[str]] = None,
  ) -> List[dict]:
  """
  Gathers calculated embedding results from the NIM embedding service concurrently.

@@ -138,6 +151,9 @@
  List[dict]
  A list of response dictionaries from the embedding service.
  """
+ if modalities is None:
+ modalities = [None] * len(prompts)
+
  with ThreadPoolExecutor() as executor:
  futures = [
  executor.submit(

@@ -150,8 +166,9 @@
  input_type=input_type,
  truncate=truncate,
  filter_errors=filter_errors,
+ modalities=modality_batch,
  )
- for prompt_batch in prompts
+ for prompt_batch, modality_batch in zip(prompts, modalities)
  ]
  results = [future.result() for future in futures]

@@ -167,6 +184,7 @@ def _async_runner(
  input_type: str,
  truncate: str,
  filter_errors: bool,
+ modalities: Optional[List[str]] = None,
  ) -> dict:
  """
  Concurrently launches all NIM embedding requests and flattens the results.

@@ -204,6 +222,7 @@
  input_type,
  truncate,
  filter_errors,
+ modalities=modalities,
  )

  flat_results = {"embeddings": [], "info_msgs": []}

@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
  return row


- def _get_pandas_text_content(row):
+ def _format_image_input_string(image_b64: Optional[str]) -> str:
+ if not image_b64:
+ return
+ return f"data:image/png;base64,{image_b64}"
+
+
+ def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+ if (not text) or (not text.strip()) or (not image_b64):
+ return
+ return f"{text.strip()} {_format_image_input_string(image_b64)}"
+
+
+ def _get_pandas_text_content(row, modality="text"):
  """
  Extracts text content from a DataFrame row.

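
Illustrative note (not part of the diff): the formatting helpers turn a base64 payload into a data-URI prompt string, and the text_image variant prefixes it with the element's text. A small sketch of the strings they produce (inputs are made up):

    image_b64 = "iVBORw0KGgoAAA"  # truncated example payload

    # "image" modality -> a data URI the embedding service can decode:
    image_prompt = f"data:image/png;base64,{image_b64}"

    # "text_image" modality -> element text followed by the data URI, space separated:
    caption = "Quarterly revenue by region"
    pair_prompt = f"{caption.strip()} {image_prompt}"
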
@@ -280,7 +311,7 @@
  return row["content"]


- def _get_pandas_table_content(row):
+ def _get_pandas_table_content(row, modality="text"):
  """
  Extracts table/chart content from a DataFrame row.

@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
  str
  The table/chart content from the row.
  """
- return row.get("table_metadata", {}).get("table_content")
+ if modality == "text":
+ content = row.get("table_metadata", {}).get("table_content")
+ elif modality == "image":
+ content = _format_image_input_string(row.get("content"))
+ elif modality == "text_image":
+ text = row.get("table_metadata", {}).get("table_content")
+ image = row.get("content")
+ content = _format_text_image_pair_input_string(text, image)
+
+ return content


- def _get_pandas_image_content(row):
+ def _get_pandas_image_content(row, modality="text"):
  """
  Extracts image caption content from a DataFrame row.

@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
  str
  The image caption from the row.
  """
- return row.get("image_metadata", {}).get("caption")
+ subtype = row.get("content_metadata", {}).get("subtype")
+ if modality == "text":
+ if subtype == "page_image":
+ content = row.get("image_metadata", {}).get("text")
+ else:
+ content = row.get("image_metadata", {}).get("caption")
+ elif modality == "image":
+ content = _format_image_input_string(row.get("content"))
+ elif modality == "text_image":
+ if subtype == "page_image":
+ text = row.get("image_metadata", {}).get("text")
+ else:
+ text = row.get("image_metadata", {}).get("caption")
+ image = row.get("content")
+ content = _format_text_image_pair_input_string(text, image)

+ # A workaround to save memory.
+ row["content"] = ""
+ return content

- def _get_pandas_audio_content(row):
+
+ def _get_pandas_audio_content(row, modality="text"):
  """
  A pandas UDF used to select extracted audio transcription to be used to create embeddings.
  """
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
  # ------------------------------------------------------------------------------


+ def does_model_support_multimodal_embeddings(model: str) -> bool:
+ """
+ Checks if a given model supports multi-modal embeddings.
+
+ Parameters
+ ----------
+ model : str
+ The name of the model.
+
+ Returns
+ -------
+ bool
+ True if the model supports multi-modal embeddings, False otherwise.
+ """
+ return model in MULTI_MODAL_MODELS
+
+
  def transform_create_text_embeddings_internal(
  df_transform_ledger: pd.DataFrame,
  task_config: Dict[str, Any],
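
Illustrative note (not part of the diff): the multi-modal gate is a plain membership check against MULTI_MODAL_MODELS, which currently lists only the llama-3.2-nemoretriever-1b-vlm-embed-v1 embedder:

    # Copied from the hunks above so the sketch runs standalone.
    MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]

    def does_model_support_multimodal_embeddings(model: str) -> bool:
        return model in MULTI_MODAL_MODELS

    print(does_model_support_multimodal_embeddings("llama-3.2-nemoretriever-1b-vlm-embed-v1"))  # True
    print(does_model_support_multimodal_embeddings("some-text-only-embedder"))                  # False
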
@@ -460,6 +535,15 @@ def transform_create_text_embeddings_internal(
  ContentTypeEnum.AUDIO: _get_pandas_audio_content,
  ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
  }
+ task_type_to_modality = {
+ ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+ ContentTypeEnum.STRUCTURED: (
+ task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+ ),
+ ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+ ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+ ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
+ }

  def _content_type_getter(row):
  return row["content_metadata"]["type"]

@@ -480,7 +564,7 @@
  # Extract content and normalize empty or non-str to None
  extracted_content = (
  df_content["metadata"]
- .apply(content_getter)
+ .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
  .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
  )
  df_content["_content"] = extracted_content

@@ -488,9 +572,15 @@
  # Prepare batches for only valid (non-None) content
  valid_content_mask = df_content["_content"].notna()
  if valid_content_mask.any():
- filtered_content_batches = _generate_batches(
- df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
- )
+ filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+ filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+
+ if model_name in MULTI_MODAL_MODELS:
+ modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+ modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+ else:
+ modality_batches = None
+
  content_embeddings = _async_runner(
  filtered_content_batches,
  api_key,

@@ -500,6 +590,7 @@
  transform_config.input_type,
  transform_config.truncate,
  False,
+ modalities=modality_batches,
  )
  # Build a simple row index -> embedding map
  embeddings_dict = dict(
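
Illustrative sketch (not part of the package): for multi-modal models, each content batch is paired with a modality batch of equal length so the request handler can zip them. A toy version of that pairing, with a simplified stand-in for the module's _generate_batches helper:

    def generate_batches(items, batch_size):
        # Simplified stand-in for _generate_batches.
        return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

    contents = ["passage one", "data:image/png;base64,AAA", "passage two"]
    modality_list = ["text_image"] * len(contents)

    content_batches = generate_batches(contents, batch_size=2)
    modality_batches = generate_batches(modality_list, batch_size=2)

    for prompt_batch, modality_batch in zip(content_batches, modality_batches):
        print(prompt_batch, modality_batch)
    # ['passage one', 'data:image/png;base64,AAA'] ['text_image', 'text_image']
    # ['passage two'] ['text_image']
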
@@ -2,29 +2,55 @@
  # All rights reserved.
  # SPDX-License-Identifier: Apache-2.0

- import base64
- import io
  import logging
- from io import BytesIO
  from math import ceil
  from math import floor
  from typing import Optional
  from typing import Tuple

+ import cv2
  import numpy as np
+ from io import BytesIO
  from PIL import Image
- from PIL import UnidentifiedImageError

  from nv_ingest_api.util.converters import bytetools

+ # Configure OpenCV to use a single thread for image processing
+ cv2.setNumThreads(1)
  DEFAULT_MAX_WIDTH = 1024
  DEFAULT_MAX_HEIGHT = 1280

+ # Workaround for PIL.Image.DecompressionBombError
+ Image.MAX_IMAGE_PIXELS = None
+
  logger = logging.getLogger(__name__)


+ def _resize_image_opencv(
+ array: np.ndarray, target_size: Tuple[int, int], interpolation=cv2.INTER_LANCZOS4
+ ) -> np.ndarray:
+ """
+ Resizes a NumPy array representing an image using OpenCV.
+
+ Parameters
+ ----------
+ array : np.ndarray
+ The input image as a NumPy array.
+ target_size : Tuple[int, int]
+ The target size as (width, height).
+ interpolation : int, optional
+ OpenCV interpolation method. Defaults to cv2.INTER_LANCZOS4.
+
+ Returns
+ -------
+ np.ndarray
+ The resized image as a NumPy array.
+ """
+ return cv2.resize(array, target_size, interpolation=interpolation)
+
+
  def scale_image_to_encoding_size(
- base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9
+ base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9, format: str = "PNG", **kwargs
  ) -> Tuple[str, Tuple[int, int]]:
  """
  Decodes a base64-encoded image, resizes it if needed, and re-encodes it as base64.

@@ -38,12 +64,19 @@ def scale_image_to_encoding_size(
  Maximum allowable size for the base64-encoded image, by default 180,000 characters.
  initial_reduction : float, optional
  Initial reduction step for resizing, by default 0.9.
+ format : str, optional
+ The image format to use for encoding. Supported formats are "PNG" and "JPEG".
+ Defaults to "PNG".
+ **kwargs
+ Additional keyword arguments passed to the format-specific encoding function.
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).
+ For PNG: compression (int, default=3) - PNG compression level (0-9).

  Returns
  -------
  Tuple[str, Tuple[int, int]]
  A tuple containing:
- - Base64-encoded PNG image string, resized if necessary.
+ - Base64-encoded image string in the specified format, resized if necessary.
  - The new size as a tuple (width, height).

  Raises

@@ -52,12 +85,11 @@
  If the image cannot be resized below the specified max_base64_size.
  """
  try:
- # Decode the base64 image and open it as a PIL image
- image_data = base64.b64decode(base64_image)
- img = Image.open(io.BytesIO(image_data)).convert("RGB")
+ # Decode the base64 image using OpenCV (returns RGB format)
+ img_array = base64_to_numpy(base64_image)

- # Initial image size
- original_size = img.size
+ # Initial image size (height, width, channels) -> (width, height)
+ original_size = (img_array.shape[1], img_array.shape[0])

  # Check initial size
  if len(base64_image) <= max_base64_size:

@@ -66,23 +98,24 @@
  # Initial reduction step
  reduction_step = initial_reduction
  new_size = original_size
+ current_img = img_array.copy()
+ original_width, original_height = original_size
+
  while len(base64_image) > max_base64_size:
- width, height = img.size
- new_size = (int(width * reduction_step), int(height * reduction_step))
+ new_size = (int(original_width * reduction_step), int(original_height * reduction_step))
+ if new_size[0] < 1 or new_size[1] < 1:
+ raise ValueError("Image cannot be resized further without becoming too small.")

- img_resized = img.resize(new_size, Image.LANCZOS)
- buffered = io.BytesIO()
- img_resized.save(buffered, format="PNG")
- base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+ # Resize the image using OpenCV
+ current_img = _resize_image_opencv(img_array, new_size)
+
+ # Re-encode as base64 using the specified format
+ base64_image = numpy_to_base64(current_img, format=format, **kwargs)

  # Adjust the reduction step if necessary
  if len(base64_image) > max_base64_size:
  reduction_step *= 0.95 # Reduce size further if needed

- # Safety check
- if new_size[0] < 1 or new_size[1] < 1:
- raise Exception("Image cannot be resized further without becoming too small.")
-
  return base64_image, new_size

  except Exception as e:
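
Illustrative note (not part of the diff): in the rewritten loop each failed attempt shrinks the reduction factor by 5%, and the new dimensions are always computed from the original size rather than the previous iteration's result. A small sketch of that size schedule (an interpretation of the hunk above):

    reduction_step = 0.9
    original = (3000, 2000)
    for attempt in range(4):
        new_size = (int(original[0] * reduction_step), int(original[1] * reduction_step))
        print(attempt, new_size)
        reduction_step *= 0.95
    # (2700, 1800), (2565, 1710), (2436, 1624), (2314, 1543)
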
@@ -90,36 +123,84 @@
  raise


- def ensure_base64_is_png(base64_image: str) -> str:
+ def _detect_base64_image_format(base64_string: str) -> Optional[str]:
  """
- Ensures the given base64-encoded image is in PNG format. Converts to PNG if necessary.
+ Detects the format of a base64-encoded image using Pillow.

  Parameters
  ----------
- base64_image : str
+ base64_string : str
  Base64-encoded image string.

  Returns
  -------
- str
- Base64-encoded PNG image string.
+ The detected format ("PNG", "JPEG", "UNKNOWN")
  """
  try:
- # Decode the base64 string and load the image
- image_data = base64.b64decode(base64_image)
- image = Image.open(io.BytesIO(image_data))
+ image_bytes = bytetools.bytesfrombase64(base64_string)
+ except Exception as e:
+ logger.error(f"Invalid base64 string: {e}")
+ raise ValueError(f"Invalid base64 string: {e}") from e
+
+ try:
+ with Image.open(BytesIO(image_bytes)) as img:
+ return img.format.upper()
+ except ImportError:
+ raise ImportError("Pillow library not available")
+ except Exception as e:
+ logger.error(f"Error detecting image format: {e}")
+ return "UNKNOWN"
+
+
+ def ensure_base64_format(base64_image: str, target_format: str = "PNG", **kwargs) -> str:
+ """
+ Ensures the given base64-encoded image is in the specified format. Converts if necessary.
+ Skips conversion if the image is already in the target format.
+
+ Parameters
+ ----------
+ base64_image : str
+ Base64-encoded image string.
+ target_format : str, optional
+ The target image format. Supported formats are "PNG" and "JPEG". Defaults to "PNG".
+ **kwargs
+ Additional keyword arguments passed to the format-specific encoding function.
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).
+ For PNG: compression (int, default=3) - PNG compression level (0-9).

- # Check if the image is already in PNG format
- if image.format != "PNG":
- # Convert the image to PNG
- buffered = io.BytesIO()
- image.convert("RGB").save(buffered, format="PNG")
- base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+ Returns
+ -------
+ str
+ Base64-encoded image string in the specified format.

+ Raises
+ ------
+ ValueError
+ If there is an error during format conversion.
+ """
+ target_format = target_format.upper()
+ if target_format == "JPG":
+ target_format = "JPEG"
+
+ current_format = _detect_base64_image_format(base64_image)
+ if current_format == "UNKNOWN":
+ raise ValueError(
+ f"Unable to decode image from base64 string: {base64_image}, because current format could not be detected."
+ )
+ if current_format == target_format:
+ logger.debug(f"Image already in {target_format} format, skipping conversion")
  return base64_image
+
+ try:
+ # Decode the base64 image using OpenCV (returns RGB format)
+ img_array = base64_to_numpy(base64_image)
+ # Re-encode in the target format
+ return numpy_to_base64(img_array, format=target_format, **kwargs)
+ except ImportError as e:
+ raise e
  except Exception as e:
- logger.error(f"Error ensuring PNG format: {e}")
- return None
+ logger.error(f"Error converting image to {target_format} format: {e}")
+ raise ValueError(f"Failed to convert image to {target_format} format: {e}") from e


  def pad_image(
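
Illustrative sketch (not part of the package): ensure_base64_format replaces ensure_base64_is_png, skips work when the payload is already in the target format, and raises instead of returning None on failure. A minimal usage sketch, assuming the helpers are importable from nv_ingest_api.util.image_processing.transforms, the module these hunks modify:

    import numpy as np
    from nv_ingest_api.util.image_processing.transforms import ensure_base64_format, numpy_to_base64

    image_b64 = numpy_to_base64(np.zeros((32, 32, 3), dtype=np.uint8), format="PNG")

    jpeg_b64 = ensure_base64_format(image_b64, target_format="JPEG", quality=90)  # PNG -> JPEG
    same_b64 = ensure_base64_format(jpeg_b64, target_format="JPEG")               # already JPEG: returned unchanged
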
@@ -302,66 +383,193 @@
  return output_array


- def numpy_to_base64(array: np.ndarray) -> str:
+ def _preprocess_numpy_array(array: np.ndarray) -> np.ndarray:
+ """
+ Preprocesses a NumPy array for image encoding by ensuring proper format and data type.
+ Also handles color space conversion for OpenCV encoding.
+
+ Parameters
+ ----------
+ array : np.ndarray
+ The input image as a NumPy array.
+
+ Returns
+ -------
+ np.ndarray
+ The preprocessed array in uint8 format, ready for OpenCV encoding (BGR color order for color images).
+
+ Raises
+ ------
+ ValueError
+ If the input array cannot be converted into a valid image format.
+ """
+ # Check if the array is valid and can be converted to an image
+ try:
+ # If the array represents a grayscale image, drop the redundant axis in
+ # (h, w, 1). cv2 expects (h, w) for grayscale.
+ if array.ndim == 3 and array.shape[2] == 1:
+ array = np.squeeze(array, axis=2)
+
+ # Ensure uint8 data type
+ processed_array = array.astype(np.uint8)
+
+ # OpenCV uses BGR color order, so convert RGB to BGR if needed
+ if processed_array.ndim == 3 and processed_array.shape[2] == 3:
+ # Assume input is RGB and convert to BGR for OpenCV
+ processed_array = cv2.cvtColor(processed_array, cv2.COLOR_RGB2BGR)
+
+ return processed_array
+ except Exception as e:
+ raise ValueError(f"Failed to preprocess NumPy array for image encoding: {e}")
+
+
+ def _encode_opencv_jpeg(array: np.ndarray, *, quality: int = 100) -> bytes:
+ """NumPy array -> JPEG bytes using OpenCV."""
+ ok, buf = cv2.imencode(".jpg", array, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
+ if not ok:
+ raise RuntimeError("cv2.imencode failed")
+ return buf.tobytes()
+
+
+ def _encode_opencv_png(array: np.ndarray, *, compression: int = 6) -> bytes:
+ """NumPy array -> PNG bytes using OpenCV"""
+ encode_params = [
+ cv2.IMWRITE_PNG_COMPRESSION,
+ compression,
+ cv2.IMWRITE_PNG_STRATEGY,
+ cv2.IMWRITE_PNG_STRATEGY_DEFAULT,
+ ]
+ ok, buf = cv2.imencode(".png", array, encode_params)
+ if not ok:
+ raise RuntimeError("cv2.imencode(.png) failed")
+ return buf.tobytes()
+
+
+ def numpy_to_base64_png(array: np.ndarray) -> str:
+ """
+ Converts a preprocessed NumPy array representing an image to a base64-encoded PNG string using OpenCV.
+
+ Parameters
+ ----------
+ array : np.ndarray
+ The preprocessed input image as a NumPy array. Must have a shape compatible with image data.
+
+ Returns
+ -------
+ str
+ The base64-encoded PNG string representation of the input NumPy array.
+
+ Raises
+ ------
+ RuntimeError
+ If there is an issue during the image conversion or base64 encoding process.
+ """
+ try:
+ # Encode to PNG bytes using OpenCV
+ png_bytes = _encode_opencv_png(array)
+
+ # Convert to base64
+ base64_img = bytetools.base64frombytes(png_bytes)
+ except Exception as e:
+ raise RuntimeError(f"Failed to encode image to base64 PNG: {e}")
+
+ return base64_img
+
+
+ def numpy_to_base64_jpeg(array: np.ndarray, quality: int = 100) -> str:
+ """
+ Converts a preprocessed NumPy array representing an image to a base64-encoded JPEG string using OpenCV.
+
+ Parameters
+ ----------
+ array : np.ndarray
+ The preprocessed input image as a NumPy array. Must have a shape compatible with image data.
+ quality : int, optional
+ JPEG quality (1-100), by default 100. Higher values mean better quality but larger file size.
+
+ Returns
+ -------
+ str
+ The base64-encoded JPEG string representation of the input NumPy array.
+
+ Raises
+ ------
+ RuntimeError
+ If there is an issue during the image conversion or base64 encoding process.
+ """
+ try:
+ # Encode to JPEG bytes using OpenCV
+ jpeg_bytes = _encode_opencv_jpeg(array, quality=quality)
+
+ # Convert to base64
+ base64_img = bytetools.base64frombytes(jpeg_bytes)
+ except Exception as e:
+ raise RuntimeError(f"Failed to encode image to base64 JPEG: {e}")
+
+ return base64_img
+
+
+ def numpy_to_base64(array: np.ndarray, format: str = "PNG", **kwargs) -> str:
  """
  Converts a NumPy array representing an image to a base64-encoded string.

- The function takes a NumPy array, converts it to a PIL image, and then encodes
- the image as a PNG in a base64 string format. The input array is expected to be in
- a format that can be converted to a valid image, such as having a shape of (H, W, C)
- where C is the number of channels (e.g., 3 for RGB).
+ The function takes a NumPy array, preprocesses it, and then encodes
+ the image in the specified format as a base64 string. The input array is expected
+ to be in a format that can be converted to a valid image, such as having a shape
+ of (H, W, C) where C is the number of channels (e.g., 3 for RGB).

  Parameters
  ----------
  array : np.ndarray
  The input image as a NumPy array. Must have a shape compatible with image data.
+ format : str, optional
+ The image format to use for encoding. Supported formats are "PNG" and "JPEG".
+ Defaults to "PNG".
+ **kwargs
+ Additional keyword arguments passed to the format-specific encoding function.
+ For JPEG: quality (int, default=100) - JPEG quality (1-100).

  Returns
  -------
  str
- The base64-encoded string representation of the input NumPy array as a PNG image.
+ The base64-encoded string representation of the input NumPy array in the specified format.

  Raises
  ------
  ValueError
- If the input array cannot be converted into a valid image format.
+ If the input array cannot be converted into a valid image format, or if an
+ unsupported format is specified.
  RuntimeError
  If there is an issue during the image conversion or base64 encoding process.

  Examples
  --------
  >>> array = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
- >>> encoded_str = numpy_to_base64(array)
+ >>> encoded_str = numpy_to_base64(array, format="PNG")
  >>> isinstance(encoded_str, str)
  True
+ >>> encoded_str_jpeg = numpy_to_base64(array, format="JPEG", quality=90)
+ >>> isinstance(encoded_str_jpeg, str)
+ True
  """
- # If the array represents a grayscale image, drop the redundant axis in
- # (h, w, 1). PIL.Image.fromarray() expects an array of form (h, w) if it's
- # a grayscale image.
- if array.ndim == 3 and array.shape[2] == 1:
- array = np.squeeze(array, axis=2)
+ # Centralized preprocessing of the numpy array
+ processed_array = _preprocess_numpy_array(array)

- # Check if the array is valid and can be converted to an image
- try:
- # Convert the NumPy array to a PIL image
- pil_image = Image.fromarray(array.astype(np.uint8))
- except Exception as e:
- raise ValueError(f"Failed to convert NumPy array to image: {e}")
+ format = format.upper()

- try:
- # Convert the PIL image to a base64-encoded string
- with BytesIO() as buffer:
- pil_image.save(buffer, format="PNG")
- base64_img = bytetools.base64frombytes(buffer.getvalue())
- except Exception as e:
- raise RuntimeError(f"Failed to encode image to base64: {e}")
-
- return base64_img
+ if format == "PNG":
+ return numpy_to_base64_png(processed_array)
+ elif format == "JPEG" or format == "JPG":
+ quality = kwargs.get("quality", 100)
+ return numpy_to_base64_jpeg(processed_array, quality=quality)
+ else:
+ raise ValueError(f"Unsupported format: {format}. Supported formats are 'PNG' and 'JPEG'.")


  def base64_to_numpy(base64_string: str) -> np.ndarray:
  """
- Convert a base64-encoded image string to a NumPy array.
+ Convert a base64-encoded image string to a NumPy array using OpenCV.
+ Returns images in RGB format for consistency.

  Parameters
  ----------

@@ -371,37 +579,82 @@ def base64_to_numpy(base64_string: str) -> np.ndarray:
  Returns
  -------
  numpy.ndarray
- NumPy array representation of the decoded image.
+ NumPy array representation of the decoded image in RGB format (for color images).
+ Grayscale images are returned as-is.

  Raises
  ------
  ValueError
  If the base64 string is invalid or cannot be decoded into an image.
- ImportError
- If required libraries are not installed.

  Examples
  --------
  >>> base64_str = '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBD...'
  >>> img_array = base64_to_numpy(base64_str)
+ >>> # img_array is now in RGB format (for color images)
  """
  try:
- # Decode the base64 string
- image_data = base64.b64decode(base64_string)
- except (base64.binascii.Error, ValueError) as e:
+ # Decode the base64 string to bytes using bytetools
+ image_bytes = bytetools.bytesfrombase64(base64_string)
+ except Exception as e:
  raise ValueError("Invalid base64 string") from e

+ # Create numpy buffer from bytes and decode using OpenCV
+ buf = np.frombuffer(image_bytes, dtype=np.uint8)
  try:
- # Convert the bytes into a BytesIO object
- image_bytes = BytesIO(image_data)
-
- # Open the image using PIL
- image = Image.open(image_bytes)
- image.load()
- except UnidentifiedImageError as e:
+ img = cv2.imdecode(buf, cv2.IMREAD_UNCHANGED)
+ if img is None:
+ raise ValueError("OpenCV failed to decode image")
+
+ # Convert BGR to RGB for consistent processing (OpenCV loads as BGR)
+ # Only convert if it's a 3-channel color image
+ if img.ndim == 3 and img.shape[2] == 3:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ except ImportError:
+ raise
+ except Exception as e:
  raise ValueError("Unable to decode image from base64 string") from e

- # Convert the image to a NumPy array
- image_array = np.array(image)
+ # Convert to numpy array
+ img = np.array(img)
+ # Assert that 3-channel images are in RGB format after conversion
+ assert img.ndim <= 3, f"Image has unexpected number of dimensions: {img.ndim}"
+ assert img.ndim != 3 or img.shape[2] == 3, f"3-channel image should have 3 channels, got: {img.shape[2]}"
+
+ return img
+
+
+ def scale_numpy_image(
+ img_arr: np.ndarray, scale_tuple: Optional[Tuple[int, int]] = None, interpolation=Image.LANCZOS
+ ) -> np.ndarray:
+ """
+ Scales a NumPy image array using OpenCV with aspect ratio preservation.

- return image_array
+ This function provides OpenCV-based image scaling that mimics PIL's thumbnail behavior
+ by maintaining aspect ratio and scaling to fit within the specified dimensions.
+
+ Parameters
+ ----------
+ img_arr : np.ndarray
+ The input image as a NumPy array.
+ scale_tuple : Optional[Tuple[int, int]], optional
+ A tuple (width, height) to resize the image to. If provided, the image
+ will be resized to fit within these dimensions while maintaining aspect ratio
+ (similar to PIL's thumbnail method). Defaults to None.
+ interpolation : int, optional
+ OpenCV interpolation method. Defaults to cv2.INTER_LANCZOS4.
+
+ Returns
+ -------
+ np.ndarray
+ A NumPy array representing the scaled image data.
+ """
+ # Apply scaling using OpenCV if specified
+ # Using PIL for scaling as CV2 seems to lead to different results
+ # TODO: Remove when we move to YOLOX Ensemble Models
+ if scale_tuple:
+ image = Image.fromarray(img_arr)
+ image.thumbnail(scale_tuple, interpolation)
+ img_arr = np.array(image)
+ # Ensure we return a copy
+ return img_arr.copy()
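
Illustrative sketch (not part of the package): with the OpenCV-backed helpers, arrays stay RGB at the Python boundary and BGR conversion happens only inside encode/decode. A minimal round trip under that assumption, using the same module path as above:

    import numpy as np
    from nv_ingest_api.util.image_processing.transforms import base64_to_numpy, numpy_to_base64

    rgb = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)  # RGB in, per the new contract

    b64_png = numpy_to_base64(rgb, format="PNG")                  # PNG is lossless
    assert np.array_equal(base64_to_numpy(b64_png), rgb)

    b64_jpeg = numpy_to_base64(rgb, format="JPEG", quality=90)    # JPEG is lossy; values may shift
    assert base64_to_numpy(b64_jpeg).shape == rgb.shape
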
@@ -201,6 +201,8 @@ def construct_image_metadata_from_base64(
  page_count: int,
  source_metadata: Dict[str, Any],
  base_unified_metadata: Dict[str, Any],
+ subtype: None | ContentTypeEnum | str = "",
+ text: str = "",
  ) -> List[Any]:
  """
  Extracts image data from a base64-encoded image string, decodes the image to get

@@ -252,6 +254,7 @@
  "line": -1,
  "span": -1,
  },
+ "subtype": subtype or "",
  }

  # Construct image metadata

@@ -259,7 +262,7 @@
  "image_type": DocumentTypeEnum.PNG,
  "structured_image_type": ContentTypeEnum.UNKNOWN,
  "caption": "",
- "text": "",
+ "text": text,
  "image_location": bbox,
  "image_location_max_dimensions": (width, height),
  "height": height,
@@ -7,7 +7,6 @@ from typing import List, Any
  from typing import Optional
  from typing import Tuple

- import PIL
  import numpy as np
  import pypdfium2 as pdfium
  import pypdfium2.raw as pdfium_c

@@ -20,8 +19,9 @@ from nv_ingest_api.util.image_processing.clustering import (
  combine_groups_into_bboxes,
  remove_superset_bboxes,
  )
- from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image
+ from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image, scale_numpy_image
  from nv_ingest_api.util.metadata.aggregators import Base64Image
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YOLOX_PAGE_IMAGE_FORMAT

  logger = logging.getLogger(__name__)

@@ -176,18 +176,10 @@ def pdfium_pages_to_numpy(
  for idx, page in enumerate(pages):
  # Render the page as a bitmap with the specified scale and rotation
  page_bitmap = page.render(scale=scale, rotation=rotation)
-
- # Convert the bitmap to a PIL image
- pil_image = page_bitmap.to_pil()
-
+ img_arr = convert_bitmap_to_corrected_numpy(page_bitmap)
  # Apply scaling using the thumbnail approach if specified
  if scale_tuple:
- pil_image.thumbnail(scale_tuple, PIL.Image.LANCZOS)
-
- # Convert the PIL image to a NumPy array and force a full copy,
- # ensuring the returned array is entirely independent of the original buffer.
- img_arr = np.array(pil_image).copy()
-
+ img_arr = scale_numpy_image(img_arr, scale_tuple)
  # Apply padding if specified
  if padding_tuple:
  img_arr, (pad_width, pad_height) = pad_image(

@@ -250,7 +242,7 @@ def extract_simple_images_from_pdfium_page(page, max_depth):
  try:
  # Attempt to retrieve the image bitmap
  image_numpy: np.ndarray = pdfium_try_get_bitmap_as_numpy(obj) # noqa
- image_base64: str = numpy_to_base64(image_numpy)
+ image_base64: str = numpy_to_base64(image_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
  image_bbox = obj.get_pos()
  image_size = obj.get_size()
  if image_size[0] < 10 and image_size[1] < 10:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nv-ingest-api
- Version: 2025.7.14.dev20250714
+ Version: 2025.7.16.dev20250716
  Summary: Python module with core document ingestion functions.
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
  License: Apache License

@@ -217,6 +217,7 @@ Requires-Dist: backoff==2.2.1
  Requires-Dist: pandas>=2.0
  Requires-Dist: pydantic>2.0.0
  Requires-Dist: pydantic-settings>2.0.0
+ Requires-Dist: tritonclient
  Dynamic: license-file

  # nv-ingest-api
@@ -7,7 +7,7 @@ nv_ingest_api/interface/transform.py,sha256=g6YnFR7TpEU0xNtzCvv6kqnFbuCwQ6vRMjjB
  nv_ingest_api/interface/utility.py,sha256=AL4l0cJNvTjG1MAe1YNTk1jbbPED3g4HCewzx6Ffcio,7296
  nv_ingest_api/internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nv_ingest_api/internal/enums/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
- nv_ingest_api/internal/enums/common.py,sha256=HSj7qqNr6KXu_FIyK_Wvel24R-r8lV7dLA173z5XFBc,12321
+ nv_ingest_api/internal/enums/common.py,sha256=lzDJ35VWfIwlL_Lx_q0dfHUuwEB7CXudHIQAilpjoRw,12611
  nv_ingest_api/internal/extract/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
  nv_ingest_api/internal/extract/audio/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=_jf_UC_FTqZr-xEpwG8edwBzdDjM01gGhqm9ulOsDcY,6973

@@ -31,11 +31,11 @@ nv_ingest_api/internal/extract/pdf/pdf_extractor.py,sha256=CxtWaD6mql9MEqSdk2CfS
  nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIRiozIRw70Kybw3A72-lcKFeoTI,582
  nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
  nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=Uqj1NH7yWga9P6_vCzgny1WKALfF--UdAaGHUF8K_aQ,22926
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=fDbrZwJ-lgeHYOq107WXehzdSvyF8zEDza_9UkDm5aE,22360
+ nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=XNYz4S2tMFBv0KFzXNERrVs-1raxJ_iIIXpBGlJFcD0,22987
+ nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=8hUJUdpx6FhOBgabFmGhJiAQdl12kR8YoSbUfN-geOk,23506
  nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
  nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ2CZs167juvEZ-uV6qXWQjR08hhIu8otk2MWj4,4931
+ nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=4bvN6LsPksLicI6jM0JqbJFiOZNHEcuc8MVVW4XfgV8,5875
  nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
  nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
  nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -55,12 +55,12 @@ nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1E
  nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubkHs4WjnexM6rI0wkjWCsrVNEbA4Wjk2oKL9OYCU,1499
  nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=x35a9AyTYxpESQflLo_YnhVOKblQKVen6vGGFaXmNiE,9927
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=MFWPqMTXs_MZG3ripRR21o7f_mVeoE46Q10yvJ8KNr0,7023
+ nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
  nv_ingest_api/internal/primitives/nim/model_interface/paddle.py,sha256=rSUPwl5XOrqneoS6aKhatVjrNBg_LhP3nwUWS_aTwz0,17950
  nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=5PqD2JuHY2rwd-6SSB4axr2Dd79vm95sAEkcmI3U7ME,12977
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=qJ382PU1ZrIM-SR3cqIhtY_W2rmHec2HIa2aUB2SvaU,6031
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=uYXqdvqgkyS4Yfr9ZoikRDX4e94OV3ch3Xhv3JVg-3s,49581
+ nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=nsfDQgeupBe9Tdf3S5sfNpYcObEwVlzCZdfg1ObAW88,49584
  nv_ingest_api/internal/primitives/tracing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nv_ingest_api/internal/primitives/tracing/latency.py,sha256=5kVTeYRbRdTlT_aI4MeS20N_S7mqCcLqZR6YHtxhXkY,2215
  nv_ingest_api/internal/primitives/tracing/logging.py,sha256=SSzIgS7afLH-e1C7VagYDmkkA6rTXmQ-bmtLjoEguhg,3851

@@ -82,7 +82,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=szDvgc2A_JetD2Jyewyl4ac4lwpy3NiLxD9dOYz42sM,8116
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=ceYQjRjhBSDbbZ6q-Db7Y6GHVOvWPdGAMb3TX1vMWfY,8321
  nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=VnAzkSFat_ckI19mlwQTlFrvP6EZVCwyNl9bt51b8oU,7193
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395

@@ -92,14 +92,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
  nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=OtM1iPw26uioC3mghbOJQurKGg641uQfhASH462VqOY,578
  nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=ongmHkJA2953f9_RI7ZYzf5BUnFzVL6Al5E8WKyfgw4,885
+ nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=RZCISA8CUqKiY8eJuk4uWxzo4PZ-fuYdzMO7_LYFkoM,1117
  nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
  nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
  nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
- nv_ingest_api/internal/transform/embed_text.py,sha256=A8JMotTkC8KQ0pmz4AIJhaKebza6JzhQ0aEnHX2oHY8,16539
+ nv_ingest_api/internal/transform/embed_text.py,sha256=kvVGlNH1S91UENXWLD31uh3KzlfJYOlYitpIFMsyowU,20033
  nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
  nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -123,7 +123,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
  nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
  nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
  nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
- nv_ingest_api/util/image_processing/transforms.py,sha256=Kz9hrizV314Hy7cRCYK9ZmhmBbVUOZ_z0HEpzZYcslQ,14081
+ nv_ingest_api/util/image_processing/transforms.py,sha256=3-xeUerc2AaXJTYuR23EjwdtjRQ8F85pS5D9zxR4cLA,23452
  nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/util/imports/callable_signatures.py,sha256=e2bJB1pmkN4Ee-Bf-VggOSBaQ4RXofWF5eKkWXgIj2U,1855
  nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=7GByV_-8z2X0tnVoabCxVioxOP3sYMros3ZllVAW-wY,4343

@@ -135,12 +135,12 @@ nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=h9Q4q_
  nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
  nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=fh7Q0wO5H_FtrHV1VdT6V66aZNqglOh_2XdkfLt8hgg,15722
  nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
- nv_ingest_api/util/metadata/aggregators.py,sha256=Y5JSKuLhhk_ldpzT3eRIcVg7QM7cTNhfQZn4g5bcbq4,15884
+ nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
  nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
  nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
  nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
  nv_ingest_api/util/pdf/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
- nv_ingest_api/util/pdf/pdfium.py,sha256=Ch9Gh5jRLcBr3stjCckqWwTUL-T0sI50PlQnZHo_9NA,15761
+ nv_ingest_api/util/pdf/pdfium.py,sha256=qTiTlSaiCk_rxm_eoQBoAFKq_5OQrioHVSbPbGDxVkE,15668
  nv_ingest_api/util/schema/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_api/util/schema/schema_validator.py,sha256=H0yZ_i_HZaiBRUCGmTBfRB9-hURhVqyd10aS_ynM1_0,321
  nv_ingest_api/util/service_clients/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143

@@ -153,8 +153,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
- nv_ingest_api-2025.7.14.dev20250714.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- nv_ingest_api-2025.7.14.dev20250714.dist-info/METADATA,sha256=ZSDiSF9iqAtQvebMJ1Xp4Y_Uee8FqaZwEshVsywq_5I,13919
- nv_ingest_api-2025.7.14.dev20250714.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- nv_ingest_api-2025.7.14.dev20250714.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
- nv_ingest_api-2025.7.14.dev20250714.dist-info/RECORD,,
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/METADATA,sha256=RaPAkQ4Dtkkrn6hi9Va1t2XDpDgRbe-bFqmCVL3IlEA,13947
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/RECORD,,