nv-ingest-api 2025.10.4.dev20251004__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (34):
  1. nv_ingest_api/internal/extract/image/chart_extractor.py +7 -3
  2. nv_ingest_api/internal/extract/image/infographic_extractor.py +7 -3
  3. nv_ingest_api/internal/extract/image/table_extractor.py +7 -3
  4. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
  5. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +11 -4
  6. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
  7. nv_ingest_api/internal/primitives/nim/nim_client.py +158 -15
  8. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
  9. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
  10. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
  11. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
  12. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
  13. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
  14. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
  15. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
  16. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +56 -1
  17. nv_ingest_api/internal/schemas/meta/metadata_schema.py +9 -0
  18. nv_ingest_api/internal/schemas/mixins.py +39 -0
  19. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  20. nv_ingest_api/internal/transform/embed_text.py +82 -0
  21. nv_ingest_api/util/dataloader/dataloader.py +20 -9
  22. nv_ingest_api/util/image_processing/transforms.py +67 -1
  23. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  24. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +1 -0
  25. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +8 -2
  26. nv_ingest_api/util/service_clients/redis/redis_client.py +160 -0
  27. nv_ingest_api/util/service_clients/rest/rest_client.py +42 -3
  28. nv_ingest_api/util/string_processing/yaml.py +41 -4
  29. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA +2 -1
  30. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/RECORD +34 -32
  31. udfs/llm_summarizer_udf.py +132 -137
  32. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  33. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  34. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -24,8 +24,41 @@ logger = logging.getLogger(__name__)
24
24
  # Tracing Options Schema
25
25
  class TracingOptionsSchema(BaseModelNoExt):
26
26
  trace: bool = False
27
- ts_send: int
27
+ ts_send: Optional[int] = None
28
28
  trace_id: Optional[str] = None
29
+ # V2 PDF splitting support
30
+ parent_job_id: Optional[str] = None
31
+ page_num: Optional[int] = None
32
+ total_pages: Optional[int] = None
33
+
34
+
35
+ # PDF Configuration Schema
36
class PdfConfigSchema(BaseModelNoExt):
    """Options that control how a submitted PDF is handled.

    ``split_page_count`` is validated here only as a positive integer
    (``>= 1``). Per the original note on this schema, the server clamps the
    value to the ``[1, 128]`` range at runtime.
    """

    # Positive integer, defaulting to 32.
    split_page_count: int = Field(default=32, ge=1)
44
+
45
+
46
class RoutingOptionsSchema(BaseModelNoExt):
    # Optional hint for the QoS scheduler about which queue should take the job.
    queue_hint: Optional[str] = None

    @field_validator("queue_hint")
    @classmethod
    def validate_queue_hint(cls, v):
        """Lowercase ``queue_hint`` and reject values outside the known queue names.

        ``None`` passes through unchanged. Anything that is not a string, or
        whose lowercase form is not a recognised queue name, raises
        ``ValueError``.
        """
        if v is None:
            return None

        if isinstance(v, str):
            normalized = v.lower()
            if normalized in ("default", "immediate", "micro", "small", "medium", "large"):
                return normalized
            raise ValueError("queue_hint must be one of: default, immediate, micro, small, medium, large")

        raise ValueError("queue_hint must be a string")
29
62
 
30
63
 
31
64
  # Ingest Task Schemas
@@ -111,6 +144,8 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
111
144
  image_elements_modality: Optional[str] = None
112
145
  structured_elements_modality: Optional[str] = None
113
146
  audio_elements_modality: Optional[str] = None
147
+ custom_content_field: Optional[str] = None
148
+ result_target_field: Optional[str] = None
114
149
 
115
150
 
116
151
  class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -266,6 +301,26 @@ class IngestJobSchema(BaseModelNoExt):
266
301
  job_id: Union[str, int]
267
302
  tasks: List[IngestTaskSchema]
268
303
  tracing_options: Optional[TracingOptionsSchema] = None
304
+ routing_options: Optional[RoutingOptionsSchema] = None
305
+ pdf_config: Optional[PdfConfigSchema] = None
306
+
307
@model_validator(mode="before")
@classmethod
def migrate_queue_hint(cls, values):
    """
    Backward-compatibility shim for legacy clients.

    If the incoming payload carries ``tracing_options.queue_hint`` and
    ``routing_options`` does not already define ``queue_hint``, the hint is
    moved into ``routing_options.queue_hint``.

    Parameters
    ----------
    values : Any
        Raw input handed to the model before field validation. Only plain
        ``dict`` payloads are migrated; anything else is returned untouched.

    Returns
    -------
    Any
        ``values`` itself when nothing needs migrating, otherwise a shallow
        copy with updated ``tracing_options`` and ``routing_options``.

    Notes
    -----
    The migration never mutates the caller's dictionaries, and it never
    removes the legacy key unless the hint has been stored in its new place.
    """
    # Non-dict input (for example an already-built model) has nothing to migrate.
    if not isinstance(values, dict):
        return values

    tracing = values.get("tracing_options")
    if not isinstance(tracing, dict) or "queue_hint" not in tracing:
        return values

    routing = values.get("routing_options") or {}
    # If routing_options is not a plain dict we cannot safely write into it.
    # Leave the payload alone rather than popping the hint and losing it.
    if not isinstance(routing, dict) or "queue_hint" in routing:
        return values

    # Work on copies so validation has no side effects on the caller's payload.
    migrated_tracing = dict(tracing)
    migrated_routing = dict(routing)
    migrated_routing["queue_hint"] = migrated_tracing.pop("queue_hint")

    return {
        **values,
        "routing_options": migrated_routing,
        "tracing_options": migrated_tracing,
    }
269
324
 
270
325
 
271
326
  # ------------------------------------------------------------------------------
@@ -352,6 +352,15 @@ class MetadataSchema(BaseModelNoExt):
352
352
  raise_on_failure: bool = False
353
353
  """If True, indicates that processing should halt on failure."""
354
354
 
355
+ total_pages: Optional[int] = None
356
+ """Total number of pages in the source document (V2 API)."""
357
+
358
+ original_source_id: Optional[str] = None
359
+ """The original source identifier before any splitting or chunking (V2 API)."""
360
+
361
+ original_source_name: Optional[str] = None
362
+ """The original source name before any splitting or chunking (V2 API)."""
363
+
355
364
  custom_content: Optional[Dict[str, Any]] = None
356
365
 
357
366
  @model_validator(mode="before")
@@ -0,0 +1,39 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Shared mixins for Pydantic schemas.
7
+ """
8
+
9
+ from typing import Any
10
+ from pydantic import BaseModel, field_validator
11
+
12
+
13
class LowercaseProtocolMixin(BaseModel):
    """
    Pydantic mixin that normalizes every ``*_infer_protocol`` field.

    Before regular validation runs, any non-``None`` value supplied for a field
    whose name ends in ``_infer_protocol`` is converted to ``str``, stripped of
    surrounding whitespace and lowercased (for example ``"HTTP"`` becomes
    ``"http"``). All other fields pass through unchanged.

    Examples
    --------
    >>> class MyConfigSchema(LowercaseProtocolMixin):
    ...     yolox_infer_protocol: str = ""
    ...     ocr_infer_protocol: str = ""
    >>>
    >>> config = MyConfigSchema(yolox_infer_protocol="GRPC", ocr_infer_protocol="HTTP")
    >>> config.yolox_infer_protocol
    'grpc'
    >>> config.ocr_infer_protocol
    'http'
    """

    @field_validator("*", mode="before")
    @classmethod
    def _lowercase_protocol_fields(cls, v: Any, info):
        """Return ``v`` stripped and lowercased when the field is a protocol field."""
        is_protocol_field = info.field_name.endswith("_infer_protocol")
        if not is_protocol_field or v is None:
            return v

        text = str(v)
        return text.strip().lower()
@@ -7,6 +7,8 @@ import logging
7
7
 
8
8
  from pydantic import ConfigDict, BaseModel, Field, model_validator, field_validator
9
9
 
10
+ from typing import Optional
11
+
10
12
  from nv_ingest_api.util.logging.configuration import LogLevel
11
13
 
12
14
  logger = logging.getLogger(__name__)
@@ -26,6 +28,8 @@ class TextEmbeddingSchema(BaseModel):
26
28
  image_elements_modality: str = Field(default="text")
27
29
  structured_elements_modality: str = Field(default="text")
28
30
  audio_elements_modality: str = Field(default="text")
31
+ custom_content_field: Optional[str] = None
32
+ result_target_field: Optional[str] = None
29
33
 
30
34
  model_config = ConfigDict(extra="forbid")
31
35
 
@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor
7
7
  from functools import partial
8
8
  from typing import Any, Dict, Tuple, Optional, Iterable, List
9
9
 
10
+ import glom
10
11
  import pandas as pd
11
12
  from openai import OpenAI
12
13
 
@@ -282,6 +283,33 @@ def _add_embeddings(row, embeddings, info_msgs):
282
283
  return row
283
284
 
284
285
 
286
+ def _add_custom_embeddings(row, embeddings, result_target_field):
287
+ """
288
+ Updates a DataFrame row with embedding data and associated error info
289
+ based on a user supplied custom content field.
290
+
291
+ Parameters
292
+ ----------
293
+ row : pandas.Series
294
+ A row of the DataFrame.
295
+ embeddings : dict
296
+ Dictionary mapping row indices to embeddings.
297
+ result_target_field: str
298
+ The field in custom_content to output the embeddings to
299
+
300
+ Returns
301
+ -------
302
+ pandas.Series
303
+ The updated row
304
+ """
305
+ embedding = embeddings.get(row.name, None)
306
+
307
+ if embedding is not None:
308
+ row["metadata"] = glom.assign(row["metadata"], "custom_content." + result_target_field, embedding, missing=dict)
309
+
310
+ return row
311
+
312
+
285
313
  def _format_image_input_string(image_b64: Optional[str]) -> str:
286
314
  if not image_b64:
287
315
  return
@@ -381,6 +409,20 @@ def _get_pandas_audio_content(row, modality="text"):
381
409
  return row.get("audio_metadata", {}).get("audio_transcript")
382
410
 
383
411
 
412
def _get_pandas_custom_content(row, custom_content_field):
    """
    Look up a custom-content field and return it as a string.

    Parameters
    ----------
    row : mapping
        Object exposing ``get``; its ``"custom_content"`` entry is searched.
    custom_content_field : str
        glom path of the value to extract from the custom content.

    Returns
    -------
    str or None
        ``str(value)`` when the path resolves to a non-``None`` value.
        ``None`` (with a warning logged) when the path is missing or the
        value cannot be converted to a string.
    """
    source = row.get("custom_content", {})
    value = glom.glom(source, custom_content_field, default=None)

    if value is not None:
        try:
            text = str(value)
        except (TypeError, ValueError):
            logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
            text = None
        return text

    logger.warning(f"Custom content field: {custom_content_field} not found")
    return None
424
+
425
+
384
426
  # ------------------------------------------------------------------------------
385
427
  # Batch Processing Utilities
386
428
  # ------------------------------------------------------------------------------
@@ -519,6 +561,7 @@ def transform_create_text_embeddings_internal(
519
561
  api_key = task_config.get("api_key") or transform_config.api_key
520
562
  endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
521
563
  model_name = task_config.get("model_name") or transform_config.embedding_model
564
+ custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
522
565
 
523
566
  if execution_trace_log is None:
524
567
  execution_trace_log = {}
@@ -612,4 +655,43 @@ def transform_create_text_embeddings_internal(
612
655
  content_masks.append(content_mask)
613
656
 
614
657
  combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
658
+
659
+ # Embed custom content
660
+ if custom_content_field is not None:
661
+ result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"
662
+
663
+ extracted_custom_content = (
664
+ combined_df["metadata"]
665
+ .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
666
+ .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
667
+ )
668
+
669
+ valid_custom_content_mask = extracted_custom_content.notna()
670
+ if valid_custom_content_mask.any():
671
+ custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
672
+ custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)
673
+
674
+ custom_content_embeddings = _async_runner(
675
+ custom_content_batches,
676
+ api_key,
677
+ endpoint_url,
678
+ model_name,
679
+ transform_config.encoding_format,
680
+ transform_config.input_type,
681
+ transform_config.truncate,
682
+ False,
683
+ )
684
+ custom_embeddings_dict = dict(
685
+ zip(
686
+ extracted_custom_content.loc[valid_custom_content_mask].index,
687
+ custom_content_embeddings.get("embeddings", []),
688
+ )
689
+ )
690
+ else:
691
+ custom_embeddings_dict = {}
692
+
693
+ combined_df = combined_df.apply(
694
+ _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
695
+ )
696
+
615
697
  return combined_df, {"trace_info": execution_trace_log}
@@ -254,22 +254,29 @@ else:
254
254
  file = None
255
255
  try:
256
256
  for file in paths:
257
+ if thread_stop.is_set():
258
+ return
257
259
  if isinstance(file, tuple):
258
260
  video_file, audio_file = file
261
+ if thread_stop.is_set():
262
+ return
259
263
  with open(video_file, "rb") as f:
260
264
  video = f.read()
265
+ if thread_stop.is_set():
266
+ return
261
267
  with open(audio_file, "rb") as f:
262
268
  audio = f.read()
263
269
  queue.put((video, audio))
264
270
  else:
265
- if thread_stop:
271
+ if thread_stop.is_set():
266
272
  return
267
273
  with open(file, "rb") as f:
268
274
  queue.put(f.read())
269
275
  except Exception as e:
270
276
  logging.error(f"Error processing file {file}: {e}")
271
277
  queue.put(RuntimeError(f"Error processing file {file}: {e}"))
272
- queue.put(StopIteration)
278
+ finally:
279
+ queue.put(StopIteration)
273
280
 
274
281
  class DataLoader:
275
282
  """
@@ -290,7 +297,7 @@ else:
290
297
  ):
291
298
  interface = interface if interface else MediaInterface()
292
299
  self.thread = None
293
- self.thread_stop = False
300
+ self.thread_stop = threading.Event()
294
301
  self.queue = queue.Queue(size)
295
302
  self.path = Path(path)
296
303
  self.output_dir = output_dir
@@ -323,16 +330,20 @@ else:
323
330
  Reset itertor by stopping the thread and clearing the queue.
324
331
  """
325
332
  if self.thread:
326
- self.thread_stop = True
333
+ self.thread_stop.set()
327
334
  self.thread.join()
328
- self.thread_stop = False
329
- while self.queue.qsize() != 0:
330
- with self.queue.mutex:
331
- self.queue.queue.clear()
335
+ self.thread = None
336
+ try:
337
+ while True:
338
+ self.queue.get_nowait()
339
+ except Exception:
340
+ pass
341
+ finally:
342
+ self.thread_stop.clear()
332
343
 
333
344
  def __iter__(self):
334
345
  self.stop()
335
- self.thread_stop = False
346
+ self.thread_stop.clear()
336
347
  self.thread = threading.Thread(
337
348
  target=load_data,
338
349
  args=(
@@ -49,6 +49,68 @@ def _resize_image_opencv(
49
49
  return cv2.resize(array, target_size, interpolation=interpolation)
50
50
 
51
51
 
52
def rgba_to_rgb_white_bg(rgba_image):
    """
    Convert a 4-channel image to 3 channels by alpha-blending onto white.

    Transparent and semi-transparent pixels are composited over a white
    background, so the result looks like the image rendered on a white page.
    The first three channels are blended independently, so their order
    (RGB or BGR) is preserved.

    Parameters
    ----------
    rgba_image : numpy.ndarray
        Image array of shape (height, width, 4); the last channel is alpha.
        Integer arrays are interpreted on the 0-255 scale. Floating-point
        arrays may use either the 0-255 scale or the 0-1 scale.

    Returns
    -------
    numpy.ndarray
        Array of shape (height, width, 3) and dtype uint8 with values in
        [0, 255].

    Raises
    ------
    ValueError
        If the input is not a 3-dimensional array with exactly 4 channels.

    Notes
    -----
    The blending formula is::

        out = color * alpha + 255 * (1 - alpha)

    with ``alpha`` normalized to [0, 1] and ``color`` on the 0-255 scale.
    Fully opaque pixels keep their color and fully transparent pixels become
    white. Results are rounded to the nearest integer.

    Examples
    --------
    >>> import numpy as np
    >>> rgba = np.array([[[255, 0, 0, 255], [0, 0, 0, 0]]], dtype=np.uint8)
    >>> rgba_to_rgb_white_bg(rgba).tolist()
    [[[255, 0, 0], [255, 255, 255]]]
    """
    rgba = np.asarray(rgba_image)
    if rgba.ndim != 3 or rgba.shape[2] != 4:
        raise ValueError(f"Expected an array of shape (height, width, 4), got {rgba.shape}")

    # Blend in float64 so integer inputs neither wrap around nor truncate early.
    color = rgba[:, :, :3].astype(np.float64)
    alpha = rgba[:, :, 3:4].astype(np.float64)

    if np.issubdtype(rgba.dtype, np.integer) or (alpha.size and alpha.max() > 1.0):
        # Integer images are always 0-255. Deciding by value alone would
        # misread a uint8 alpha plane of only 0s and 1s as "fully opaque".
        alpha = alpha / 255.0
    elif color.size and color.max() <= 1.0:
        # Float image on the 0-1 scale: bring color up to the 0-255 scale so
        # it matches the white background used below.
        color = color * 255.0

    blended = color * alpha + 255.0 * (1.0 - alpha)

    # Round before casting so float error cannot turn 254.999... into 254.
    # Clip so out-of-range inputs cannot wrap around in uint8.
    return np.clip(np.rint(blended), 0, 255).astype(np.uint8)
112
+
113
+
52
114
  def scale_image_to_encoding_size(
53
115
  base64_image: str, max_base64_size: int = 180_000, initial_reduction: float = 0.9, format: str = "PNG", **kwargs
54
116
  ) -> Tuple[str, Tuple[int, int]]:
@@ -93,7 +155,7 @@ def scale_image_to_encoding_size(
93
155
 
94
156
  # Check initial size
95
157
  if len(base64_image) <= max_base64_size:
96
- return base64_image, original_size
158
+ return numpy_to_base64(img_array, format=format, **kwargs), original_size
97
159
 
98
160
  # Initial reduction step
99
161
  reduction_step = initial_reduction
@@ -621,6 +683,10 @@ def base64_to_numpy(base64_string: str) -> np.ndarray:
621
683
  if img is None:
622
684
  raise ValueError("OpenCV failed to decode image")
623
685
 
686
+ # Convert 4 channel to 3 channel if necessary
687
+ if img.shape[2] == 4:
688
+ img = rgba_to_rgb_white_bg(img)
689
+
624
690
  # Convert BGR to RGB for consistent processing (OpenCV loads as BGR)
625
691
  # Only convert if it's a 3-channel color image
626
692
  if img.ndim == 3 and img.shape[2] == 3: