nv-ingest-api 2025.9.22.dev20250922__py3-none-any.whl → 2025.9.25.dev20250925__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,10 +2,14 @@
  # All rights reserved.
  # SPDX-License-Identifier: Apache-2.0

+ import hashlib
+ import json
  import logging
  import threading
  import time
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ import queue
+ from collections import namedtuple
+ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
  from typing import Any
  from typing import Optional
  from typing import Tuple, Union
@@ -17,8 +21,12 @@ import tritonclient.grpc as grpcclient
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable_func
  from nv_ingest_api.util.string_processing import generate_url

+
  logger = logging.getLogger(__name__)

+ # A simple structure to hold a request's data and its Future for the result
+ InferenceRequest = namedtuple("InferenceRequest", ["data", "future", "model_name", "dims", "kwargs"])
+

  class NimClient:
  """
@@ -34,6 +42,9 @@ class NimClient:
  timeout: float = 120.0,
  max_retries: int = 5,
  max_429_retries: int = 5,
+ enable_dynamic_batching: bool = False,
+ dynamic_batch_timeout: float = 0.1, # 100 milliseconds
+ dynamic_batch_memory_budget_mb: Optional[float] = None,
  ):
  """
  Initialize the NimClient with the specified model interface, protocol, and server endpoints.
@@ -60,7 +71,6 @@ class NimClient:
  ValueError
  If an invalid protocol is specified or if required endpoints are missing.
  """
-
  self.client = None
  self.model_interface = model_interface
  self.protocol = protocol.lower()
@@ -88,12 +98,32 @@ class NimClient:
  else:
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

+ self.dynamic_batching_enabled = enable_dynamic_batching
+ if self.dynamic_batching_enabled:
+ self._batch_timeout = dynamic_batch_timeout
+ if dynamic_batch_memory_budget_mb is not None:
+ self._batch_memory_budget_bytes = dynamic_batch_memory_budget_mb * 1024 * 1024
+ else:
+ self._batch_memory_budget_bytes = None
+
+ self._request_queue = queue.Queue()
+ self._stop_event = threading.Event()
+ self._batcher_thread = threading.Thread(target=self._batcher_loop, daemon=True)
+
+ def start(self):
+ """Starts the dynamic batching worker thread if enabled."""
+ if self.dynamic_batching_enabled and not self._batcher_thread.is_alive():
+ self._batcher_thread.start()
+
  def _fetch_max_batch_size(self, model_name, model_version: str = "") -> int:
  """Fetch the maximum batch size from the Triton model configuration in a thread-safe manner."""

  if model_name == "yolox_ensemble":
  model_name = "yolox"

+ if model_name == "scene_text_ensemble":
+ model_name = "scene_text_pre"
+
  if model_name in self._max_batch_sizes:
  return self._max_batch_sizes[model_name]

@@ -102,13 +132,12 @@ class NimClient:
  if model_name in self._max_batch_sizes:
  return self._max_batch_sizes[model_name]

- if not self._grpc_endpoint:
+ if not self._grpc_endpoint or not self.client:
  self._max_batch_sizes[model_name] = 1
  return 1

  try:
- client = self.client if self.client else grpcclient.InferenceServerClient(url=self._grpc_endpoint)
- model_config = client.get_model_config(model_name=model_name, model_version=model_version)
+ model_config = self.client.get_model_config(model_name=model_name, model_version=model_version)
  self._max_batch_sizes[model_name] = model_config.config.max_batch_size
  logger.debug(f"Max batch size for model '{model_name}': {self._max_batch_sizes[model_name]}")
  except Exception as e:
@@ -176,17 +205,40 @@ class NimClient:
  Any
  The processed inference results, coalesced in the same order as the input images.
  """
- try:
- # 1. Retrieve or default to the model's maximum batch size.
- batch_size = self._fetch_max_batch_size(model_name)
- max_requested_batch_size = kwargs.pop("max_batch_size", batch_size)
- force_requested_batch_size = kwargs.pop("force_max_batch_size", False)
- max_batch_size = (
- max(1, min(batch_size, max_requested_batch_size))
- if not force_requested_batch_size
- else max_requested_batch_size
- )
+ # 1. Retrieve or default to the model's maximum batch size.
+ batch_size = self._fetch_max_batch_size(model_name)
+ max_requested_batch_size = kwargs.pop("max_batch_size", batch_size)
+ force_requested_batch_size = kwargs.pop("force_max_batch_size", False)
+ max_batch_size = (
+ max(1, min(batch_size, max_requested_batch_size))
+ if not force_requested_batch_size
+ else max_requested_batch_size
+ )
+ self._batch_size = max_batch_size
+
+ if self.dynamic_batching_enabled:
+ # DYNAMIC BATCHING PATH
+ try:
+ data = self.model_interface.prepare_data_for_inference(data)
+
+ futures = []
+ for base64_image, image_array in zip(data["base64_images"], data["images"]):
+ dims = image_array.shape[:2]
+ futures.append(self.submit(base64_image, model_name, dims, **kwargs))
+
+ results = [future.result() for future in futures]
+
+ return results
+
+ except Exception as err:
+ error_str = (
+ f"Error during synchronous infer with dynamic batching [{self.model_interface.name()}]: {err}"
+ )
+ logger.error(error_str)
+ raise RuntimeError(error_str) from err

+ # OFFLINE BATCHING PATH
+ try:
  # 2. Prepare data for inference.
  data = self.model_interface.prepare_data_for_inference(data)

@@ -390,6 +442,209 @@ class NimClient:
  logger.error(f"Failed to get a successful response after {self.max_retries} retries.")
  raise Exception(f"Failed to get a successful response after {self.max_retries} retries.")

+ def _batcher_loop(self):
+ """The main loop for the background thread to form and process batches."""
+ while not self._stop_event.is_set():
+ requests_batch = []
+ try:
+ first_req = self._request_queue.get(timeout=self._batch_timeout)
+ if first_req is None:
+ continue
+ requests_batch.append(first_req)
+
+ start_time = time.monotonic()
+
+ while len(requests_batch) < self._batch_size:
+ if (time.monotonic() - start_time) >= self._batch_timeout:
+ break
+
+ if self._request_queue.empty():
+ break
+
+ next_req_peek = self._request_queue.queue[0]
+ if next_req_peek is None:
+ break
+
+ if self._batch_memory_budget_bytes:
+ if not self.model_interface.does_item_fit_in_batch(
+ requests_batch,
+ next_req_peek,
+ self._batch_memory_budget_bytes,
+ ):
+ break
+
+ try:
+ next_req = self._request_queue.get_nowait()
+ if next_req is None:
+ break
+ requests_batch.append(next_req)
+ except queue.Empty:
+ break
+
+ except queue.Empty:
+ continue
+
+ if requests_batch:
+ self._process_dynamic_batch(requests_batch)
+
+ def _process_dynamic_batch(self, requests: list[InferenceRequest]):
+ """Coalesces, infers, and distributes results for a dynamic batch."""
+ if not requests:
+ return
+
+ first_req = requests[0]
+ model_name = first_req.model_name
+ kwargs = first_req.kwargs
+
+ try:
+ # 1. Coalesce individual data items into a single batch input
+ batch_input, batch_data = self.model_interface.coalesce_requests_to_batch(
+ [req.data for req in requests],
+ [req.dims for req in requests],
+ protocol=self.protocol,
+ model_name=model_name,
+ **kwargs,
+ )
+
+ # 2. Perform inference using the existing _process_batch logic
+ parsed_output, _ = self._process_batch(batch_input, batch_data=batch_data, model_name=model_name, **kwargs)
+
+ # 3. Process the batched output to get final results
+ all_results = self.model_interface.process_inference_results(
+ parsed_output,
+ original_image_shapes=batch_data.get("original_image_shapes"),
+ protocol=self.protocol,
+ **kwargs,
+ )
+
+ # 4. Distribute the individual results back to the correct Future
+ if len(all_results) != len(requests):
+ raise ValueError("Mismatch between result count and request count.")
+
+ for i, req in enumerate(requests):
+ req.future.set_result(all_results[i])
+
+ except Exception as e:
+ # If anything fails, propagate the exception to all futures in the batch
+ logger.error(f"Error processing dynamic batch: {e}")
+ for req in requests:
+ req.future.set_exception(e)
+
+ def submit(self, data: Any, model_name: str, dims: Tuple[int, int], **kwargs) -> Future:
+ """
+ Submits a single inference request to the dynamic batcher.
+
+ This method is non-blocking and returns a Future object that will
+ eventually contain the inference result.
+
+ Parameters
+ ----------
+ data : Any
+ The single data item for inference (e.g., one image, one text prompt).
+
+ Returns
+ -------
+ concurrent.futures.Future
+ A future that will be fulfilled with the inference result.
+ """
+ if not self.dynamic_batching_enabled:
+ raise RuntimeError(
+ "Dynamic batching is not enabled. Please initialize NimClient with " "enable_dynamic_batching=True."
+ )
+
+ future = Future()
+ request = InferenceRequest(data=data, future=future, model_name=model_name, dims=dims, kwargs=kwargs)
+ self._request_queue.put(request)
+ return future
+
  def close(self):
- if self.protocol == "grpc" and hasattr(self.client, "close"):
+ """Stops the dynamic batching worker and closes client connections."""
+
+ if self.dynamic_batching_enabled:
+ self._stop_event.set()
+ # Unblock the queue in case the thread is waiting on get()
+ self._request_queue.put(None)
+ if self._batcher_thread.is_alive():
+ self._batcher_thread.join()
+
+ if self.client:
  self.client.close()
+
+
+ class NimClientManager:
+ """
+ A thread-safe, singleton manager for creating and sharing NimClient instances.
+
+ This manager ensures that only one NimClient is created per unique configuration.
+ """
+
+ _instance = None
+ _lock = threading.Lock()
+
+ def __new__(cls):
+ # Singleton pattern
+ if cls._instance is None:
+ with cls._lock:
+ if cls._instance is None:
+ cls._instance = super(NimClientManager, cls).__new__(cls)
+ return cls._instance
+
+ def __init__(self):
+ if not hasattr(self, "_initialized"):
+ with self._lock:
+ if not hasattr(self, "_initialized"):
+ self._clients = {} # Key: config_hash, Value: NimClient instance
+ self._client_lock = threading.Lock()
+ self._initialized = True
+
+ def _generate_config_key(self, **kwargs) -> str:
+ """Creates a stable, hashable key from client configuration."""
+ sorted_config = sorted(kwargs.items())
+ config_str = json.dumps(sorted_config)
+ return hashlib.md5(config_str.encode("utf-8")).hexdigest()
+
+ def get_client(self, model_interface, **kwargs) -> "NimClient":
+ """
+ Gets or creates a NimClient for the given configuration.
+ """
+ config_key = self._generate_config_key(model_interface_name=model_interface.name(), **kwargs)
+
+ if config_key in self._clients:
+ return self._clients[config_key]
+
+ with self._client_lock:
+ if config_key in self._clients:
+ return self._clients[config_key]
+
+ logger.debug(f"Creating new NimClient for config hash: {config_key}")
+
+ new_client = NimClient(model_interface=model_interface, **kwargs)
+
+ if new_client.dynamic_batching_enabled:
+ new_client.start()
+
+ self._clients[config_key] = new_client
+
+ return new_client
+
+ def shutdown(self):
+ """
+ Gracefully closes all managed NimClient instances.
+ This is called automatically on application exit by `atexit`.
+ """
+ logger.debug(f"Shutting down NimClientManager and {len(self._clients)} client(s)...")
+ with self._client_lock:
+ for config_key, client in self._clients.items():
+ logger.debug(f"Closing client for config: {config_key}")
+ try:
+ client.close()
+ except Exception as e:
+ logger.error(f"Error closing client for config {config_key}: {e}")
+ self._clients.clear()
+ logger.debug("NimClientManager shutdown complete.")
+
+
+ # A global helper function to make access even easier
+ def get_nim_client_manager(*args, **kwargs) -> NimClientManager:
+ """Returns the singleton instance of the NimClientManager."""
+ return NimClientManager(*args, **kwargs)
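For context, a minimal usage sketch of the new dynamic-batching path (not part of the diff). The manager accessor, the get_client and submit signatures, and the new constructor options are taken from the code above; the endpoint configuration, the concrete ModelInterface instance, and the "yolox" model name are illustrative assumptions.

# Hypothetical usage sketch; endpoint arguments are omitted and my_model_interface is assumed
# to be a ModelInterface implementation that supports coalesce_requests_to_batch.
manager = get_nim_client_manager()                 # process-wide singleton
client = manager.get_client(
    model_interface=my_model_interface,
    protocol="grpc",
    enable_dynamic_batching=True,                  # new in this release
    dynamic_batch_timeout=0.1,                     # wait up to 100 ms while filling a batch
    dynamic_batch_memory_budget_mb=512,            # optional per-batch memory cap
)
# get_client() starts the background batcher thread when dynamic batching is enabled,
# so single items can be submitted without blocking; each call returns a Future:
future = client.submit(base64_image, model_name="yolox", dims=(1024, 768))
result = future.result()                           # blocks until the batcher completes the request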
@@ -3,7 +3,10 @@
  # SPDX-License-Identifier: Apache-2.0

  import logging
+ from typing import Any
+ from typing import Dict
  from typing import Optional
+ from typing import Tuple


  logger = logging.getLogger(__name__)
@@ -79,3 +82,45 @@ class ModelInterface:
  The name of the model interface.
  """
  raise NotImplementedError("Subclasses should implement this method")
+
+ def coalesce_requests_to_batch(self, requests, protocol: str, **kwargs) -> Tuple[Any, Dict[str, Any]]:
+ """
+ Takes a list of InferenceRequest objects and combines them into a single
+ formatted batch ready for inference.
+
+ THIS METHOD IS REQUIRED FOR DYNAMIC BATCHING SUPPORT.
+
+ Parameters
+ ----------
+ requests : List[InferenceRequest]
+ A list of InferenceRequest namedtuples collected for the batch.
+ Each tuple contains the data, dimensions, and other context for a single item.
+ protocol : str
+ The inference protocol, either "grpc" or "http".
+ **kwargs : Any
+ Additional keyword arguments passed from the original request.
+
+ Returns
+ -------
+ Tuple[Any, Dict[str, Any]]
+ A tuple containing the single formatted batch and its scratch-pad data.
+ """
+ raise NotImplementedError(
+ f"{self.__class__.__name__} does not support dynamic batching "
+ "because `coalesce_requests_to_batch` is not implemented."
+ )
+
+ def does_item_fit_in_batch(self, current_batch, next_request, memory_budget_bytes: int) -> bool:
+ """
+ Checks if adding another request to the current batch would exceed the memory budget.
+
+ This is a model-specific calculation. The default implementation always
+ returns True, effectively ignoring the memory budget. Interfaces for models
+ that require memory management (like padded image models) must override this.
+
+ Returns
+ -------
+ bool
+ True if the item fits within the budget, False otherwise.
+ """
+ return True
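To make the new hooks concrete, here is a minimal sketch (not part of the diff) of how a model interface might opt in to dynamic batching. The coalesce signature follows the call site in NimClient._process_dynamic_batch, which passes a list of data items and a list of (height, width) dims; the class name, payload layout, and byte estimate are assumptions for illustration only.

class ExampleImageInterface(ModelInterface):
    # Other ModelInterface methods (prepare_data_for_inference,
    # process_inference_results, etc.) are omitted for brevity.
    def name(self) -> str:
        return "example_image"

    def coalesce_requests_to_batch(self, data_items, dims, protocol, model_name=None, **kwargs):
        # Combine the individual items into one batch payload and keep the per-item
        # shapes so process_inference_results can split the batched output.
        batch_input = {"inputs": list(data_items)}          # assumed payload layout
        batch_data = {"original_image_shapes": list(dims)}  # key read back by NimClient
        return batch_input, batch_data

    def does_item_fit_in_batch(self, current_batch, next_request, memory_budget_bytes):
        # Rough estimate: treat each queued item as an H x W x 3 uint8 image.
        estimated = sum(h * w * 3 for h, w in (req.dims for req in current_batch))
        estimated += next_request.dims[0] * next_request.dims[1] * 3
        return estimated <= memory_budget_bytes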
@@ -0,0 +1,9 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+
+ from nv_ingest_api.util.dataloader.dataloader import DataLoader, MediaInterface
+
+ __all__ = ["DataLoader", "MediaInterface"]
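Assuming this new file is the package's __init__.py (the diff viewer does not show file paths), the re-export means both symbols can be imported from the package root; the DataLoader and MediaInterface APIs themselves are not shown in this diff.

# Hypothetical usage of the new public import path
from nv_ingest_api.util.dataloader import DataLoader, MediaInterface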