nv-ingest-api 2025.10.29.dev20251029__py3-none-any.whl → 2025.10.30.dev20251030__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/primitives/nim/nim_client.py +124 -14
- {nv_ingest_api-2025.10.29.dev20251029.dist-info → nv_ingest_api-2025.10.30.dev20251030.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.10.29.dev20251029.dist-info → nv_ingest_api-2025.10.30.dev20251030.dist-info}/RECORD +6 -6
- {nv_ingest_api-2025.10.29.dev20251029.dist-info → nv_ingest_api-2025.10.30.dev20251030.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.10.29.dev20251029.dist-info → nv_ingest_api-2025.10.30.dev20251030.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.10.29.dev20251029.dist-info → nv_ingest_api-2025.10.30.dev20251030.dist-info}/top_level.txt +0 -0
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
|
+
import re
|
|
8
9
|
import threading
|
|
9
10
|
import time
|
|
10
11
|
import queue
|
|
@@ -24,6 +25,12 @@ from nv_ingest_api.util.string_processing import generate_url
|
|
|
24
25
|
|
|
25
26
|
logger = logging.getLogger(__name__)
|
|
26
27
|
|
|
28
|
+
# Case-insensitive matcher for Triton gRPC error messages that are treated as
# CUDA-related: illegal memory access, invalid argument, and failed
# copy/load/perform operations (including the TritonModelException copy failure).
CUDA_ERROR_REGEX = re.compile(
    r"(illegal memory access|invalid argument|failed to (copy|load|perform) .*: .*|TritonModelException: failed to copy data: .*)",  # noqa: E501
    flags=re.IGNORECASE,
)
|
|
33
|
+
|
|
27
34
|
# A simple structure to hold a request's data and its Future for the result
|
|
28
35
|
InferenceRequest = namedtuple("InferenceRequest", ["data", "future", "model_name", "dims", "kwargs"])
|
|
29
36
|
|
|
@@ -40,7 +47,7 @@ class NimClient:
|
|
|
40
47
|
endpoints: Tuple[str, str],
|
|
41
48
|
auth_token: Optional[str] = None,
|
|
42
49
|
timeout: float = 120.0,
|
|
43
|
-
max_retries: int =
|
|
50
|
+
max_retries: int = 10,
|
|
44
51
|
max_429_retries: int = 5,
|
|
45
52
|
enable_dynamic_batching: bool = False,
|
|
46
53
|
dynamic_batch_timeout: float = 0.1, # 100 milliseconds
|
|
@@ -60,11 +67,11 @@ class NimClient:
|
|
|
60
67
|
auth_token : str, optional
|
|
61
68
|
Authorization token for HTTP requests (default: None).
|
|
62
69
|
timeout : float, optional
|
|
63
|
-
Timeout for HTTP requests in seconds (default:
|
|
70
|
+
Timeout for HTTP requests in seconds (default: 120.0).
|
|
64
71
|
max_retries : int, optional
|
|
65
|
-
The maximum number of retries for non-429 server-side errors (default:
|
|
72
|
+
The maximum number of retries for non-429 server-side errors (default: 10).
|
|
66
73
|
max_429_retries : int, optional
|
|
67
|
-
The maximum number of retries specifically for 429 errors (default:
|
|
74
|
+
The maximum number of retries specifically for 429 errors (default: 5).
|
|
68
75
|
|
|
69
76
|
Raises
|
|
70
77
|
------
|
|
@@ -323,7 +330,7 @@ class NimClient:
|
|
|
323
330
|
|
|
324
331
|
outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
|
|
325
332
|
|
|
326
|
-
base_delay = 0
|
|
333
|
+
base_delay = 2.0
|
|
327
334
|
attempt = 0
|
|
328
335
|
retries_429 = 0
|
|
329
336
|
max_grpc_retries = self.max_429_retries
|
|
@@ -342,8 +349,58 @@ class NimClient:
|
|
|
342
349
|
return [response.as_numpy(output.name()) for output in outputs]
|
|
343
350
|
|
|
344
351
|
except grpcclient.InferenceServerException as e:
|
|
345
|
-
status = e.status()
|
|
346
|
-
|
|
352
|
+
status = str(e.status())
|
|
353
|
+
message = e.message()
|
|
354
|
+
|
|
355
|
+
# Handle CUDA memory errors
|
|
356
|
+
if status == "StatusCode.INTERNAL":
|
|
357
|
+
if CUDA_ERROR_REGEX.search(message):
|
|
358
|
+
logger.warning(
|
|
359
|
+
f"Received gRPC INTERNAL error with CUDA-related message for model '{model_name}'. "
|
|
360
|
+
f"Attempt {attempt + 1} of {self.max_retries}. Message (truncated): {message[:500]}"
|
|
361
|
+
)
|
|
362
|
+
if attempt >= self.max_retries - 1:
|
|
363
|
+
logger.error(f"Max retries exceeded for CUDA errors on model '{model_name}'.")
|
|
364
|
+
raise e
|
|
365
|
+
# Try to reload models before retrying
|
|
366
|
+
model_reload_succeeded = reload_models(client=self.client, client_timeout=self.timeout)
|
|
367
|
+
if not model_reload_succeeded:
|
|
368
|
+
logger.error(f"Failed to reload models for model '{model_name}'.")
|
|
369
|
+
else:
|
|
370
|
+
logger.warning(
|
|
371
|
+
f"Received gRPC INTERNAL error for model '{model_name}'. "
|
|
372
|
+
f"Attempt {attempt + 1} of {self.max_retries}. Message (truncated): {message[:500]}"
|
|
373
|
+
)
|
|
374
|
+
if attempt >= self.max_retries - 1:
|
|
375
|
+
logger.error(f"Max retries exceeded for INTERNAL error on model '{model_name}'.")
|
|
376
|
+
raise e
|
|
377
|
+
|
|
378
|
+
# Common retry logic for both CUDA and non-CUDA INTERNAL errors
|
|
379
|
+
backoff_time = base_delay * (2**attempt)
|
|
380
|
+
time.sleep(backoff_time)
|
|
381
|
+
attempt += 1
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
# Handle errors that can occur after model reload (NOT_FOUND, model not loaded)
|
|
385
|
+
if status == "StatusCode.NOT_FOUND":
|
|
386
|
+
logger.warning(
|
|
387
|
+
f"Received gRPC {status} error for model '{model_name}'. "
|
|
388
|
+
f"Attempt {attempt + 1} of {self.max_retries}. Message: {message[:500]}"
|
|
389
|
+
)
|
|
390
|
+
if attempt >= self.max_retries - 1:
|
|
391
|
+
logger.error(f"Max retries exceeded for model not found errors on model '{model_name}'.")
|
|
392
|
+
raise e
|
|
393
|
+
|
|
394
|
+
# Retry with exponential backoff WITHOUT reloading
|
|
395
|
+
backoff_time = base_delay * (2**attempt)
|
|
396
|
+
logger.info(
|
|
397
|
+
f"Retrying after {backoff_time}s backoff for model not found error on model '{model_name}'."
|
|
398
|
+
)
|
|
399
|
+
time.sleep(backoff_time)
|
|
400
|
+
attempt += 1
|
|
401
|
+
continue
|
|
402
|
+
|
|
403
|
+
if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in message.lower():
|
|
347
404
|
retries_429 += 1
|
|
348
405
|
logger.warning(
|
|
349
406
|
f"Received gRPC {status} for model '{model_name}'. "
|
|
@@ -357,13 +414,12 @@ class NimClient:
|
|
|
357
414
|
time.sleep(backoff_time)
|
|
358
415
|
continue
|
|
359
416
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
raise
|
|
417
|
+
# For other server-side errors (e.g., INVALID_ARGUMENT, etc.),
|
|
418
|
+
# fail fast as retrying will not help
|
|
419
|
+
logger.error(
|
|
420
|
+
f"Received non-retryable gRPC error {status} from Triton for model '{model_name}': {message}"
|
|
421
|
+
)
|
|
422
|
+
raise
|
|
367
423
|
|
|
368
424
|
except Exception as e:
|
|
369
425
|
# Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
|
|
@@ -681,3 +737,57 @@ class NimClientManager:
|
|
|
681
737
|
def get_nim_client_manager(*args, **kwargs) -> NimClientManager:
    """
    Return the NimClientManager instance.

    All positional and keyword arguments are forwarded unchanged to
    ``NimClientManager``. The manager is described as a singleton; any
    enforcement of that lives in the class itself, not in this function.
    """
    manager = NimClientManager(*args, **kwargs)
    return manager
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def reload_models(
    client: grpcclient.InferenceServerClient,
    exclude: Optional[list[str]] = None,
    client_timeout: int = 120,
) -> bool:
    """
    Reloads all models in the Triton server except for the models in the exclude list.

    The reload runs in three phases: unload every selected model, load every
    selected model, then check that each one reports ready. Any Triton client
    error raised along the way is logged and reported as ``False`` rather than
    propagated, so callers can rely on the boolean result alone.

    Parameters
    ----------
    client : grpcclient.InferenceServerClient
        The gRPC client connected to the Triton server.
    exclude : list[str], optional
        A list of model names to exclude from reloading (default: None, meaning
        no model is excluded).
    client_timeout : int, optional
        Timeout in seconds applied to the readiness check (default: 120).

    Returns
    -------
    bool
        True if all models were successfully reloaded, False otherwise.
    """
    # Use None as the default instead of a shared mutable list.
    excluded = set(exclude or ())

    try:
        model_index = client.get_model_repository_index()
    except grpcclient.InferenceServerException as e:
        logger.error(f"[ERROR] Failed to query model repository index: {e.message()}")
        return False

    names = [m.name for m in model_index.models if m.name not in excluded]

    logger.info(f"Reloading {len(names)} model(s): {', '.join(names) if names else '(none)'}")

    # 1) Unload
    # NOTE(review): if an unload fails part-way through, models unloaded earlier
    # in this loop are left unloaded -- confirm whether callers need a rollback.
    for name in names:
        try:
            client.unload_model(name)
        except grpcclient.InferenceServerException as e:
            # message() may be None; normalise so the substring check cannot fail.
            msg = e.message() or ""
            if "explicit model load / unload" in msg.lower():
                status = e.status()
                logger.warning(
                    f"[SKIP Model Reload] Explicit model control disabled; cannot unload '{name}'. Status: {status}."
                )
                return False
            logger.error(f"[ERROR] Failed to unload '{name}': {msg}")
            return False

    # 2) Load
    for name in names:
        try:
            client.load_model(name)
        except grpcclient.InferenceServerException as e:
            # Honour the documented contract: report failure instead of raising.
            logger.error(f"[ERROR] Failed to load '{name}': {e.message()}")
            return False

    # 3) Readiness check
    for name in names:
        try:
            ready = client.is_model_ready(model_name=name, client_timeout=client_timeout)
        except grpcclient.InferenceServerException as e:
            logger.error(f"[ERROR] Failed to check readiness of '{name}': {e.message()}")
            return False
        if not ready:
            logger.warning(f"[Warning] Triton Not ready: {name}")
            return False

    logger.info("✅ Reload of models complete.")
    return True
|
|
@@ -50,7 +50,7 @@ nv_ingest_api/internal/primitives/control_message_task.py,sha256=nWVB3QsP6p8BKwH
|
|
|
50
50
|
nv_ingest_api/internal/primitives/ingest_control_message.py,sha256=8rA0UbPDSB3avReAKNxiUa_FCy7fIQpqk6tfmcYUibA,9879
|
|
51
51
|
nv_ingest_api/internal/primitives/nim/__init__.py,sha256=-dFBTHQnMKV0yc5tfSqIT-rkJXKtpcmyUfTPs8TJAi8,339
|
|
52
52
|
nv_ingest_api/internal/primitives/nim/default_values.py,sha256=W92XjfyeC6uuVxut6J7p00x1kpNsnXIDb97gSVytZJk,380
|
|
53
|
-
nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=
|
|
53
|
+
nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kT-JP9jbkXzotS7EeajTgfMbFWhMoD8o2JtOLYu1JuU,32770
|
|
54
54
|
nv_ingest_api/internal/primitives/nim/nim_model_interface.py,sha256=gWhyR33mIgEOYirq53WOk1bRl1SL0C_SVrM4w1-JmKU,4166
|
|
55
55
|
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
56
56
|
nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1ExW5V6pXC1ZiHdobeG_BmbPr3rBbVJef13s,11003
|
|
@@ -165,10 +165,10 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
|
|
|
165
165
|
nv_ingest_api/util/string_processing/yaml.py,sha256=4Zdmc4474lUZn6kznqaNTlQJwsmRnnJQZ-DvAWLu-zo,2678
|
|
166
166
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
167
|
nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
|
|
168
|
-
nv_ingest_api-2025.10.
|
|
168
|
+
nv_ingest_api-2025.10.30.dev20251030.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
169
169
|
udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
|
|
170
170
|
udfs/llm_summarizer_udf.py,sha256=lH5c5NHoT-5ecHC3og_40u1Ujta8SpsKU4X0e4wzbMU,7314
|
|
171
|
-
nv_ingest_api-2025.10.
|
|
172
|
-
nv_ingest_api-2025.10.
|
|
173
|
-
nv_ingest_api-2025.10.
|
|
174
|
-
nv_ingest_api-2025.10.
|
|
171
|
+
nv_ingest_api-2025.10.30.dev20251030.dist-info/METADATA,sha256=Gv6plGuAgs8l0Zb7RDZv4eyLhcL-ajOSAKgH8SW3aRI,14106
|
|
172
|
+
nv_ingest_api-2025.10.30.dev20251030.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
173
|
+
nv_ingest_api-2025.10.30.dev20251030.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
|
|
174
|
+
nv_ingest_api-2025.10.30.dev20251030.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|