nv-ingest-client 2025.10.6.dev20251006__py3-none-any.whl → 2025.10.8.dev20251008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-client has been flagged as potentially problematic by the registry.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +455 -185
- nv_ingest_client/client/ingest_job_handler.py +384 -0
- nv_ingest_client/client/interface.py +36 -6
- nv_ingest_client/nv_ingest_cli.py +6 -3
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/RECORD +13 -12
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.10.6.dev20251006.dist-info → nv_ingest_client-2025.10.8.dev20251008.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,11 @@ import json
 import logging
 import math
 import os
+import random
 import time
+import threading
+import copy
+from statistics import mean, median
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
@@ -76,9 +80,12 @@ class _ConcurrentProcessor:
         batch_size: int,
         timeout: Tuple[int, Union[float, None]],
         max_job_retries: Optional[int],
+        retry_delay: float,
+        initial_fetch_delay: float,
         completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
         fail_on_submit_error: bool,
         stream_to_callback_only: bool,
+        return_full_response: bool,
         verbose: bool = False,
     ):
         """
@@ -128,14 +135,19 @@ class _ConcurrentProcessor:
         self.batch_size = batch_size
         self.timeout = timeout
         self.max_job_retries = max_job_retries
+        self.retry_delay = retry_delay
+        self.initial_fetch_delay = initial_fetch_delay
         self.completion_callback = completion_callback
         self.fail_on_submit_error = fail_on_submit_error
         self.stream_to_callback_only = stream_to_callback_only
+        self.return_full_response = return_full_response
         self.verbose = verbose

         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
+        self.next_allowed_fetch_time: Dict[str, float] = {}
+        self._retry_backoff_cap: float = 5.0
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)

@@ -183,6 +195,8 @@ class _ConcurrentProcessor:
         # Cleanup retry count if it exists for this job
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
+        if job_index in self.next_allowed_fetch_time:
+            del self.next_allowed_fetch_time[job_index]

         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
@@ -231,11 +245,14 @@ class _ConcurrentProcessor:
         elif self.stream_to_callback_only:
             self.results.append(job_index)
         else:
-
+            # When requested, return the full response envelope (includes 'trace' and 'annotations')
+            self.results.append(result_data if self.return_full_response else result_data.get("data"))

         # Cleanup retry count if it exists
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
+        if job_index in self.next_allowed_fetch_time:
+            del self.next_allowed_fetch_time[job_index]

         # Execute completion callback if provided
         if self.completion_callback:
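Note on the change above: when return_full_response is set, the success handler now appends the whole response envelope rather than only its "data" payload. A small standalone illustration follows; the envelope layout is hypothetical, and only the 'data', 'trace', and 'annotations' key names come from the comment and code in this hunk.

    # Hypothetical envelope shape; only the key names are taken from the diff above.
    envelope = {"data": [{"content": "..."}], "trace": {}, "annotations": {}}
    for return_full_response in (False, True):
        picked = envelope if return_full_response else envelope.get("data")
        print(type(picked).__name__)  # 'list' for data-only, 'dict' for the full envelope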
@@ -275,20 +292,266 @@ class _ConcurrentProcessor:
         except Exception:
             logger.warning("Could not reliably extract job indices from results for final check.")

-        initial_indices = set(self.all_job_indices_list)
-        unaccounted_indices = initial_indices - processed_indices
-
-        if unaccounted_indices:
-            logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
-        # Optionally add them to failures
-        # for idx in unaccounted_indices:
-        #     if not any(f[0] == idx for f in self.failures):
-        #         self.failures.append((idx, "Job lost or unaccounted for at exit"))
-
     # --------------------------------------------------------------------------
-    #
+    # Declarative Helper Methods (behavior preserved)
     # --------------------------------------------------------------------------

+    def _collect_retry_jobs_for_batch(self) -> List[str]:
+        """
+        Collect eligible retry jobs for this batch based on per-job next-allowed time.
+
+        Returns
+        -------
+        List[str]
+            The list of job indices that should be retried in this batch.
+        """
+        if not self.retry_job_ids:
+            return []
+
+        now = time.time()
+        eligible: List[str] = []
+        remaining: List[str] = []
+        for job_id in self.retry_job_ids:
+            allowed_at = self.next_allowed_fetch_time.get(job_id, 0.0)
+            if allowed_at <= now:
+                eligible.append(job_id)
+            else:
+                remaining.append(job_id)
+
+        if eligible and self.verbose:
+            logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
+
+        # Keep non-eligible retries for a later batch
+        self.retry_job_ids = remaining
+        return eligible
+
+    def _schedule_retry(self, job_index: str) -> None:
+        """
+        Schedule a paced retry for a job using exponential backoff with jitter.
+        """
+        now = time.time()
+        attempt = max(1, self.retry_counts.get(job_index, 1))
+        base = max(0.01, float(self.retry_delay) if self.retry_delay is not None else 1.0)
+        delay = min(base * (2 ** (attempt - 1)), self._retry_backoff_cap)
+        jitter = random.uniform(0.8, 1.2)
+        wait_s = delay * jitter
+        self.next_allowed_fetch_time[job_index] = now + wait_s
+        if job_index not in self.retry_job_ids:
+            self.retry_job_ids.append(job_index)
+
+    def _select_new_jobs_for_batch(
+        self,
+        submitted_new_indices_count: int,
+        total_jobs: int,
+        already_in_batch: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Determine the slice of new jobs to include in the current batch based on
+        remaining capacity and unsubmitted jobs.
+
+        Note: This does NOT change submitted_new_indices_count. The original code
+        increments that counter only after submission is attempted/handled.
+        """
+        if (already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
+            num_new_to_add = min(self.batch_size - already_in_batch, total_jobs - submitted_new_indices_count)
+            start_idx = submitted_new_indices_count
+            end_idx = submitted_new_indices_count + num_new_to_add
+            new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+
+            if self.verbose:
+                logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")
+
+            return new_job_indices, submitted_new_indices_count
+
+        return [], submitted_new_indices_count
+
+    def _submit_new_jobs_async(
+        self,
+        current_batch_new_job_indices: List[str],
+        current_batch_job_indices: List[str],
+        submitted_new_indices_count: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Initiate asynchronous submission for the new jobs selected for this batch.
+
+        Mirrors the original inline submission block, including error handling and
+        fail_on_submit_error semantics. Returns potentially updated batch indices and
+        submitted count.
+        """
+        if not current_batch_new_job_indices:
+            return current_batch_job_indices, submitted_new_indices_count
+
+        if not self.job_queue_id:
+            error_msg = "Cannot submit new jobs: job_queue_id is not set."
+            logger.error(error_msg)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise ValueError(error_msg)
+            return current_batch_job_indices, submitted_new_indices_count
+
+        try:
+            # Fire-and-forget submission initiation
+            _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
+            # Add successfully initiated jobs to the overall batch list
+            current_batch_job_indices.extend(current_batch_new_job_indices)
+            # Stagger the first fetch attempt slightly to avoid immediate 202s
+            now = time.time()
+            for job_index in current_batch_new_job_indices:
+                allowed_at = self.next_allowed_fetch_time.get(job_index, 0.0)
+                self.next_allowed_fetch_time[job_index] = max(allowed_at, now + float(self.initial_fetch_delay))
+            # Update count of total initiated jobs
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            return current_batch_job_indices, submitted_new_indices_count
+        except Exception as e:
+            error_msg = (
+                f"Batch async submission initiation failed for {len(current_batch_new_job_indices)} new jobs: {e}"
+            )
+            logger.error(error_msg, exc_info=True)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(
+                    job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                )
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise RuntimeError(error_msg) from e
+            return current_batch_job_indices, submitted_new_indices_count
+
+    def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
+        """
+        Initiate fetching for the prepared batch and ensure consistency of returned futures.
+
+        Returns
+        -------
+        batch_futures_dict : Dict[Future, str]
+            Mapping of futures to their associated job indices.
+        normalized_job_indices : List[str]
+            The job indices normalized to those actually returned by the client if a discrepancy occurs.
+        """
+        # Filter indices by next_allowed_fetch_time to respect pacing for new jobs
+        now = time.time()
+        eligible_indices: List[str] = []
+        deferred_indices: List[str] = []
+        for idx in current_batch_job_indices:
+            if self.next_allowed_fetch_time.get(idx, 0.0) <= now:
+                eligible_indices.append(idx)
+            else:
+                deferred_indices.append(idx)
+
+        # Defer ineligible jobs for later retry window
+        for idx in deferred_indices:
+            if idx not in self.retry_job_ids:
+                self.retry_job_ids.append(idx)
+
+        if self.verbose:
+            logger.debug(
+                f"Calling fetch_job_result_async for {len(eligible_indices)} eligible jobs "
+                f"(deferred {len(deferred_indices)})."
+            )
+        # Use data_only=False to get full response for callback/results
+        batch_futures_dict = (
+            self.client.fetch_job_result_async(eligible_indices, data_only=False) if eligible_indices else {}
+        )

+        # Check for discrepancies where client might not return all futures
+        if eligible_indices and (len(batch_futures_dict) != len(eligible_indices)):
+            returned_indices = set(batch_futures_dict.values())
+            missing_indices = [idx for idx in eligible_indices if idx not in returned_indices]
+            logger.error(
+                f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
+                f"{len(batch_futures_dict)}. Missing: {missing_indices}"
+            )
+            # Fail the missing ones explicitly
+            for missing_idx in missing_indices:
+                self._handle_processing_failure(
+                    missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
+                )
+            if self.fail_on_submit_error:
+                raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
+            # Continue processing only the futures we received
+            normalized_job_indices = list(returned_indices)
+        else:
+            normalized_job_indices = list(eligible_indices)
+
+        return batch_futures_dict, normalized_job_indices
+
+    def _process_batch_futures(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
+        """
+        Process the batch futures as they complete, handling success, 202-timeout retries,
+        and failures according to existing logic.
+        """
+        if not batch_futures_dict:
+            if self.verbose:
+                logger.debug("No futures returned/available for processing in this batch.")
+            return
+
+        try:
+            for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
+                job_index = batch_futures_dict[future]
+                try:
+                    # Expect list with one tuple: [(data, index, trace)]
+                    result_list = future.result()
+                    if not isinstance(result_list, list) or len(result_list) != 1:
+                        raise ValueError(f"Expected list length 1, got {len(result_list)}")
+
+                    result_tuple = result_list[0]
+                    if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
+                        raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
+
+                    full_response_dict, fetched_job_index, trace_id = result_tuple
+
+                    if fetched_job_index != job_index:
+                        logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
+
+                    self._handle_processing_success(job_index, full_response_dict, trace_id)
+
+                except TimeoutError:
+                    # Handle job not ready - check retry policy and schedule paced retry
+                    self.retry_counts[job_index] += 1
+                    if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
+                        if self.verbose:
+                            logger.info(
+                                f"Job {job_index} not ready, scheduling paced retry (Attempt "
+                                f"{self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
+                            )
+                        self._schedule_retry(job_index)
+                    else:
+                        error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
+                        logger.error(error_msg)
+                        self._handle_processing_failure(job_index, error_msg)
+
+                except (ValueError, RuntimeError) as e:
+                    logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
+                    self._handle_processing_failure(job_index, f"Error processing result: {e}")
+                except Exception as e:
+                    logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
+                    self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
+
+        except TimeoutError:
+            self._handle_batch_timeout(batch_futures_dict, batch_timeout)
+
+    def _handle_batch_timeout(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
+        """
+        Handle a timeout while waiting for batch futures, mirroring the original behavior.
+        """
+        logger.error(
+            f"Batch processing timed out after {batch_timeout}s waiting for futures. "
+            "Some jobs in batch may be lost or incomplete."
+        )
+        remaining_indices_in_batch = []
+        for f, idx in batch_futures_dict.items():
+            if not f.done():
+                remaining_indices_in_batch.append(idx)
+                f.cancel()  # Attempt to cancel underlying task
+        logger.warning(f"Jobs potentially lost/cancelled due to batch timeout: {remaining_indices_in_batch}")
+        for idx in remaining_indices_in_batch:
+            self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
+
     def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
         """
         Executes the main processing loop in batches.
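Note: the paced retry added in this hunk replaces the old retry-next-batch behavior. _schedule_retry computes an exponential backoff on the per-job attempt count, caps it at _retry_backoff_cap (5.0 s here), and applies +/-20% jitter before the next fetch is allowed. A minimal standalone sketch of that schedule, assuming the new retry_delay default of 0.5 s; the function name below is illustrative and not part of the package:

    import random

    def next_fetch_delay(attempt: int, retry_delay: float = 0.5, cap: float = 5.0) -> float:
        base = max(0.01, retry_delay)
        delay = min(base * (2 ** (max(1, attempt) - 1)), cap)
        return delay * random.uniform(0.8, 1.2)  # same jitter band as _schedule_retry

    # Roughly 0.5, 1.0, 2.0, 4.0, 5.0, 5.0 seconds before jitter for attempts 1-6.
    for attempt in range(1, 7):
        print(attempt, round(next_fetch_delay(attempt), 2))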
@@ -315,123 +578,66 @@ class _ConcurrentProcessor:
         initiation error occurs.
         """
         total_jobs = len(self.all_job_indices_list)
-        # Tracks indices for which submission has been initiated at least once
-        submitted_new_indices_count = 0
+        submitted_new_indices_count = 0  # Tracks indices for which submission has been initiated at least once

-        logger.info(f"Starting batch processing for {total_jobs} jobs with batch
+        logger.info(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")

-        # Main loop: continues as long as there are new jobs to submit
-        # or jobs waiting for retry.
         while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:

-            #
-            current_batch_job_indices: List[str] =
+            # 1) Collect retries intended for this batch
+            current_batch_job_indices: List[str] = self._collect_retry_jobs_for_batch()

-            #
-
-
-
-
-
-            # Clear the list; retries for *this* batch will be collected later
-            self.retry_job_ids = []
-
-            # Determine and add new jobs to the batch
-            num_already_in_batch = len(current_batch_job_indices)
-            if (num_already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
-                num_new_to_add = min(self.batch_size - num_already_in_batch, total_jobs - submitted_new_indices_count)
-                start_idx = submitted_new_indices_count
-                end_idx = submitted_new_indices_count + num_new_to_add
-                current_batch_new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+            # 2) Select new jobs to fill the batch capacity
+            current_batch_new_job_indices, submitted_new_indices_count = self._select_new_jobs_for_batch(
+                submitted_new_indices_count=submitted_new_indices_count,
+                total_jobs=total_jobs,
+                already_in_batch=len(current_batch_job_indices),
+            )

-
-
+            # 3) Initiate async submission for the selected new jobs
+            try:
+                current_batch_job_indices, submitted_new_indices_count = self._submit_new_jobs_async(
+                    current_batch_new_job_indices,
+                    current_batch_job_indices,
+                    submitted_new_indices_count,
+                )
+            except Exception as e:  # noqa: F841
+                # Preserve original fail-on-submit behavior
+                # (errors already logged and failures recorded inside helper)
+                if self.fail_on_submit_error:
+                    raise

-
-            if current_batch_new_job_indices:
-                if not self.job_queue_id:
-                    error_msg = "Cannot submit new jobs: job_queue_id is not set."
-                    logger.error(error_msg)
-                    # Fail these jobs immediately
-                    for job_index in current_batch_new_job_indices:
-                        self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
-                    # Mark as "submitted" (to prevent reprocessing) but failed
-                    submitted_new_indices_count += len(current_batch_new_job_indices)
-                    if self.fail_on_submit_error:
-                        raise ValueError(error_msg)
-                else:
-                    try:
-                        # Fire-and-forget submission initiation
-                        _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
-                        # Add successfully initiated jobs to the overall batch list
-                        current_batch_job_indices.extend(current_batch_new_job_indices)
-                        # Update count of total initiated jobs
-                        submitted_new_indices_count += len(current_batch_new_job_indices)
-                    except Exception as e:
-                        error_msg = (
-                            f"Batch async submission initiation failed for "
-                            f"{len(current_batch_new_job_indices)} new jobs: {e}"
-                        )
-                        logger.error(error_msg, exc_info=True)
-                        # Fail these jobs immediately
-                        for job_index in current_batch_new_job_indices:
-                            self._handle_processing_failure(
-                                job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
-                            )
-                        # Mark as "submitted" (to prevent reprocessing) but failed
-                        submitted_new_indices_count += len(current_batch_new_job_indices)
-                        if self.fail_on_submit_error:
-                            raise RuntimeError(error_msg) from e
-
-            # If nothing ended up in the batch (e.g., only submission failures)
+            # 4) If no jobs to fetch this cycle, decide whether to exit or continue
             if not current_batch_job_indices:
                 if self.verbose:
                     logger.debug("No jobs identified for fetching in this batch iteration.")
-                # If there are no retries pending either, break the loop
                 if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
                     logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
                     break
-
-
-
+                # If retries remain but are not yet eligible, sleep until earliest allowed
+                if self.retry_job_ids:
+                    now = time.time()
+                    future_times = [self.next_allowed_fetch_time.get(j, now) for j in self.retry_job_ids]
+                    # Consider only times in the future
+                    future_times = [t for t in future_times if t > now]
+                    if future_times:
+                        sleep_for = min(max(min(future_times) - now, 0.05), 1.0)
+                        if self.verbose:
+                            logger.debug(f"Pacing retries: sleeping {sleep_for:.2f}s waiting for next allowed fetch.")
+                        time.sleep(sleep_for)
+                continue
+
+            # 5) Initiate fetching for the current batch
             try:
-
-                logger.debug(
-                    f"Calling fetch_job_result_async for "
-                    f"{len(current_batch_job_indices)} jobs in current batch."
-                )
-                # Use data_only=False to get full response for callback/results
-                batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
-
-                # Check for discrepancies where client might not return all futures
-                if len(batch_futures_dict) != len(current_batch_job_indices):
-                    returned_indices = set(batch_futures_dict.values())
-                    missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
-                    logger.error(
-                        f"fetch_job_result_async discrepancy: Expected "
-                        f"{len(current_batch_job_indices)}, got "
-                        f"{len(batch_futures_dict)}. Missing: {missing_indices}"
-                    )
-                    # Fail the missing ones explicitly
-                    for missing_idx in missing_indices:
-                        self._handle_processing_failure(
-                            missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
-                        )
-                    if self.fail_on_submit_error:
-                        raise RuntimeError("fetch_job_result_async failed to return all " "expected futures.")
-                    # Continue processing only the futures we received
-                    current_batch_job_indices = list(returned_indices)
-
+                batch_futures_dict, _ = self._initiate_fetch_for_batch(current_batch_job_indices)
             except Exception as fetch_init_err:
                 error_msg = (
-                    f"fetch_job_result_async failed for batch "
-                    f"({len(current_batch_job_indices)} jobs): {fetch_init_err}"
+                    f"fetch_job_result_async failed for batch ({len(current_batch_job_indices)} jobs): {fetch_init_err}"
                 )
                 logger.error(error_msg, exc_info=True)
                 logger.warning(
-                    f"Marking all {len(current_batch_job_indices)} jobs in
+                    f"Marking all {len(current_batch_job_indices)} jobs in failed fetch initiation batch as failed."
                 )
-                # Fail all jobs intended for this batch
                 for job_index in current_batch_job_indices:
                     self._handle_processing_failure(
                         job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
@@ -440,79 +646,11 @@ class _ConcurrentProcessor:
                     raise RuntimeError(
                         f"Stopping due to fetch initiation failure: {fetch_init_err}"
                     ) from fetch_init_err
-                continue
+                continue

-            #
-
-
-                    logger.debug("No futures returned/available for processing in this batch.")
-                continue  # Skip processing if no futures
-
-            batch_timeout = 600.0  # Timeout for waiting on the whole batch
-            try:
-                # Process futures as they complete within this batch
-                for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
-                    job_index = batch_futures_dict[future]
-                    try:
-                        # Expect list with one tuple: [(data, index, trace)]
-                        result_list = future.result()
-                        if not isinstance(result_list, list) or len(result_list) != 1:
-                            raise ValueError(f"Expected list length 1, got {len(result_list)}")
-
-                        result_tuple = result_list[0]
-                        if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
-                            raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
-
-                        full_response_dict, fetched_job_index, trace_id = result_tuple
-
-                        if fetched_job_index != job_index:
-                            logger.warning(f"Mismatch: Future for {job_index} returned " f"{fetched_job_index}")
-
-                        self._handle_processing_success(job_index, full_response_dict, trace_id)
-
-                    except TimeoutError:
-                        # Handle job not ready - check retry policy
-                        self.retry_counts[job_index] += 1
-                        if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
-                            if self.verbose:
-                                logger.info(
-                                    f"Job {job_index} not ready, adding to next "
-                                    f"batch's retry list (Attempt "
-                                    f"{self.retry_counts[job_index]}/"
-                                    f"{self.max_job_retries or 'inf'})."
-                                )
-                            # Collect for the *next* batch
-                            self.retry_job_ids.append(job_index)
-                        else:
-                            error_msg = f"Exceeded max fetch retries " f"({self.max_job_retries}) for job {job_index}."
-                            logger.error(error_msg)
-                            self._handle_processing_failure(job_index, error_msg)
-
-                    except (ValueError, RuntimeError) as e:
-                        logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
-                        self._handle_processing_failure(job_index, f"Error processing result: {e}")
-                    except Exception as e:
-                        logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
-                        self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
-                    # No finally block incrementing count here; tracking is batch-based
-
-            except TimeoutError:
-                # `as_completed` timed out waiting for remaining futures in batch
-                logger.error(
-                    f"Batch processing timed out after {batch_timeout}s waiting "
-                    f"for futures. Some jobs in batch may be lost or incomplete."
-                )
-                # Identify and fail remaining futures
-                remaining_indices_in_batch = []
-                for f, idx in batch_futures_dict.items():
-                    if not f.done():
-                        remaining_indices_in_batch.append(idx)
-                        f.cancel()  # Attempt to cancel underlying task
-                logger.warning(
-                    f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
-                )
-                for idx in remaining_indices_in_batch:
-                    self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
+            # 6) Process results for the current batch
+            batch_timeout = 600.0
+            self._process_batch_futures(batch_futures_dict, batch_timeout)
             # End of processing for this batch cycle

             # --- Final Logging ---
@@ -576,7 +714,15 @@ class NvIngestClient:
         # Initialize the worker pool with the specified size
         self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)

+        # Telemetry state and controls
+        self._telemetry_lock = threading.Lock()
+        self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
+        try:
+            self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
+        except ValueError:
+            self._telemetry_max_calls = 10000
         self._telemetry = {}
+        self.reset_telemetry()

     def __str__(self) -> str:
         """
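Note: the constructor changes above wire telemetry collection to two environment variables. A hedged sketch of toggling them before constructing the client; NvIngestClient constructor arguments are omitted because they are not shown in this hunk:

    import os

    os.environ["NV_INGEST_CLIENT_TELEMETRY"] = "1"               # "0" disables collection
    os.environ["NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS"] = "5000"  # caps each recorded call list

    # client = NvIngestClient(...)  # the telemetry dict is then initialized via reset_telemetry()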
@@ -624,6 +770,106 @@ class NvIngestClient:

         return job_state

+    # ------------------------------------------------------------------
+    # Telemetry helpers
+    # ------------------------------------------------------------------
+
+    def enable_telemetry(self, enabled: bool) -> None:
+        with self._telemetry_lock:
+            self._telemetry_enabled = bool(enabled)
+
+    def reset_telemetry(self) -> None:
+        with self._telemetry_lock:
+            self._telemetry = {
+                "started_at": time.time(),
+                "submit": {"count": 0, "calls": []},
+                "fetch": {"count": 0, "last_ts": None, "intervals": [], "calls": []},
+                "per_job": {},
+            }
+
+    def _t_per_job(self, job_index: str) -> Dict[str, Any]:
+        pj = self._telemetry["per_job"].get(job_index)
+        if pj is None:
+            pj = {"submits": [], "fetch_attempts": [], "timeouts_202": 0, "failures": 0, "first_success_ts": None}
+            self._telemetry["per_job"][job_index] = pj
+        return pj
+
+    def _t_append_capped(self, arr: List[Any], item: Any) -> None:
+        if len(arr) < self._telemetry_max_calls:
+            arr.append(item)
+
+    def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["submit"]["count"] += 1
+            self._t_append_capped(
+                self._telemetry["submit"]["calls"],
+                {"job": job_index, "status": status, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["submits"], ts)
+
+    def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["fetch"]["count"] += 1
+            last = self._telemetry["fetch"]["last_ts"]
+            if last is not None:
+                delta = ts - float(last)
+                if delta >= 0:
+                    self._t_append_capped(self._telemetry["fetch"]["intervals"], delta)
+            self._telemetry["fetch"]["last_ts"] = ts
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["fetch_attempts"], ts)
+
+    def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._t_append_capped(
+                self._telemetry["fetch"]["calls"],
+                {"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            if code == 2:  # 202 not ready
+                pj["timeouts_202"] += 1
+            if ok and pj["first_success_ts"] is None:
+                pj["first_success_ts"] = ts
+            if not ok and code not in (0, 2):
+                pj["failures"] += 1
+
+    def get_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            return copy.deepcopy(self._telemetry)
+
+    def summarize_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            submit_count = self._telemetry["submit"]["count"]
+            fetch_count = self._telemetry["fetch"]["count"]
+            intervals = list(self._telemetry["fetch"]["intervals"])
+            intervals.sort()
+            avg = mean(intervals) if intervals else 0.0
+            p50 = median(intervals) if intervals else 0.0
+            # p95 via index
+            p95 = intervals[int(0.95 * (len(intervals) - 1))] if intervals else 0.0
+            per_job = self._telemetry["per_job"]
+            # Aggregate per-job stats
+            jobs = len(per_job)
+            total_timeouts = sum(pj.get("timeouts_202", 0) for pj in per_job.values())
+            total_failures = sum(pj.get("failures", 0) for pj in per_job.values())
+            return {
+                "submit_count": submit_count,
+                "fetch_count": fetch_count,
+                "fetch_interval_avg": avg,
+                "fetch_interval_p50": p50,
+                "fetch_interval_p95": p95,
+                "jobs_tracked": jobs,
+                "timeouts_202_total": total_timeouts,
+                "failures_total": total_failures,
+            }
+
     def _get_and_check_job_state(
         self,
         job_index: str,
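Note: the helpers added above expose enable/reset/read entry points for the new telemetry. A hedged usage sketch; the client instance and the surrounding submit/fetch calls are assumed, only the method names and summary keys come from this hunk:

    client.reset_telemetry()                  # clear counters before a run
    # ... submit and fetch jobs ...
    summary = client.summarize_telemetry()    # counts, fetch-interval avg/p50/p95, 202 and failure totals
    print(summary["fetch_count"], summary["fetch_interval_p95"], summary["timeouts_202_total"])
    raw = client.get_telemetry()              # deep copy of the full per-job event lists (safe to mutate)
    client.enable_telemetry(False)            # stop recording for subsequent calls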
@@ -861,6 +1107,8 @@ class NvIngestClient:
         Exception
             For unexpected issues.
         """
+        ts_attempt = time.time()
+        self._t_record_fetch_attempt(job_index, ts_attempt)
         try:
             # Get job state using the client-side index
             job_state = self._get_and_check_job_state(
@@ -901,6 +1149,7 @@ class NvIngestClient:
                 logger.debug(
                     f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
                 )
+                self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
                 return result_data, job_index, job_state.trace_id

             except json.JSONDecodeError as err:
@@ -922,6 +1171,7 @@ class NvIngestClient:
             elif response.response_code == 2:  # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
                 # Raise TimeoutError to signal the calling retry loop in fetch_job_result
                 # Do not change job state here, remains SUBMITTED
+                self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
                 raise TimeoutError(f"Job not ready: {response.response_reason}")

             else:
@@ -934,6 +1184,7 @@ class NvIngestClient:
                 job_state.state = JobStateEnum.FAILED  # Mark job as failed in the client
                 # Do NOT pop the state for failed jobs here
                 # Raise RuntimeError to indicate a terminal failure for this fetch attempt
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
                 raise RuntimeError(error_msg)

         except (TimeoutError, ValueError, RuntimeError):
@@ -945,6 +1196,10 @@ class NvIngestClient:
             # Attempt to mark state as FAILED if possible and state object exists
             if "job_state" in locals() and hasattr(job_state, "state"):
                 job_state.state = JobStateEnum.FAILED
+            try:
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
+            except Exception:
+                pass
             raise  # Re-raise the original exception

     def fetch_job_result_cli(
@@ -1019,12 +1274,14 @@ class NvIngestClient:
         concurrency_limit: int = 64,
         timeout: int = 100,
         max_job_retries: Optional[int] = None,
-        retry_delay: float = 5
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
         fail_on_submit_error: bool = False,
         completion_callback: Optional[Callable[[Any, str], None]] = None,
         return_failures: bool = False,
         data_only: bool = True,
         stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
         verbose: bool = False,
     ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
         """
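Note: the signature above gains retry_delay, initial_fetch_delay, and return_full_response, and the retry_delay default drops from 5 to 0.5 seconds. A hedged call sketch; the method name (assumed to be process_jobs_concurrently), the client instance, and the job_indices/queue values are illustrative and not shown in this hunk:

    results, failures = client.process_jobs_concurrently(
        job_indices,                  # assumed positional argument with client-side job indices
        job_queue_id="ingest_queue",  # assumed queue name
        timeout=100,
        max_job_retries=None,
        retry_delay=0.5,              # base delay for the exponential fetch backoff
        initial_fetch_delay=0.3,      # stagger the first fetch to avoid immediate 202s
        return_failures=True,
        return_full_response=True,    # keep 'trace'/'annotations' envelopes in results
    )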
@@ -1056,6 +1313,9 @@ class NvIngestClient:
             If True, return (results, failures). Default is False.
         data_only : bool, optional
             If True, return only payload 'data'. Default is True.
+        return_full_response : bool, optional
+            If True, results contain the full response envelopes (including 'trace' and 'annotations').
+            Ignored when stream_to_callback_only=True. Default is False.
         verbose : bool, optional
             If True, enable debug logging. Default is False.

@@ -1082,8 +1342,8 @@ class NvIngestClient:
         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)

-        # Prepare timeout tuple for fetch calls
-        effective_timeout: Tuple[int,
+        # Prepare timeout tuple for fetch calls (enable long-poll): (connect<=5s, read=timeout)
+        effective_timeout: Tuple[int, int] = (min(5, int(timeout)), int(timeout))

         # Delegate to the concurrent processor
         processor = _ConcurrentProcessor(
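Note: the timeout handling above now builds a (connect, read) tuple so fetches can long-poll: the connect timeout is capped at 5 s while the read timeout is the caller-supplied value. A tiny check of the expression used in this hunk:

    timeout = 100
    effective_timeout = (min(5, int(timeout)), int(timeout))
    assert effective_timeout == (5, 100)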
@@ -1093,9 +1353,12 @@ class NvIngestClient:
             job_queue_id=job_queue_id,
             timeout=effective_timeout,
             max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
             completion_callback=completion_callback,
             fail_on_submit_error=fail_on_submit_error,
             stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
             verbose=verbose,
         )

@@ -1207,12 +1470,19 @@ class NvIngestClient:
             # Free up memory -- payload should never be used again, and we don't want to keep it around.
             job_state.job_spec.payload = None

+            try:
+                self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
+            except Exception:
+                pass
             return x_trace_id
         except Exception as err:
             err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
             logger.exception(err_msg)
             job_state.state = JobStateEnum.FAILED
-
+            try:
+                self._t_record_submit(job_index, "fail", time.time(), None)
+            except Exception:
+                pass
             raise

     def submit_job(