nv-ingest-client 2025.10.7.dev20251007__py3-none-any.whl → 2025.10.9.dev20251009__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-client might be problematic.

@@ -9,7 +9,11 @@ import json
 import logging
 import math
 import os
+import random
 import time
+import threading
+import copy
+from statistics import mean, median
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
@@ -76,9 +80,12 @@ class _ConcurrentProcessor:
         batch_size: int,
         timeout: Tuple[int, Union[float, None]],
         max_job_retries: Optional[int],
+        retry_delay: float,
+        initial_fetch_delay: float,
         completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
         fail_on_submit_error: bool,
         stream_to_callback_only: bool,
+        return_full_response: bool,
         verbose: bool = False,
     ):
         """
@@ -128,14 +135,19 @@ class _ConcurrentProcessor:
         self.batch_size = batch_size
         self.timeout = timeout
         self.max_job_retries = max_job_retries
+        self.retry_delay = retry_delay
+        self.initial_fetch_delay = initial_fetch_delay
         self.completion_callback = completion_callback
         self.fail_on_submit_error = fail_on_submit_error
         self.stream_to_callback_only = stream_to_callback_only
+        self.return_full_response = return_full_response
         self.verbose = verbose
 
         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
+        self.next_allowed_fetch_time: Dict[str, float] = {}
+        self._retry_backoff_cap: float = 5.0
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)
 
@@ -183,6 +195,8 @@ class _ConcurrentProcessor:
         # Cleanup retry count if it exists for this job
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
+        if job_index in self.next_allowed_fetch_time:
+            del self.next_allowed_fetch_time[job_index]
 
         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
@@ -231,11 +245,14 @@ class _ConcurrentProcessor:
         elif self.stream_to_callback_only:
             self.results.append(job_index)
         else:
-            self.results.append(result_data.get("data"))
+            # When requested, return the full response envelope (includes 'trace' and 'annotations')
+            self.results.append(result_data if self.return_full_response else result_data.get("data"))
 
         # Cleanup retry count if it exists
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
+        if job_index in self.next_allowed_fetch_time:
+            del self.next_allowed_fetch_time[job_index]
 
         # Execute completion callback if provided
         if self.completion_callback:
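The new `return_full_response` flag changes the shape of each entry appended to `results`. A minimal sketch of the difference, assuming a typical response envelope (only the 'data', 'trace', and 'annotations' keys are named in the diff; the rest of the shape here is illustrative):

```python
# Hypothetical envelope for one fetched job (shape assumed for illustration).
result_data = {
    "data": [{"document_type": "text", "metadata": {}}],
    "trace": {"trace::entry": 1728400000.0},
    "annotations": {"task::extract": "success"},
}

# return_full_response=False (default): results hold only the payload.
entry = result_data.get("data")

# return_full_response=True: results hold the whole envelope,
# so trace and annotation metadata survive into the caller's hands.
entry = result_data
```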
@@ -275,20 +292,266 @@ class _ConcurrentProcessor:
         except Exception:
             logger.warning("Could not reliably extract job indices from results for final check.")
 
-        initial_indices = set(self.all_job_indices_list)
-        unaccounted_indices = initial_indices - processed_indices
-
-        if unaccounted_indices:
-            logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
-            # Optionally add them to failures
-            # for idx in unaccounted_indices:
-            #     if not any(f[0] == idx for f in self.failures):
-            #         self.failures.append((idx, "Job lost or unaccounted for at exit"))
-
     # --------------------------------------------------------------------------
-    # Public Methods
+    # Declarative Helper Methods (behavior preserved)
     # --------------------------------------------------------------------------
 
+    def _collect_retry_jobs_for_batch(self) -> List[str]:
+        """
+        Collect eligible retry jobs for this batch based on per-job next-allowed time.
+
+        Returns
+        -------
+        List[str]
+            The list of job indices that should be retried in this batch.
+        """
+        if not self.retry_job_ids:
+            return []
+
+        now = time.time()
+        eligible: List[str] = []
+        remaining: List[str] = []
+        for job_id in self.retry_job_ids:
+            allowed_at = self.next_allowed_fetch_time.get(job_id, 0.0)
+            if allowed_at <= now:
+                eligible.append(job_id)
+            else:
+                remaining.append(job_id)
+
+        if eligible and self.verbose:
+            logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
+
+        # Keep non-eligible retries for a later batch
+        self.retry_job_ids = remaining
+        return eligible
+
+    def _schedule_retry(self, job_index: str) -> None:
+        """
+        Schedule a paced retry for a job using exponential backoff with jitter.
+        """
+        now = time.time()
+        attempt = max(1, self.retry_counts.get(job_index, 1))
+        base = max(0.01, float(self.retry_delay) if self.retry_delay is not None else 1.0)
+        delay = min(base * (2 ** (attempt - 1)), self._retry_backoff_cap)
+        jitter = random.uniform(0.8, 1.2)
+        wait_s = delay * jitter
+        self.next_allowed_fetch_time[job_index] = now + wait_s
+        if job_index not in self.retry_job_ids:
+            self.retry_job_ids.append(job_index)
+
+    def _select_new_jobs_for_batch(
+        self,
+        submitted_new_indices_count: int,
+        total_jobs: int,
+        already_in_batch: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Determine the slice of new jobs to include in the current batch based on
+        remaining capacity and unsubmitted jobs.
+
+        Note: This does NOT change submitted_new_indices_count. The original code
+        increments that counter only after submission is attempted/handled.
+        """
+        if (already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
+            num_new_to_add = min(self.batch_size - already_in_batch, total_jobs - submitted_new_indices_count)
+            start_idx = submitted_new_indices_count
+            end_idx = submitted_new_indices_count + num_new_to_add
+            new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+
+            if self.verbose:
+                logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")
+
+            return new_job_indices, submitted_new_indices_count
+
+        return [], submitted_new_indices_count
+
+    def _submit_new_jobs_async(
+        self,
+        current_batch_new_job_indices: List[str],
+        current_batch_job_indices: List[str],
+        submitted_new_indices_count: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Initiate asynchronous submission for the new jobs selected for this batch.
+
+        Mirrors the original inline submission block, including error handling and
+        fail_on_submit_error semantics. Returns potentially updated batch indices and
+        submitted count.
+        """
+        if not current_batch_new_job_indices:
+            return current_batch_job_indices, submitted_new_indices_count
+
+        if not self.job_queue_id:
+            error_msg = "Cannot submit new jobs: job_queue_id is not set."
+            logger.error(error_msg)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise ValueError(error_msg)
+            return current_batch_job_indices, submitted_new_indices_count
+
+        try:
+            # Fire-and-forget submission initiation
+            _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
+            # Add successfully initiated jobs to the overall batch list
+            current_batch_job_indices.extend(current_batch_new_job_indices)
+            # Stagger the first fetch attempt slightly to avoid immediate 202s
+            now = time.time()
+            for job_index in current_batch_new_job_indices:
+                allowed_at = self.next_allowed_fetch_time.get(job_index, 0.0)
+                self.next_allowed_fetch_time[job_index] = max(allowed_at, now + float(self.initial_fetch_delay))
+            # Update count of total initiated jobs
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            return current_batch_job_indices, submitted_new_indices_count
+        except Exception as e:
+            error_msg = (
+                f"Batch async submission initiation failed for {len(current_batch_new_job_indices)} new jobs: {e}"
+            )
+            logger.error(error_msg, exc_info=True)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(
+                    job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                )
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise RuntimeError(error_msg) from e
+            return current_batch_job_indices, submitted_new_indices_count
+
+    def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
+        """
+        Initiate fetching for the prepared batch and ensure consistency of returned futures.
+
+        Returns
+        -------
+        batch_futures_dict : Dict[Future, str]
+            Mapping of futures to their associated job indices.
+        normalized_job_indices : List[str]
+            The job indices normalized to those actually returned by the client if a discrepancy occurs.
+        """
+        # Filter indices by next_allowed_fetch_time to respect pacing for new jobs
+        now = time.time()
+        eligible_indices: List[str] = []
+        deferred_indices: List[str] = []
+        for idx in current_batch_job_indices:
+            if self.next_allowed_fetch_time.get(idx, 0.0) <= now:
+                eligible_indices.append(idx)
+            else:
+                deferred_indices.append(idx)
+
+        # Defer ineligible jobs for later retry window
+        for idx in deferred_indices:
+            if idx not in self.retry_job_ids:
+                self.retry_job_ids.append(idx)
+
+        if self.verbose:
+            logger.debug(
+                f"Calling fetch_job_result_async for {len(eligible_indices)} eligible jobs "
+                f"(deferred {len(deferred_indices)})."
+            )
+        # Use data_only=False to get full response for callback/results
+        batch_futures_dict = (
+            self.client.fetch_job_result_async(eligible_indices, data_only=False) if eligible_indices else {}
+        )
+
+        # Check for discrepancies where client might not return all futures
+        if eligible_indices and (len(batch_futures_dict) != len(eligible_indices)):
+            returned_indices = set(batch_futures_dict.values())
+            missing_indices = [idx for idx in eligible_indices if idx not in returned_indices]
+            logger.error(
+                f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
+                f"{len(batch_futures_dict)}. Missing: {missing_indices}"
+            )
+            # Fail the missing ones explicitly
+            for missing_idx in missing_indices:
+                self._handle_processing_failure(
+                    missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
+                )
+            if self.fail_on_submit_error:
+                raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
+            # Continue processing only the futures we received
+            normalized_job_indices = list(returned_indices)
+        else:
+            normalized_job_indices = list(eligible_indices)
+
+        return batch_futures_dict, normalized_job_indices
+
+    def _process_batch_futures(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
+        """
+        Process the batch futures as they complete, handling success, 202-timeout retries,
+        and failures according to existing logic.
+        """
+        if not batch_futures_dict:
+            if self.verbose:
+                logger.debug("No futures returned/available for processing in this batch.")
+            return
+
+        try:
+            for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
+                job_index = batch_futures_dict[future]
+                try:
+                    # Expect list with one tuple: [(data, index, trace)]
+                    result_list = future.result()
+                    if not isinstance(result_list, list) or len(result_list) != 1:
+                        raise ValueError(f"Expected list length 1, got {len(result_list)}")
+
+                    result_tuple = result_list[0]
+                    if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
+                        raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
+
+                    full_response_dict, fetched_job_index, trace_id = result_tuple
+
+                    if fetched_job_index != job_index:
+                        logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
+
+                    self._handle_processing_success(job_index, full_response_dict, trace_id)
+
+                except TimeoutError:
+                    # Handle job not ready - check retry policy and schedule paced retry
+                    self.retry_counts[job_index] += 1
+                    if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
+                        if self.verbose:
+                            logger.info(
+                                f"Job {job_index} not ready, scheduling paced retry (Attempt "
+                                f"{self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
+                            )
+                        self._schedule_retry(job_index)
+                    else:
+                        error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
+                        logger.error(error_msg)
+                        self._handle_processing_failure(job_index, error_msg)
+
+                except (ValueError, RuntimeError) as e:
+                    logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
+                    self._handle_processing_failure(job_index, f"Error processing result: {e}")
+                except Exception as e:
+                    logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
+                    self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
+
+        except TimeoutError:
+            self._handle_batch_timeout(batch_futures_dict, batch_timeout)
+
+    def _handle_batch_timeout(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
+        """
+        Handle a timeout while waiting for batch futures, mirroring the original behavior.
+        """
+        logger.error(
+            f"Batch processing timed out after {batch_timeout}s waiting for futures. "
+            "Some jobs in batch may be lost or incomplete."
+        )
+        remaining_indices_in_batch = []
+        for f, idx in batch_futures_dict.items():
+            if not f.done():
+                remaining_indices_in_batch.append(idx)
+                f.cancel()  # Attempt to cancel underlying task
+        logger.warning(f"Jobs potentially lost/cancelled due to batch timeout: {remaining_indices_in_batch}")
+        for idx in remaining_indices_in_batch:
+            self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
+
     def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
         """
         Executes the main processing loop in batches.
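`_schedule_retry` paces fetch retries with capped exponential backoff plus roughly ±20% jitter. A standalone sketch of the delay schedule it produces, with the constants copied from the diff (the helper name `retry_wait_seconds` is just for illustration):

```python
import random

RETRY_BACKOFF_CAP = 5.0  # mirrors self._retry_backoff_cap in the diff


def retry_wait_seconds(attempt: int, retry_delay: float = 0.5) -> float:
    """Wait before fetch attempt `attempt` (1-based), as computed in _schedule_retry."""
    base = max(0.01, retry_delay)
    delay = min(base * (2 ** (attempt - 1)), RETRY_BACKOFF_CAP)
    return delay * random.uniform(0.8, 1.2)  # +/-20% jitter to avoid thundering herds


# With the new default retry_delay=0.5 the pre-jitter schedule is
# 0.5s, 1s, 2s, 4s, then capped at 5s for every later attempt.
for attempt in range(1, 7):
    print(attempt, round(retry_wait_seconds(attempt), 2))
```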
@@ -315,123 +578,66 @@ class _ConcurrentProcessor:
         initiation error occurs.
         """
         total_jobs = len(self.all_job_indices_list)
-        # Tracks indices for which submission has been initiated at least once
-        submitted_new_indices_count = 0
+        submitted_new_indices_count = 0  # Tracks indices for which submission has been initiated at least once
 
-        logger.info(f"Starting batch processing for {total_jobs} jobs with batch " f"size {self.batch_size}.")
+        logger.info(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
 
-        # Main loop: continues as long as there are new jobs to submit
-        # or jobs waiting for retry.
         while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:
 
-            # --- Determine Jobs for Current Batch ---
-            current_batch_job_indices: List[str] = []
+            # 1) Collect retries intended for this batch
+            current_batch_job_indices: List[str] = self._collect_retry_jobs_for_batch()
 
-            # Add retries from the previous batch first
-            if self.retry_job_ids:
-                num_retries = len(self.retry_job_ids)
-                current_batch_job_indices.extend(self.retry_job_ids)
-                if self.verbose:
-                    logger.debug(f"Adding {num_retries} retry jobs to current batch.")
-                # Clear the list; retries for *this* batch will be collected later
-                self.retry_job_ids = []
-
-            # Determine and add new jobs to the batch
-            num_already_in_batch = len(current_batch_job_indices)
-            if (num_already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
-                num_new_to_add = min(self.batch_size - num_already_in_batch, total_jobs - submitted_new_indices_count)
-                start_idx = submitted_new_indices_count
-                end_idx = submitted_new_indices_count + num_new_to_add
-                current_batch_new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+            # 2) Select new jobs to fill the batch capacity
+            current_batch_new_job_indices, submitted_new_indices_count = self._select_new_jobs_for_batch(
+                submitted_new_indices_count=submitted_new_indices_count,
+                total_jobs=total_jobs,
+                already_in_batch=len(current_batch_job_indices),
+            )
 
-                if self.verbose:
-                    logger.debug(f"Adding {len(current_batch_new_job_indices)} new " f"jobs to current batch.")
+            # 3) Initiate async submission for the selected new jobs
+            try:
+                current_batch_job_indices, submitted_new_indices_count = self._submit_new_jobs_async(
+                    current_batch_new_job_indices,
+                    current_batch_job_indices,
+                    submitted_new_indices_count,
+                )
+            except Exception as e:  # noqa: F841
+                # Preserve original fail-on-submit behavior
+                # (errors already logged and failures recorded inside helper)
+                if self.fail_on_submit_error:
+                    raise
 
-            # Initiate async submission for ONLY the NEW jobs
-            if current_batch_new_job_indices:
-                if not self.job_queue_id:
-                    error_msg = "Cannot submit new jobs: job_queue_id is not set."
-                    logger.error(error_msg)
-                    # Fail these jobs immediately
-                    for job_index in current_batch_new_job_indices:
-                        self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
-                    # Mark as "submitted" (to prevent reprocessing) but failed
-                    submitted_new_indices_count += len(current_batch_new_job_indices)
-                    if self.fail_on_submit_error:
-                        raise ValueError(error_msg)
-                else:
-                    try:
-                        # Fire-and-forget submission initiation
-                        _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
-                        # Add successfully initiated jobs to the overall batch list
-                        current_batch_job_indices.extend(current_batch_new_job_indices)
-                        # Update count of total initiated jobs
-                        submitted_new_indices_count += len(current_batch_new_job_indices)
-                    except Exception as e:
-                        error_msg = (
-                            f"Batch async submission initiation failed for "
-                            f"{len(current_batch_new_job_indices)} new jobs: {e}"
-                        )
-                        logger.error(error_msg, exc_info=True)
-                        # Fail these jobs immediately
-                        for job_index in current_batch_new_job_indices:
-                            self._handle_processing_failure(
-                                job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
-                            )
-                        # Mark as "submitted" (to prevent reprocessing) but failed
-                        submitted_new_indices_count += len(current_batch_new_job_indices)
-                        if self.fail_on_submit_error:
-                            raise RuntimeError(error_msg) from e
-
-            # If nothing ended up in the batch (e.g., only submission failures)
+            # 4) If no jobs to fetch this cycle, decide whether to exit or continue
             if not current_batch_job_indices:
                 if self.verbose:
                     logger.debug("No jobs identified for fetching in this batch iteration.")
-                # If there are no retries pending either, break the loop
                 if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
                     logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
                     break
-                continue  # Otherwise, proceed to next iteration
-
-            # --- Initiate Fetching for the Current Batch ---
+                # If retries remain but are not yet eligible, sleep until earliest allowed
+                if self.retry_job_ids:
+                    now = time.time()
+                    future_times = [self.next_allowed_fetch_time.get(j, now) for j in self.retry_job_ids]
+                    # Consider only times in the future
+                    future_times = [t for t in future_times if t > now]
+                    if future_times:
+                        sleep_for = min(max(min(future_times) - now, 0.05), 1.0)
+                        if self.verbose:
+                            logger.debug(f"Pacing retries: sleeping {sleep_for:.2f}s waiting for next allowed fetch.")
+                        time.sleep(sleep_for)
+                continue
+
+            # 5) Initiate fetching for the current batch
             try:
-                if self.verbose:
-                    logger.debug(
-                        f"Calling fetch_job_result_async for "
-                        f"{len(current_batch_job_indices)} jobs in current batch."
-                    )
-                # Use data_only=False to get full response for callback/results
-                batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
-
-                # Check for discrepancies where client might not return all futures
-                if len(batch_futures_dict) != len(current_batch_job_indices):
-                    returned_indices = set(batch_futures_dict.values())
-                    missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
-                    logger.error(
-                        f"fetch_job_result_async discrepancy: Expected "
-                        f"{len(current_batch_job_indices)}, got "
-                        f"{len(batch_futures_dict)}. Missing: {missing_indices}"
-                    )
-                    # Fail the missing ones explicitly
-                    for missing_idx in missing_indices:
-                        self._handle_processing_failure(
-                            missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
-                        )
-                    if self.fail_on_submit_error:
-                        raise RuntimeError("fetch_job_result_async failed to return all " "expected futures.")
-                    # Continue processing only the futures we received
-                    current_batch_job_indices = list(returned_indices)
-
+                batch_futures_dict, _ = self._initiate_fetch_for_batch(current_batch_job_indices)
             except Exception as fetch_init_err:
                 error_msg = (
-                    f"fetch_job_result_async failed for batch "
-                    f"({len(current_batch_job_indices)} jobs): {fetch_init_err}"
+                    f"fetch_job_result_async failed for batch ({len(current_batch_job_indices)} jobs): {fetch_init_err}"
                 )
                 logger.error(error_msg, exc_info=True)
                 logger.warning(
-                    f"Marking all {len(current_batch_job_indices)} jobs in " f"failed fetch initiation batch as failed."
+                    f"Marking all {len(current_batch_job_indices)} jobs in failed fetch initiation batch as failed."
                 )
-                # Fail all jobs intended for this batch
                 for job_index in current_batch_job_indices:
                     self._handle_processing_failure(
                         job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
@@ -440,79 +646,11 @@ class _ConcurrentProcessor:
                     raise RuntimeError(
                         f"Stopping due to fetch initiation failure: {fetch_init_err}"
                     ) from fetch_init_err
-                continue  # Skip processing results for this failed batch
+                continue
 
-            # --- Process Results for the Current Batch ---
-            if not batch_futures_dict:
-                if self.verbose:
-                    logger.debug("No futures returned/available for processing in this batch.")
-                continue  # Skip processing if no futures
-
-            batch_timeout = 600.0  # Timeout for waiting on the whole batch
-            try:
-                # Process futures as they complete within this batch
-                for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
-                    job_index = batch_futures_dict[future]
-                    try:
-                        # Expect list with one tuple: [(data, index, trace)]
-                        result_list = future.result()
-                        if not isinstance(result_list, list) or len(result_list) != 1:
-                            raise ValueError(f"Expected list length 1, got {len(result_list)}")
-
-                        result_tuple = result_list[0]
-                        if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
-                            raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
-
-                        full_response_dict, fetched_job_index, trace_id = result_tuple
-
-                        if fetched_job_index != job_index:
-                            logger.warning(f"Mismatch: Future for {job_index} returned " f"{fetched_job_index}")
-
-                        self._handle_processing_success(job_index, full_response_dict, trace_id)
-
-                    except TimeoutError:
-                        # Handle job not ready - check retry policy
-                        self.retry_counts[job_index] += 1
-                        if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
-                            if self.verbose:
-                                logger.info(
-                                    f"Job {job_index} not ready, adding to next "
-                                    f"batch's retry list (Attempt "
-                                    f"{self.retry_counts[job_index]}/"
-                                    f"{self.max_job_retries or 'inf'})."
-                                )
-                            # Collect for the *next* batch
-                            self.retry_job_ids.append(job_index)
-                        else:
-                            error_msg = f"Exceeded max fetch retries " f"({self.max_job_retries}) for job {job_index}."
-                            logger.error(error_msg)
-                            self._handle_processing_failure(job_index, error_msg)
-
-                    except (ValueError, RuntimeError) as e:
-                        logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
-                        self._handle_processing_failure(job_index, f"Error processing result: {e}")
-                    except Exception as e:
-                        logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
-                        self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
-                    # No finally block incrementing count here; tracking is batch-based
-
-            except TimeoutError:
-                # `as_completed` timed out waiting for remaining futures in batch
-                logger.error(
-                    f"Batch processing timed out after {batch_timeout}s waiting "
-                    f"for futures. Some jobs in batch may be lost or incomplete."
-                )
-                # Identify and fail remaining futures
-                remaining_indices_in_batch = []
-                for f, idx in batch_futures_dict.items():
-                    if not f.done():
-                        remaining_indices_in_batch.append(idx)
-                        f.cancel()  # Attempt to cancel underlying task
-                logger.warning(
-                    f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
-                )
-                for idx in remaining_indices_in_batch:
-                    self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
+            # 6) Process results for the current batch
+            batch_timeout = 600.0
+            self._process_batch_futures(batch_futures_dict, batch_timeout)
             # End of processing for this batch cycle
 
         # --- Final Logging ---
@@ -576,7 +714,15 @@ class NvIngestClient:
         # Initialize the worker pool with the specified size
         self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)
 
+        # Telemetry state and controls
+        self._telemetry_lock = threading.Lock()
+        self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
+        try:
+            self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
+        except ValueError:
+            self._telemetry_max_calls = 10000
         self._telemetry = {}
+        self.reset_telemetry()
 
     def __str__(self) -> str:
         """
@@ -624,6 +770,106 @@ class NvIngestClient:
 
         return job_state
 
+    # ------------------------------------------------------------------
+    # Telemetry helpers
+    # ------------------------------------------------------------------
+
+    def enable_telemetry(self, enabled: bool) -> None:
+        with self._telemetry_lock:
+            self._telemetry_enabled = bool(enabled)
+
+    def reset_telemetry(self) -> None:
+        with self._telemetry_lock:
+            self._telemetry = {
+                "started_at": time.time(),
+                "submit": {"count": 0, "calls": []},
+                "fetch": {"count": 0, "last_ts": None, "intervals": [], "calls": []},
+                "per_job": {},
+            }
+
+    def _t_per_job(self, job_index: str) -> Dict[str, Any]:
+        pj = self._telemetry["per_job"].get(job_index)
+        if pj is None:
+            pj = {"submits": [], "fetch_attempts": [], "timeouts_202": 0, "failures": 0, "first_success_ts": None}
+            self._telemetry["per_job"][job_index] = pj
+        return pj
+
+    def _t_append_capped(self, arr: List[Any], item: Any) -> None:
+        if len(arr) < self._telemetry_max_calls:
+            arr.append(item)
+
+    def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["submit"]["count"] += 1
+            self._t_append_capped(
+                self._telemetry["submit"]["calls"],
+                {"job": job_index, "status": status, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["submits"], ts)
+
+    def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["fetch"]["count"] += 1
+            last = self._telemetry["fetch"]["last_ts"]
+            if last is not None:
+                delta = ts - float(last)
+                if delta >= 0:
+                    self._t_append_capped(self._telemetry["fetch"]["intervals"], delta)
+            self._telemetry["fetch"]["last_ts"] = ts
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["fetch_attempts"], ts)
+
+    def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._t_append_capped(
+                self._telemetry["fetch"]["calls"],
+                {"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            if code == 2:  # 202 not ready
+                pj["timeouts_202"] += 1
+            if ok and pj["first_success_ts"] is None:
+                pj["first_success_ts"] = ts
+            if not ok and code not in (0, 2):
+                pj["failures"] += 1
+
+    def get_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            return copy.deepcopy(self._telemetry)
+
+    def summarize_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            submit_count = self._telemetry["submit"]["count"]
+            fetch_count = self._telemetry["fetch"]["count"]
+            intervals = list(self._telemetry["fetch"]["intervals"])
+            intervals.sort()
+            avg = mean(intervals) if intervals else 0.0
+            p50 = median(intervals) if intervals else 0.0
+            # p95 via index
+            p95 = intervals[int(0.95 * (len(intervals) - 1))] if intervals else 0.0
+            per_job = self._telemetry["per_job"]
+            # Aggregate per-job stats
+            jobs = len(per_job)
+            total_timeouts = sum(pj.get("timeouts_202", 0) for pj in per_job.values())
+            total_failures = sum(pj.get("failures", 0) for pj in per_job.values())
+            return {
+                "submit_count": submit_count,
+                "fetch_count": fetch_count,
+                "fetch_interval_avg": avg,
+                "fetch_interval_p50": p50,
+                "fetch_interval_p95": p95,
+                "jobs_tracked": jobs,
+                "timeouts_202_total": total_timeouts,
+                "failures_total": total_failures,
+            }
+
     def _get_and_check_job_state(
         self,
         job_index: str,
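How a caller might read the new telemetry, as a sketch: collection is on by default and can be disabled at startup with `NV_INGEST_CLIENT_TELEMETRY=0`, while `enable_telemetry`/`reset_telemetry` control it at runtime. The constructor arguments below are elided and assumed to default sensibly; see `NvIngestClient.__init__` for the real signature.

```python
# Sketch: inspecting client-side telemetry after a run.
client = NvIngestClient()  # construction args elided (assumption)

client.reset_telemetry()               # fresh counters and call logs
# ... submit and fetch jobs ...

summary = client.summarize_telemetry()
print(summary["submit_count"], summary["fetch_count"])
print(summary["fetch_interval_p95"])   # p95 gap between fetch attempts, seconds
print(summary["timeouts_202_total"])   # fetch attempts that saw "job not ready"

raw = client.get_telemetry()           # deep copy of the full telemetry state
first_job = next(iter(raw["per_job"].values()), None)

client.enable_telemetry(False)         # stop recording without clearing state
```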
@@ -861,6 +1107,8 @@ class NvIngestClient:
         Exception
             For unexpected issues.
         """
+        ts_attempt = time.time()
+        self._t_record_fetch_attempt(job_index, ts_attempt)
         try:
             # Get job state using the client-side index
             job_state = self._get_and_check_job_state(
@@ -901,6 +1149,7 @@ class NvIngestClient:
            logger.debug(
                f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
            )
+           self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
            return result_data, job_index, job_state.trace_id
 
        except json.JSONDecodeError as err:
@@ -922,6 +1171,7 @@ class NvIngestClient:
            elif response.response_code == 2:  # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
                # Raise TimeoutError to signal the calling retry loop in fetch_job_result
                # Do not change job state here, remains SUBMITTED
+               self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
                raise TimeoutError(f"Job not ready: {response.response_reason}")
 
            else:
@@ -934,6 +1184,7 @@ class NvIngestClient:
                job_state.state = JobStateEnum.FAILED  # Mark job as failed in the client
                # Do NOT pop the state for failed jobs here
                # Raise RuntimeError to indicate a terminal failure for this fetch attempt
+               self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
                raise RuntimeError(error_msg)
 
        except (TimeoutError, ValueError, RuntimeError):
@@ -945,6 +1196,10 @@ class NvIngestClient:
            # Attempt to mark state as FAILED if possible and state object exists
            if "job_state" in locals() and hasattr(job_state, "state"):
                job_state.state = JobStateEnum.FAILED
+           try:
+               self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
+           except Exception:
+               pass
            raise  # Re-raise the original exception
 
    def fetch_job_result_cli(
  def fetch_job_result_cli(
@@ -1019,12 +1274,14 @@ class NvIngestClient:
1019
1274
  concurrency_limit: int = 64,
1020
1275
  timeout: int = 100,
1021
1276
  max_job_retries: Optional[int] = None,
1022
- retry_delay: float = 5.0,
1277
+ retry_delay: float = 0.5,
1278
+ initial_fetch_delay: float = 0.3,
1023
1279
  fail_on_submit_error: bool = False,
1024
1280
  completion_callback: Optional[Callable[[Any, str], None]] = None,
1025
1281
  return_failures: bool = False,
1026
1282
  data_only: bool = True,
1027
1283
  stream_to_callback_only: bool = False,
1284
+ return_full_response: bool = False,
1028
1285
  verbose: bool = False,
1029
1286
  ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
1030
1287
  """
@@ -1056,6 +1313,9 @@ class NvIngestClient:
             If True, return (results, failures). Default is False.
         data_only : bool, optional
             If True, return only payload 'data'. Default is True.
+        return_full_response : bool, optional
+            If True, results contain the full response envelopes (including 'trace' and 'annotations').
+            Ignored when stream_to_callback_only=True. Default is False.
         verbose : bool, optional
             If True, enable debug logging. Default is False.
 
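A sketch of calling the concurrent-processing entry point with the new knobs. The enclosing method name (`process_jobs_concurrently`) and the job/queue arguments are assumptions; this hunk only shows the signature fragment being changed.

```python
# Sketch under the assumptions above: `job_indices` would come from prior
# client.add_job(...) calls, and the queue name is illustrative.
results, failures = client.process_jobs_concurrently(
    job_indices,                       # assumed: client-side job indices
    job_queue_id="ingest_task_queue",  # assumed queue name
    max_job_retries=None,              # retry "not ready" fetches indefinitely
    retry_delay=0.5,                   # new default; base of the backoff schedule
    initial_fetch_delay=0.3,           # delay the first fetch to avoid immediate 202s
    return_failures=True,
    return_full_response=True,         # entries keep 'trace' and 'annotations'
)
```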
@@ -1082,8 +1342,8 @@ class NvIngestClient:
         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)
 
-        # Prepare timeout tuple for fetch calls
-        effective_timeout: Tuple[int, None] = (timeout, None)
+        # Prepare timeout tuple for fetch calls (enable long-poll): (connect<=5s, read=timeout)
+        effective_timeout: Tuple[int, int] = (min(5, int(timeout)), int(timeout))
 
         # Delegate to the concurrent processor
         processor = _ConcurrentProcessor(
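The `(connect, read)` pair here appears to follow the requests-style timeout convention: the first element bounds connection establishment, the second bounds the wait for a response, which is what lets the server hold the request open (long-poll). A small sketch of the values produced, under that assumption:

```python
from typing import Tuple


def effective_timeout(timeout: int) -> Tuple[int, int]:
    # As in the diff: cap connect at 5s, let the read phase long-poll
    # for the full user-supplied timeout.
    return (min(5, int(timeout)), int(timeout))


print(effective_timeout(100))  # (5, 100): connect within 5s, wait up to 100s for data
print(effective_timeout(3))    # (3, 3): short timeouts apply to both phases
```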
@@ -1093,9 +1353,12 @@ class NvIngestClient:
             job_queue_id=job_queue_id,
             timeout=effective_timeout,
             max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
             completion_callback=completion_callback,
             fail_on_submit_error=fail_on_submit_error,
             stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
             verbose=verbose,
         )
 
@@ -1207,12 +1470,19 @@ class NvIngestClient:
            # Free up memory -- payload should never be used again, and we don't want to keep it around.
            job_state.job_spec.payload = None
 
+           try:
+               self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
+           except Exception:
+               pass
            return x_trace_id
        except Exception as err:
            err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
            logger.exception(err_msg)
            job_state.state = JobStateEnum.FAILED
-
+           try:
+               self._t_record_submit(job_index, "fail", time.time(), None)
+           except Exception:
+               pass
            raise
 
    def submit_job(