nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38) hide show
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,11 @@ import concurrent
8
8
  import json
9
9
  import logging
10
10
  import math
11
+ import os
11
12
  import time
13
+ import threading
14
+ import copy
15
+ from statistics import mean, median
12
16
  from collections import defaultdict
13
17
  from concurrent.futures import Future
14
18
  from concurrent.futures import ThreadPoolExecutor
@@ -31,11 +35,59 @@ from nv_ingest_client.primitives.tasks import TaskType
31
35
  from nv_ingest_client.primitives.tasks import is_valid_task_type
32
36
  from nv_ingest_client.primitives.tasks import task_factory
33
37
  from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
34
- from nv_ingest_client.util.util import create_job_specs_for_batch, check_ingest_result
38
+ from nv_ingest_client.util.util import (
39
+ create_job_specs_for_batch,
40
+ check_ingest_result,
41
+ apply_pdf_split_config_to_job_specs,
42
+ )
35
43
 
36
44
  logger = logging.getLogger(__name__)
37
45
 
38
46
 
47
+ def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
48
+ """
49
+ Compute resident_time entries from entry/exit pairs if not already present.
50
+
51
+ This ensures consistency between split jobs (where server computes resident_time)
52
+ and non-split jobs (where we compute it client-side).
53
+
54
+ Parameters
55
+ ----------
56
+ trace_dict : Dict[str, Any]
57
+ Trace dictionary with entry/exit pairs
58
+
59
+ Returns
60
+ -------
61
+ Dict[str, Any]
62
+ Trace dictionary with resident_time entries added
63
+ """
64
+ if not trace_dict or not isinstance(trace_dict, dict):
65
+ return trace_dict
66
+
67
+ # Check if resident_time already exists (server-computed for split jobs)
68
+ has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
69
+ if has_resident:
70
+ return trace_dict # Already computed by server
71
+
72
+ # Compute resident_time from entry/exit pairs
73
+ result = dict(trace_dict)
74
+ stages = set()
75
+
76
+ # Find all unique stages
77
+ for key in trace_dict:
78
+ if key.startswith("trace::entry::"):
79
+ stages.add(key.replace("trace::entry::", ""))
80
+
81
+ # Compute resident_time for each stage
82
+ for stage in stages:
83
+ entry_key = f"trace::entry::{stage}"
84
+ exit_key = f"trace::exit::{stage}"
85
+ if entry_key in trace_dict and exit_key in trace_dict:
86
+ result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
87
+
88
+ return result
89
+
90
+
39
91
  class DataDecodeException(Exception):
40
92
  """
41
93
  Exception raised for errors in decoding data.
@@ -56,15 +108,12 @@ class DataDecodeException(Exception):
56
108
 
57
109
  class _ConcurrentProcessor:
58
110
  """
59
- Manages the asynchronous submission and result fetching of jobs using a
60
- client's public methods, mirroring the batching structure of the CLI path.
61
-
62
- This processor takes a list of pre-created job indices, submits them in
63
- batches via the client's `submit_job_async`, and then fetches results
64
- for each batch using `fetch_job_result_async`. It processes results as
65
- they become available within the batch using `as_completed`. Retries due
66
- to job readiness timeouts are handled by adding the job index to the next
67
- processing batch.
111
+ Manages asynchronous submission and result fetching while keeping a steady
112
+ pool of up to `batch_size` in-flight jobs:
113
+ - Retries (202/TimeoutError) are re-queued immediately.
114
+ - New jobs are submitted as capacity frees up.
115
+ - Fetches are started for jobs added each cycle.
116
+ - We always attempt to keep the executor saturated up to `batch_size`.
68
117
  """
69
118
 
70
119
  def __init__(
@@ -75,10 +124,14 @@ class _ConcurrentProcessor:
75
124
  batch_size: int,
76
125
  timeout: Tuple[int, Union[float, None]],
77
126
  max_job_retries: Optional[int],
127
+ retry_delay: float,
128
+ initial_fetch_delay: float,
78
129
  completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
79
130
  fail_on_submit_error: bool,
80
131
  stream_to_callback_only: bool,
132
+ return_full_response: bool,
81
133
  verbose: bool = False,
134
+ return_traces: bool = False,
82
135
  ):
83
136
  """
84
137
  Initializes the concurrent processor.
@@ -112,6 +165,8 @@ class _ConcurrentProcessor:
112
165
  initiating job submission or fetching fails for a batch.
113
166
  verbose : bool, optional
114
167
  If True, enables detailed debug logging. Default is False.
168
+ return_traces : bool, optional
169
+ If True, parent-level trace data for each completed job is stored.
115
170
 
116
171
  Raises
117
172
  ------
@@ -127,16 +182,21 @@ class _ConcurrentProcessor:
127
182
  self.batch_size = batch_size
128
183
  self.timeout = timeout
129
184
  self.max_job_retries = max_job_retries
185
+ self.retry_delay = retry_delay
186
+ self.initial_fetch_delay = initial_fetch_delay
130
187
  self.completion_callback = completion_callback
131
188
  self.fail_on_submit_error = fail_on_submit_error
132
189
  self.stream_to_callback_only = stream_to_callback_only
190
+ self.return_full_response = return_full_response
133
191
  self.verbose = verbose
192
+ self.return_traces = return_traces
134
193
 
135
194
  # State variables managed across batch cycles
136
195
  self.retry_job_ids: List[str] = []
137
196
  self.retry_counts: Dict[str, int] = defaultdict(int)
138
197
  self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
139
198
  self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
199
+ self.traces: List[Optional[Dict[str, Any]]] = []
140
200
 
141
201
  # --- Initial Checks ---
142
202
  if not self.job_queue_id:
@@ -224,13 +284,25 @@ class _ConcurrentProcessor:
224
284
 
225
285
  is_failed, description = check_ingest_result(result_data)
226
286
 
287
+ if trace_id:
288
+ self.client.register_parent_trace_id(trace_id)
289
+
227
290
  if is_failed:
228
291
  failed_job_spec = self.client._job_index_to_job_spec.get(job_index)
229
292
  self.failures.append((f"{job_index}:{failed_job_spec.source_id}", description))
230
293
  elif self.stream_to_callback_only:
231
294
  self.results.append(job_index)
232
295
  else:
233
- self.results.append(result_data.get("data"))
296
+ # When requested, return the full response envelope (includes 'trace' and 'annotations')
297
+ self.results.append(result_data if self.return_full_response else result_data.get("data"))
298
+
299
+ # Extract trace data for all successful (non-failed) jobs
300
+ if self.return_traces and not is_failed:
301
+ trace_payload = result_data.get("trace") if result_data else None
302
+ # Compute resident_time if not already present (for consistency)
303
+ if trace_payload:
304
+ trace_payload = _compute_resident_times(trace_payload)
305
+ self.traces.append(trace_payload if trace_payload else None)
234
306
 
235
307
  # Cleanup retry count if it exists
236
308
  if job_index in self.retry_counts:
@@ -274,21 +346,156 @@ class _ConcurrentProcessor:
274
346
  except Exception:
275
347
  logger.warning("Could not reliably extract job indices from results for final check.")
276
348
 
277
- initial_indices = set(self.all_job_indices_list)
278
- unaccounted_indices = initial_indices - processed_indices
279
-
280
- if unaccounted_indices:
281
- logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
282
- # Optionally add them to failures
283
- # for idx in unaccounted_indices:
284
- # if not any(f[0] == idx for f in self.failures):
285
- # self.failures.append((idx, "Job lost or unaccounted for at exit"))
286
-
287
349
  # --------------------------------------------------------------------------
288
- # Public Methods
350
+ # Declarative Helper Methods (behavior preserved)
289
351
  # --------------------------------------------------------------------------
290
352
 
291
- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
353
+ def _collect_retry_jobs_for_batch(self) -> List[str]:
354
+ """
355
+ Collect retry jobs for this batch, mirroring handler behavior (no pacing filter).
356
+
357
+ Returns
358
+ -------
359
+ List[str]
360
+ The list of job indices that should be retried in this batch.
361
+ """
362
+ if not self.retry_job_ids:
363
+ return []
364
+
365
+ # Take all retries this cycle and clear the list (handler resets per-iteration)
366
+ eligible: List[str] = list(self.retry_job_ids)
367
+ self.retry_job_ids = []
368
+ if eligible and self.verbose:
369
+ logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
370
+ return eligible
371
+
372
+ def _schedule_retry(self, job_index: str) -> None:
373
+ """
374
+ Schedule an immediate retry for a job (no pacing), mirroring handler behavior.
375
+ """
376
+ if job_index not in self.retry_job_ids:
377
+ self.retry_job_ids.append(job_index)
378
+
379
+ def _select_new_jobs_for_batch(
380
+ self,
381
+ submitted_new_indices_count: int,
382
+ total_jobs: int,
383
+ already_in_batch: int,
384
+ ) -> Tuple[List[str], int]:
385
+ """
386
+ Determine the slice of new jobs to include in the current batch based on
387
+ remaining capacity and unsubmitted jobs.
388
+
389
+ Note: This does NOT change submitted_new_indices_count. The original code
390
+ increments that counter only after submission is attempted/handled.
391
+ """
392
+ if (already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
393
+ num_new_to_add = min(self.batch_size - already_in_batch, total_jobs - submitted_new_indices_count)
394
+ start_idx = submitted_new_indices_count
395
+ end_idx = submitted_new_indices_count + num_new_to_add
396
+ new_job_indices = self.all_job_indices_list[start_idx:end_idx]
397
+
398
+ if self.verbose:
399
+ logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")
400
+
401
+ return new_job_indices, submitted_new_indices_count
402
+
403
+ return [], submitted_new_indices_count
404
+
405
+ def _submit_new_jobs_async(
406
+ self,
407
+ current_batch_new_job_indices: List[str],
408
+ current_batch_job_indices: List[str],
409
+ submitted_new_indices_count: int,
410
+ ) -> Tuple[List[str], int]:
411
+ """
412
+ Initiate asynchronous submission for the new jobs selected for this batch.
413
+
414
+ Mirrors the original inline submission block, including error handling and
415
+ fail_on_submit_error semantics. Returns potentially updated batch indices and
416
+ submitted count.
417
+ """
418
+ if not current_batch_new_job_indices:
419
+ return current_batch_job_indices, submitted_new_indices_count
420
+
421
+ if not self.job_queue_id:
422
+ error_msg = "Cannot submit new jobs: job_queue_id is not set."
423
+ logger.error(error_msg)
424
+ # Fail these jobs immediately
425
+ for job_index in current_batch_new_job_indices:
426
+ self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
427
+ # Mark as "submitted" (to prevent reprocessing) but failed
428
+ submitted_new_indices_count += len(current_batch_new_job_indices)
429
+ if self.fail_on_submit_error:
430
+ raise ValueError(error_msg)
431
+ return current_batch_job_indices, submitted_new_indices_count
432
+
433
+ try:
434
+ # Fire-and-forget submission initiation
435
+ _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
436
+ # Add successfully initiated jobs to the overall batch list
437
+ current_batch_job_indices.extend(current_batch_new_job_indices)
438
+ # Update count of total initiated jobs
439
+ submitted_new_indices_count += len(current_batch_new_job_indices)
440
+ return current_batch_job_indices, submitted_new_indices_count
441
+ except Exception as e:
442
+ error_msg = (
443
+ f"Batch async submission initiation failed for {len(current_batch_new_job_indices)} new jobs: {e}"
444
+ )
445
+ logger.error(error_msg, exc_info=True)
446
+ # Fail these jobs immediately
447
+ for job_index in current_batch_new_job_indices:
448
+ self._handle_processing_failure(
449
+ job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
450
+ )
451
+ # Mark as "submitted" (to prevent reprocessing) but failed
452
+ submitted_new_indices_count += len(current_batch_new_job_indices)
453
+ if self.fail_on_submit_error:
454
+ raise RuntimeError(error_msg) from e
455
+ return current_batch_job_indices, submitted_new_indices_count
456
+
457
+ def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
458
+ """
459
+ Initiate fetching for the prepared batch and ensure consistency of returned futures.
460
+
461
+ Returns
462
+ -------
463
+ batch_futures_dict : Dict[Future, str]
464
+ Mapping of futures to their associated job indices.
465
+ normalized_job_indices : List[str]
466
+ The job indices normalized to those actually returned by the client if a discrepancy occurs.
467
+ """
468
+ if self.verbose:
469
+ logger.debug(f"Calling fetch_job_result_async for {len(current_batch_job_indices)} jobs.")
470
+ batch_futures_dict: Dict[Future, str] = (
471
+ self.client.fetch_job_result_async(current_batch_job_indices, data_only=False, timeout=None)
472
+ if current_batch_job_indices
473
+ else {}
474
+ )
475
+
476
+ # Check for discrepancies where client might not return all futures
477
+ if current_batch_job_indices and (len(batch_futures_dict) != len(current_batch_job_indices)):
478
+ returned_indices = set(batch_futures_dict.values())
479
+ missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
480
+ logger.error(
481
+ f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
482
+ f"{len(batch_futures_dict)}. Missing: {missing_indices}"
483
+ )
484
+ # Fail the missing ones explicitly
485
+ for missing_idx in missing_indices:
486
+ self._handle_processing_failure(
487
+ missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
488
+ )
489
+ if self.fail_on_submit_error:
490
+ raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
491
+ # Continue processing only the futures we received
492
+ normalized_job_indices = list(returned_indices)
493
+ else:
494
+ normalized_job_indices = list(current_batch_job_indices)
495
+
496
+ return batch_futures_dict, normalized_job_indices
497
+
498
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
292
499
  """
293
500
  Executes the main processing loop in batches.
294
501
 
@@ -314,210 +521,124 @@ class _ConcurrentProcessor:
314
521
  initiation error occurs.
315
522
  """
316
523
  total_jobs = len(self.all_job_indices_list)
317
- # Tracks indices for which submission has been initiated at least once
318
- submitted_new_indices_count = 0
319
-
320
- logger.info(f"Starting batch processing for {total_jobs} jobs with batch " f"size {self.batch_size}.")
321
-
322
- # Main loop: continues as long as there are new jobs to submit
323
- # or jobs waiting for retry.
324
- while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:
325
-
326
- # --- Determine Jobs for Current Batch ---
327
- current_batch_job_indices: List[str] = []
328
-
329
- # Add retries from the previous batch first
330
- if self.retry_job_ids:
331
- num_retries = len(self.retry_job_ids)
332
- current_batch_job_indices.extend(self.retry_job_ids)
333
- if self.verbose:
334
- logger.debug(f"Adding {num_retries} retry jobs to current batch.")
335
- # Clear the list; retries for *this* batch will be collected later
336
- self.retry_job_ids = []
337
-
338
- # Determine and add new jobs to the batch
339
- num_already_in_batch = len(current_batch_job_indices)
340
- if (num_already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
341
- num_new_to_add = min(self.batch_size - num_already_in_batch, total_jobs - submitted_new_indices_count)
342
- start_idx = submitted_new_indices_count
343
- end_idx = submitted_new_indices_count + num_new_to_add
344
- current_batch_new_job_indices = self.all_job_indices_list[start_idx:end_idx]
345
-
346
- if self.verbose:
347
- logger.debug(f"Adding {len(current_batch_new_job_indices)} new " f"jobs to current batch.")
348
-
349
- # Initiate async submission for ONLY the NEW jobs
350
- if current_batch_new_job_indices:
351
- if not self.job_queue_id:
352
- error_msg = "Cannot submit new jobs: job_queue_id is not set."
353
- logger.error(error_msg)
354
- # Fail these jobs immediately
355
- for job_index in current_batch_new_job_indices:
356
- self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
357
- # Mark as "submitted" (to prevent reprocessing) but failed
358
- submitted_new_indices_count += len(current_batch_new_job_indices)
359
- if self.fail_on_submit_error:
360
- raise ValueError(error_msg)
361
- else:
362
- try:
363
- # Fire-and-forget submission initiation
364
- _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
365
- # Add successfully initiated jobs to the overall batch list
366
- current_batch_job_indices.extend(current_batch_new_job_indices)
367
- # Update count of total initiated jobs
368
- submitted_new_indices_count += len(current_batch_new_job_indices)
369
- except Exception as e:
370
- error_msg = (
371
- f"Batch async submission initiation failed for "
372
- f"{len(current_batch_new_job_indices)} new jobs: {e}"
524
+ submitted_new_indices_count = 0 # Tracks indices for which submission has been initiated at least once
525
+
526
+ logger.debug(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
527
+
528
+ # Keep up to batch_size jobs in-flight at all times
529
+ inflight_futures: Dict[Future, str] = {}
530
+
531
+ while (submitted_new_indices_count < total_jobs) or self.retry_job_ids or inflight_futures:
532
+ # 1) Top up from retries first
533
+ capacity = max(0, self.batch_size - len(inflight_futures))
534
+ to_fetch: List[str] = []
535
+ if capacity > 0 and self.retry_job_ids:
536
+ take = min(capacity, len(self.retry_job_ids))
537
+ retry_now = self.retry_job_ids[:take]
538
+ self.retry_job_ids = self.retry_job_ids[take:]
539
+ to_fetch.extend(retry_now)
540
+ capacity -= len(retry_now)
541
+
542
+ # 2) Then add new jobs up to capacity
543
+ if capacity > 0 and (submitted_new_indices_count < total_jobs):
544
+ new_count = min(capacity, total_jobs - submitted_new_indices_count)
545
+ new_job_indices = self.all_job_indices_list[
546
+ submitted_new_indices_count : submitted_new_indices_count + new_count
547
+ ]
548
+
549
+ if not self.job_queue_id:
550
+ error_msg = "Cannot submit new jobs: job_queue_id is not set."
551
+ logger.error(error_msg)
552
+ for job_index in new_job_indices:
553
+ self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
554
+ submitted_new_indices_count += len(new_job_indices)
555
+ if self.fail_on_submit_error:
556
+ raise ValueError(error_msg)
557
+ else:
558
+ try:
559
+ _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
560
+ submitted_new_indices_count += len(new_job_indices)
561
+ to_fetch.extend(new_job_indices)
562
+ except Exception as e:
563
+ error_msg = f"Batch async submission initiation failed for {len(new_job_indices)} new jobs: {e}"
564
+ logger.error(error_msg, exc_info=True)
565
+ for job_index in new_job_indices:
566
+ self._handle_processing_failure(
567
+ job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
373
568
  )
374
- logger.error(error_msg, exc_info=True)
375
- # Fail these jobs immediately
376
- for job_index in current_batch_new_job_indices:
377
- self._handle_processing_failure(
378
- job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
379
- )
380
- # Mark as "submitted" (to prevent reprocessing) but failed
381
- submitted_new_indices_count += len(current_batch_new_job_indices)
382
- if self.fail_on_submit_error:
383
- raise RuntimeError(error_msg) from e
384
-
385
- # If nothing ended up in the batch (e.g., only submission failures)
386
- if not current_batch_job_indices:
387
- if self.verbose:
388
- logger.debug("No jobs identified for fetching in this batch iteration.")
389
- # If there are no retries pending either, break the loop
390
- if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
391
- logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
392
- break
393
- continue # Otherwise, proceed to next iteration
394
-
395
- # --- Initiate Fetching for the Current Batch ---
396
- try:
397
- if self.verbose:
398
- logger.debug(
399
- f"Calling fetch_job_result_async for "
400
- f"{len(current_batch_job_indices)} jobs in current batch."
401
- )
402
- # Use data_only=False to get full response for callback/results
403
- batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
569
+ submitted_new_indices_count += len(new_job_indices)
570
+ if self.fail_on_submit_error:
571
+ raise RuntimeError(error_msg) from e
404
572
 
405
- # Check for discrepancies where client might not return all futures
406
- if len(batch_futures_dict) != len(current_batch_job_indices):
407
- returned_indices = set(batch_futures_dict.values())
408
- missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
573
+ # 3) Launch fetches for the jobs we added to this cycle
574
+ if to_fetch:
575
+ try:
576
+ new_futures = self.client.fetch_job_result_async(to_fetch, data_only=False, timeout=None)
577
+ inflight_futures.update(new_futures)
578
+ except Exception as fetch_init_err:
409
579
  logger.error(
410
- f"fetch_job_result_async discrepancy: Expected "
411
- f"{len(current_batch_job_indices)}, got "
412
- f"{len(batch_futures_dict)}. Missing: {missing_indices}"
580
+ f"fetch_job_result_async failed to start for {len(to_fetch)} jobs: {fetch_init_err}",
581
+ exc_info=True,
413
582
  )
414
- # Fail the missing ones explicitly
415
- for missing_idx in missing_indices:
583
+ for job_index in to_fetch:
416
584
  self._handle_processing_failure(
417
- missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
585
+ job_index, f"Fetch initiation error: {fetch_init_err}", is_submission_failure=True
418
586
  )
419
587
  if self.fail_on_submit_error:
420
- raise RuntimeError("fetch_job_result_async failed to return all " "expected futures.")
421
- # Continue processing only the futures we received
422
- current_batch_job_indices = list(returned_indices)
423
-
424
- except Exception as fetch_init_err:
425
- error_msg = (
426
- f"fetch_job_result_async failed for batch "
427
- f"({len(current_batch_job_indices)} jobs): {fetch_init_err}"
588
+ raise RuntimeError(
589
+ f"Stopping due to fetch initiation failure: {fetch_init_err}"
590
+ ) from fetch_init_err
591
+
592
+ # 4) If nothing left anywhere, exit
593
+ if not inflight_futures and not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
594
+ logger.debug("Exiting loop: No in-flight jobs, no retries, and all jobs submitted.")
595
+ break
596
+
597
+ # 5) Wait for at least one in-flight future to complete, then process done ones
598
+ if inflight_futures:
599
+ done, _ = concurrent.futures.wait(
600
+ set(inflight_futures.keys()), return_when=concurrent.futures.FIRST_COMPLETED
428
601
  )
429
- logger.error(error_msg, exc_info=True)
430
- logger.warning(
431
- f"Marking all {len(current_batch_job_indices)} jobs in " f"failed fetch initiation batch as failed."
432
- )
433
- # Fail all jobs intended for this batch
434
- for job_index in current_batch_job_indices:
435
- self._handle_processing_failure(
436
- job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
437
- )
438
- if self.fail_on_submit_error:
439
- raise RuntimeError(
440
- f"Stopping due to fetch initiation failure: {fetch_init_err}"
441
- ) from fetch_init_err
442
- continue # Skip processing results for this failed batch
443
-
444
- # --- Process Results for the Current Batch ---
445
- if not batch_futures_dict:
446
- if self.verbose:
447
- logger.debug("No futures returned/available for processing in this batch.")
448
- continue # Skip processing if no futures
449
-
450
- batch_timeout = 600.0 # Timeout for waiting on the whole batch
451
- try:
452
- # Process futures as they complete within this batch
453
- for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
454
- job_index = batch_futures_dict[future]
602
+ for future in done:
603
+ job_index = inflight_futures.pop(future, None)
604
+ if job_index is None:
605
+ continue
455
606
  try:
456
- # Expect list with one tuple: [(data, index, trace)]
457
607
  result_list = future.result()
458
608
  if not isinstance(result_list, list) or len(result_list) != 1:
459
609
  raise ValueError(f"Expected list length 1, got {len(result_list)}")
460
-
461
610
  result_tuple = result_list[0]
462
611
  if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
463
612
  raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
464
-
465
613
  full_response_dict, fetched_job_index, trace_id = result_tuple
466
-
467
614
  if fetched_job_index != job_index:
468
- logger.warning(f"Mismatch: Future for {job_index} returned " f"{fetched_job_index}")
469
-
615
+ logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
470
616
  self._handle_processing_success(job_index, full_response_dict, trace_id)
471
-
472
617
  except TimeoutError:
473
- # Handle job not ready - check retry policy
618
+ # Not ready -> immediate retry
474
619
  self.retry_counts[job_index] += 1
475
620
  if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
476
621
  if self.verbose:
477
622
  logger.info(
478
- f"Job {job_index} not ready, adding to next "
479
- f"batch's retry list (Attempt "
480
- f"{self.retry_counts[job_index]}/"
481
- f"{self.max_job_retries or 'inf'})."
623
+ f"Job {job_index} not ready, scheduling retry "
624
+ f"(Attempt {self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
482
625
  )
483
- # Collect for the *next* batch
484
- self.retry_job_ids.append(job_index)
626
+ self._schedule_retry(job_index)
485
627
  else:
486
- error_msg = f"Exceeded max fetch retries " f"({self.max_job_retries}) for job {job_index}."
628
+ error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
487
629
  logger.error(error_msg)
488
630
  self._handle_processing_failure(job_index, error_msg)
489
-
490
631
  except (ValueError, RuntimeError) as e:
491
632
  logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
492
633
  self._handle_processing_failure(job_index, f"Error processing result: {e}")
493
634
  except Exception as e:
494
635
  logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
495
636
  self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
496
- # No finally block incrementing count here; tracking is batch-based
497
-
498
- except TimeoutError:
499
- # `as_completed` timed out waiting for remaining futures in batch
500
- logger.error(
501
- f"Batch processing timed out after {batch_timeout}s waiting "
502
- f"for futures. Some jobs in batch may be lost or incomplete."
503
- )
504
- # Identify and fail remaining futures
505
- remaining_indices_in_batch = []
506
- for f, idx in batch_futures_dict.items():
507
- if not f.done():
508
- remaining_indices_in_batch.append(idx)
509
- f.cancel() # Attempt to cancel underlying task
510
- logger.warning(
511
- f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
512
- )
513
- for idx in remaining_indices_in_batch:
514
- self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
515
- # End of processing for this batch cycle
516
637
 
517
638
  # --- Final Logging ---
518
639
  self._log_final_status(total_jobs)
519
640
 
520
- return self.results, self.failures
641
+ return self.results, self.failures, self.traces if self.return_traces else []
521
642
 
522
643
 
523
644
  class NvIngestClient:
@@ -546,11 +667,12 @@ class NvIngestClient:
546
667
  message_client_port : int, optional
547
668
  Port of the REST/message service. Defaults to 7670.
548
669
  message_client_kwargs : dict, optional
549
- Extra keyword arguments passed to the client allocator.
670
+ Extra keyword arguments passed to the client allocator. For RestClient,
671
+ can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
550
672
  msg_counter_id : str, optional
551
673
  Identifier for message counting. Defaults to "nv-ingest-message-id".
552
674
  worker_pool_size : int, optional
553
- Number of workers in the thread pool. Defaults to 1.
675
+ Number of workers in the thread pool. Defaults to 8.
554
676
 
555
677
  Returns
556
678
  -------
@@ -572,10 +694,19 @@ class NvIngestClient:
572
694
  **self._message_client_kwargs,
573
695
  )
574
696
 
575
- # Initialize the worker pool with the specified size
697
+ # Initialize the worker pool with the specified size (used for both submit and fetch)
576
698
  self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)
577
699
 
700
+ # Telemetry state and controls
701
+ self._telemetry_lock = threading.Lock()
702
+ self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
703
+ try:
704
+ self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
705
+ except ValueError:
706
+ self._telemetry_max_calls = 10000
578
707
  self._telemetry = {}
708
+ self._completed_parent_trace_ids: List[str] = [] # 1054
709
+ self.reset_telemetry()
579
710
 
580
711
  def __str__(self) -> str:
581
712
  """
@@ -623,6 +754,106 @@ class NvIngestClient:
623
754
 
624
755
  return job_state
625
756
 
757
+ # ------------------------------------------------------------------
758
+ # Telemetry helpers
759
+ # ------------------------------------------------------------------
760
+
761
def enable_telemetry(self, enabled: bool) -> None:
    """
    Switch telemetry recording on or off.

    Parameters
    ----------
    enabled : bool
        Desired state; the value is coerced with ``bool()`` before it is stored.
    """
    flag = bool(enabled)
    # The flag is written under the telemetry lock, like the rest of the telemetry state.
    with self._telemetry_lock:
        self._telemetry_enabled = flag
764
+
765
def reset_telemetry(self) -> None:
    """
    Discard all collected telemetry and start from an empty structure.

    The replacement dictionary records the current wall-clock time under
    ``"started_at"`` and contains empty ``"submit"``, ``"fetch"`` and
    ``"per_job"`` sections.
    """
    with self._telemetry_lock:
        started = time.time()
        submit_section = {"count": 0, "calls": []}
        fetch_section = {"count": 0, "last_ts": None, "intervals": [], "calls": []}
        self._telemetry = {
            "started_at": started,
            "submit": submit_section,
            "fetch": fetch_section,
            "per_job": {},
        }
773
+
774
def _t_per_job(self, job_index: str) -> Dict[str, Any]:
    """
    Return the per-job telemetry record for ``job_index``, creating it on first use.

    This helper does not take the telemetry lock itself.

    Parameters
    ----------
    job_index : str
        Key under which the record is stored in the ``"per_job"`` section.

    Returns
    -------
    Dict[str, Any]
        The stored (mutable) record; a newly created one has empty lists and
        zeroed counters.
    """
    registry = self._telemetry["per_job"]
    entry = registry.get(job_index)
    if entry is not None:
        return entry

    entry = {
        "submits": [],
        "fetch_attempts": [],
        "timeouts_202": 0,
        "failures": 0,
        "first_success_ts": None,
    }
    registry[job_index] = entry
    return entry
780
+
781
def _t_append_capped(self, arr: List[Any], item: Any) -> None:
    """
    Append ``item`` to ``arr`` unless the list has reached the telemetry cap.

    Once ``arr`` already holds ``self._telemetry_max_calls`` entries the item
    is dropped without any error.
    """
    if len(arr) >= self._telemetry_max_calls:
        return
    arr.append(item)
784
+
785
def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
    """
    Record one submission attempt in the telemetry structure.

    Does nothing when telemetry is disabled. Otherwise the global submit
    counter is always incremented, while the detailed call record and the
    per-job timestamp are stored only while their lists are below the cap.

    Parameters
    ----------
    job_index : str
        Client-side job identifier.
    status : str
        Outcome label stored verbatim with the call record.
    ts : float
        Timestamp of the submission.
    trace_id : Optional[str]
        Trace identifier stored with the call record, if any.
    """
    if not self._telemetry_enabled:
        return

    with self._telemetry_lock:
        submit_stats = self._telemetry["submit"]
        submit_stats["count"] += 1

        call_record = {"job": job_index, "status": status, "ts": ts, "trace": trace_id}
        self._t_append_capped(submit_stats["calls"], call_record)

        job_stats = self._t_per_job(job_index)
        self._t_append_capped(job_stats["submits"], ts)
796
+
797
def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
    """
    Record that a fetch attempt for ``job_index`` started at ``ts``.

    Does nothing when telemetry is disabled. Besides bumping the global fetch
    counter, the gap to the previously recorded attempt (across all jobs) is
    stored as an interval; negative gaps are discarded. ``"last_ts"`` is
    overwritten with ``ts`` in every case.

    Parameters
    ----------
    job_index : str
        Client-side job identifier.
    ts : float
        Timestamp of the attempt.
    """
    if not self._telemetry_enabled:
        return

    with self._telemetry_lock:
        fetch_stats = self._telemetry["fetch"]
        fetch_stats["count"] += 1

        previous_ts = fetch_stats["last_ts"]
        if previous_ts is not None:
            gap = ts - float(previous_ts)
            if gap >= 0:
                self._t_append_capped(fetch_stats["intervals"], gap)
        fetch_stats["last_ts"] = ts

        job_stats = self._t_per_job(job_index)
        self._t_append_capped(job_stats["fetch_attempts"], ts)
810
+
811
def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
    """
    Record the result of one fetch attempt.

    Does nothing when telemetry is disabled. A call record is stored (subject
    to the cap) and the per-job counters are updated:

    * ``code == 2`` counts as a "not ready" timeout,
    * the first successful outcome stamps ``"first_success_ts"``,
    * an unsuccessful outcome with a code other than 0 or 2 counts as a failure.

    Parameters
    ----------
    job_index : str
        Client-side job identifier.
    code : int
        Response code of the fetch.
    ts : float
        Timestamp of the outcome.
    ok : bool
        Whether the fetch succeeded.
    trace_id : Optional[str]
        Trace identifier stored with the call record, if any.
    """
    if not self._telemetry_enabled:
        return

    with self._telemetry_lock:
        outcome = {"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id}
        self._t_append_capped(self._telemetry["fetch"]["calls"], outcome)

        job_stats = self._t_per_job(job_index)
        if code == 2:  # 202 not ready
            job_stats["timeouts_202"] += 1

        if ok:
            # Only the earliest success is kept.
            if job_stats["first_success_ts"] is None:
                job_stats["first_success_ts"] = ts
        elif code not in (0, 2):
            job_stats["failures"] += 1
826
+
827
def get_telemetry(self) -> Dict[str, Any]:
    """
    Return an independent snapshot of the collected telemetry.

    Returns
    -------
    Dict[str, Any]
        A deep copy taken while holding the telemetry lock, so later
        recording does not alter the returned structure.
    """
    with self._telemetry_lock:
        snapshot = copy.deepcopy(self._telemetry)
    return snapshot
830
+
831
def summarize_telemetry(self) -> Dict[str, Any]:
    """
    Condense the collected telemetry into a flat summary.

    Returns
    -------
    Dict[str, Any]
        Submit/fetch counters, the mean, median and 95th-percentile of the
        recorded fetch intervals (all ``0.0`` when none were recorded), the
        number of tracked jobs, and the totals of per-job "not ready"
        timeouts and failures.
    """
    with self._telemetry_lock:
        telemetry = self._telemetry
        submit_total = telemetry["submit"]["count"]
        fetch_total = telemetry["fetch"]["count"]

        # Work on a sorted copy so the stored interval order is left untouched.
        ordered = sorted(telemetry["fetch"]["intervals"])
        if ordered:
            interval_avg = mean(ordered)
            interval_p50 = median(ordered)
            # p95 via index into the sorted sample (index is truncated, not interpolated).
            interval_p95 = ordered[int(0.95 * (len(ordered) - 1))]
        else:
            interval_avg = interval_p50 = interval_p95 = 0.0

        job_records = telemetry["per_job"]
        timeouts_total = sum(record.get("timeouts_202", 0) for record in job_records.values())
        failures_total = sum(record.get("failures", 0) for record in job_records.values())

        return {
            "submit_count": submit_total,
            "fetch_count": fetch_total,
            "fetch_interval_avg": interval_avg,
            "fetch_interval_p50": interval_p50,
            "fetch_interval_p95": interval_p95,
            "jobs_tracked": len(job_records),
            "timeouts_202_total": timeouts_total,
            "failures_total": failures_total,
        }
856
+
626
857
  def _get_and_check_job_state(
627
858
  self,
628
859
  job_index: str,
@@ -860,6 +1091,8 @@ class NvIngestClient:
860
1091
  Exception
861
1092
  For unexpected issues.
862
1093
  """
1094
+ ts_attempt = time.time()
1095
+ self._t_record_fetch_attempt(job_index, ts_attempt)
863
1096
  try:
864
1097
  # Get job state using the client-side index
865
1098
  job_state = self._get_and_check_job_state(
@@ -900,6 +1133,7 @@ class NvIngestClient:
900
1133
  logger.debug(
901
1134
  f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
902
1135
  )
1136
+ self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
903
1137
  return result_data, job_index, job_state.trace_id
904
1138
 
905
1139
  except json.JSONDecodeError as err:
@@ -921,6 +1155,7 @@ class NvIngestClient:
921
1155
  elif response.response_code == 2: # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
922
1156
  # Raise TimeoutError to signal the calling retry loop in fetch_job_result
923
1157
  # Do not change job state here, remains SUBMITTED
1158
+ self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
924
1159
  raise TimeoutError(f"Job not ready: {response.response_reason}")
925
1160
 
926
1161
  else:
@@ -933,6 +1168,7 @@ class NvIngestClient:
933
1168
  job_state.state = JobStateEnum.FAILED # Mark job as failed in the client
934
1169
  # Do NOT pop the state for failed jobs here
935
1170
  # Raise RuntimeError to indicate a terminal failure for this fetch attempt
1171
+ self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
936
1172
  raise RuntimeError(error_msg)
937
1173
 
938
1174
  except (TimeoutError, ValueError, RuntimeError):
@@ -944,12 +1180,17 @@ class NvIngestClient:
944
1180
  # Attempt to mark state as FAILED if possible and state object exists
945
1181
  if "job_state" in locals() and hasattr(job_state, "state"):
946
1182
  job_state.state = JobStateEnum.FAILED
1183
+ try:
1184
+ self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
1185
+ except Exception:
1186
+ pass
947
1187
  raise # Re-raise the original exception
948
1188
 
949
1189
  def fetch_job_result_cli(
950
1190
  self,
951
1191
  job_ids: Union[str, List[str]],
952
1192
  data_only: bool = False,
1193
+ timeout: Optional[Tuple[int, Optional[float]]] = None,
953
1194
  ) -> List[Tuple[Any, str, Optional[str]]]:
954
1195
  """
955
1196
  Fetch job results via CLI semantics (synchronous list return).
@@ -969,23 +1210,71 @@ class NvIngestClient:
969
1210
  if isinstance(job_ids, str):
970
1211
  job_ids = [job_ids]
971
1212
 
972
- return [self._fetch_job_result(job_id, data_only=data_only) for job_id in job_ids]
1213
+ eff_timeout: Tuple[int, Optional[float]] = timeout if timeout is not None else (100, None)
1214
+ return [self._fetch_job_result(job_id, timeout=eff_timeout, data_only=data_only) for job_id in job_ids]
1215
+
1216
def _validate_batch_size(self, batch_size: Optional[int]) -> int:
    """
    Validate ``batch_size`` and return a usable value.

    Parameters
    ----------
    batch_size : Optional[int]
        The batch size to validate. ``None`` means "use the value of the
        ``NV_INGEST_CLIENT_BATCH_SIZE`` environment variable", falling back
        to 32 when the variable is unset or not an integer.

    Returns
    -------
    int
        The validated batch size. Values that are not integers (including
        ``bool``) or that are smaller than 1 are replaced by the default 32
        and a warning is logged. Values below 8 or above 128 are accepted
        but trigger a performance/memory warning.
    """
    default_batch_size = 32

    # Handle None/default case
    if batch_size is None:
        try:
            batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", str(default_batch_size)))
        except ValueError:
            batch_size = default_batch_size

    # Validate type and range. bool is a subclass of int, so it has to be
    # rejected explicitly; otherwise True/False would be read as 1/0.
    if isinstance(batch_size, bool) or not isinstance(batch_size, int):
        logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default 32.")
        return default_batch_size

    if batch_size < 1:
        logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default 32.")
        return default_batch_size

    # Performance guidance warnings
    if batch_size < 8:
        logger.warning(f"batch_size {batch_size} is very small and may impact performance.")
    elif batch_size > 128:
        logger.warning(f"batch_size {batch_size} is large and may increase memory usage.")

    return batch_size
973
1254
 
974
1255
  def process_jobs_concurrently(
975
1256
  self,
976
1257
  job_indices: Union[str, List[str]],
977
1258
  job_queue_id: Optional[str] = None,
1259
+ batch_size: Optional[int] = None,
978
1260
  concurrency_limit: int = 64,
979
1261
  timeout: int = 100,
980
1262
  max_job_retries: Optional[int] = None,
981
- retry_delay: float = 5.0,
1263
+ retry_delay: float = 0.5,
1264
+ initial_fetch_delay: float = 0.3,
982
1265
  fail_on_submit_error: bool = False,
983
1266
  completion_callback: Optional[Callable[[Any, str], None]] = None,
984
1267
  return_failures: bool = False,
985
1268
  data_only: bool = True,
986
1269
  stream_to_callback_only: bool = False,
1270
+ return_full_response: bool = False,
987
1271
  verbose: bool = False,
988
- ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
1272
+ return_traces: bool = False,
1273
+ ) -> Union[
1274
+ List[Any],
1275
+ Tuple[List[Any], List[Tuple[str, str]]],
1276
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
1277
+ ]:
989
1278
  """
990
1279
  Submit and fetch multiple jobs concurrently.
991
1280
 
@@ -995,8 +1284,12 @@ class NvIngestClient:
995
1284
  Single or multiple job indices to process.
996
1285
  job_queue_id : str, optional
997
1286
  Queue identifier for submission.
1287
+ batch_size : int, optional
1288
+ Maximum number of jobs to process in each internal batch.
1289
+ Higher values may improve throughput but increase memory usage.
1290
+ Must be >= 1. Default is 32.
998
1291
  concurrency_limit : int, optional
999
- Max number of simultaneous in-flight jobs. Default is 128.
1292
+ Max number of simultaneous in-flight jobs. Default is 64.
1000
1293
  timeout : int, optional
1001
1294
  Timeout in seconds per fetch attempt. Default is 100.
1002
1295
  max_job_retries : int, optional
@@ -1011,8 +1304,13 @@ class NvIngestClient:
1011
1304
  If True, return (results, failures). Default is False.
1012
1305
  data_only : bool, optional
1013
1306
  If True, return only payload 'data'. Default is True.
1307
+ return_full_response : bool, optional
1308
+ If True, results contain the full response envelopes (including 'trace' and 'annotations').
1309
+ Ignored when stream_to_callback_only=True. Default is False.
1014
1310
  verbose : bool, optional
1015
1311
  If True, enable debug logging. Default is False.
1312
+ return_traces : bool, optional
1313
+ If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
1016
1314
 
1017
1315
  Returns
1018
1316
  -------
@@ -1020,6 +1318,9 @@ class NvIngestClient:
1020
1318
  List of successful job results when `return_failures` is False.
1021
1319
  results, failures : tuple
1022
1320
  Tuple of (successful results, failure tuples) when `return_failures` is True.
1321
+ results, failures, traces : tuple
1322
+ Tuple of (successful results, failure tuples, trace dicts) when both
1323
+ `return_failures` and `return_traces` are True.
1023
1324
 
1024
1325
  Raises
1025
1326
  ------
@@ -1032,29 +1333,45 @@ class NvIngestClient:
1032
1333
 
1033
1334
  # Handle empty input
1034
1335
  if not job_indices:
1035
- return ([], []) if return_failures else []
1336
+ if return_failures and return_traces:
1337
+ return [], [], []
1338
+ elif return_failures:
1339
+ return [], []
1340
+ else:
1341
+ return []
1036
1342
 
1037
- # Prepare timeout tuple for fetch calls
1038
- effective_timeout: Tuple[int, None] = (timeout, None)
1343
+ # Validate and set batch_size
1344
+ validated_batch_size = self._validate_batch_size(batch_size)
1345
+
1346
+ # Prepare timeout tuple to mirror handler behavior: finite connect, unbounded read (long-poll)
1347
+ effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
1039
1348
 
1040
1349
  # Delegate to the concurrent processor
1041
1350
  processor = _ConcurrentProcessor(
1042
1351
  client=self,
1043
- batch_size=64,
1352
+ batch_size=validated_batch_size,
1044
1353
  job_indices=job_indices,
1045
1354
  job_queue_id=job_queue_id,
1046
1355
  timeout=effective_timeout,
1047
1356
  max_job_retries=max_job_retries,
1357
+ retry_delay=retry_delay,
1358
+ initial_fetch_delay=initial_fetch_delay,
1048
1359
  completion_callback=completion_callback,
1049
1360
  fail_on_submit_error=fail_on_submit_error,
1050
1361
  stream_to_callback_only=stream_to_callback_only,
1362
+ return_full_response=return_full_response,
1051
1363
  verbose=verbose,
1364
+ return_traces=return_traces,
1052
1365
  )
1053
1366
 
1054
- results, failures = processor.run()
1367
+ results, failures, traces = processor.run()
1055
1368
 
1056
- if return_failures:
1369
+ if return_failures and return_traces:
1370
+ return results, failures, traces
1371
+ elif return_failures:
1057
1372
  return results, failures
1373
+ elif return_traces:
1374
+ return results, traces
1058
1375
 
1059
1376
  if failures:
1060
1377
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
@@ -1087,7 +1404,12 @@ class NvIngestClient:
1087
1404
  job_state.trace_id = future.result()[0] # Trace_id from `submit_job` endpoint submission
1088
1405
  job_state.future = None
1089
1406
 
1090
- def fetch_job_result_async(self, job_ids: Union[str, List[str]], data_only: bool = True) -> Dict[Future, str]:
1407
+ def fetch_job_result_async(
1408
+ self,
1409
+ job_ids: Union[str, List[str]],
1410
+ data_only: bool = True,
1411
+ timeout: Optional[Tuple[int, Optional[float]]] = None,
1412
+ ) -> Dict[Future, str]:
1091
1413
  """
1092
1414
  Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.
1093
1415
 
@@ -1108,7 +1430,7 @@ class NvIngestClient:
1108
1430
  future_to_job_id = {}
1109
1431
  for job_id in job_ids:
1110
1432
  job_state = self._get_and_check_job_state(job_id)
1111
- future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only)
1433
+ future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only, timeout)
1112
1434
  job_state.future = future
1113
1435
  future_to_job_id[future] = job_id
1114
1436
 
@@ -1159,12 +1481,19 @@ class NvIngestClient:
1159
1481
  # Free up memory -- payload should never be used again, and we don't want to keep it around.
1160
1482
  job_state.job_spec.payload = None
1161
1483
 
1484
+ try:
1485
+ self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
1486
+ except Exception:
1487
+ pass
1162
1488
  return x_trace_id
1163
1489
  except Exception as err:
1164
1490
  err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
1165
1491
  logger.exception(err_msg)
1166
1492
  job_state.state = JobStateEnum.FAILED
1167
-
1493
+ try:
1494
+ self._t_record_submit(job_index, "fail", time.time(), None)
1495
+ except Exception:
1496
+ pass
1168
1497
  raise
1169
1498
 
1170
1499
  def submit_job(
@@ -1385,7 +1714,9 @@ class NvIngestClient:
1385
1714
 
1386
1715
  return results
1387
1716
 
1388
- def create_jobs_for_batch(self, files_batch: List[str], tasks: Dict[str, Any]) -> List[str]:
1717
+ def create_jobs_for_batch(
1718
+ self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
1719
+ ) -> List[str]:
1389
1720
  """
1390
1721
  Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
1391
1722
  This function takes a batch of files, processes each file to extract its content and type,
@@ -1401,6 +1732,9 @@ class NvIngestClient:
1401
1732
  A dictionary of tasks to be added to each job. The keys represent task names, and the
1402
1733
  values represent task specifications or configurations. Standard tasks include "split",
1403
1734
  "extract", "store", "caption", "dedup", "filter", "embed".
1735
+ pdf_split_page_count : int, optional
1736
+ Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
1737
+ to the job spec's extended_options for PDF files.
1404
1738
 
1405
1739
  Returns
1406
1740
  -------
@@ -1447,6 +1781,10 @@ class NvIngestClient:
1447
1781
 
1448
1782
  job_specs = create_job_specs_for_batch(files_batch)
1449
1783
 
1784
+ # Apply PDF split config if provided
1785
+ if pdf_split_page_count is not None:
1786
+ apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
1787
+
1450
1788
  job_ids = []
1451
1789
  for job_spec in job_specs:
1452
1790
  logger.debug(f"Tasks: {tasks.keys()}")
@@ -1476,3 +1814,19 @@ class NvIngestClient:
1476
1814
  job_ids.append(job_id)
1477
1815
 
1478
1816
  return job_ids
1817
+
1818
def register_parent_trace_id(self, trace_id: Optional[str]) -> None:
    """
    Remember a parent trace identifier whose aggregation has completed.

    Parameters
    ----------
    trace_id : Optional[str]
        Identifier to record. ``None`` and empty values are ignored, and an
        identifier that is already recorded is not added a second time.
    """
    known_ids = self._completed_parent_trace_ids
    if trace_id and trace_id not in known_ids:
        known_ids.append(trace_id)
1826
+
1827
def consume_completed_parent_trace_ids(self) -> List[str]:
    """
    Hand back the recorded parent trace identifiers and forget them.

    Returns
    -------
    List[str]
        A new list with the identifiers in the order they were registered.
        The internal list is emptied in place.
    """
    pending = self._completed_parent_trace_ids
    drained = pending[:]
    del pending[:]
    return drained