nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -10,6 +10,9 @@ import logging
 import math
 import os
 import time
+import threading
+import copy
+from statistics import mean, median
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
@@ -32,11 +35,59 @@ from nv_ingest_client.primitives.tasks import TaskType
 from nv_ingest_client.primitives.tasks import is_valid_task_type
 from nv_ingest_client.primitives.tasks import task_factory
 from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
-from nv_ingest_client.util.util import create_job_specs_for_batch, check_ingest_result
+from nv_ingest_client.util.util import (
+    create_job_specs_for_batch,
+    check_ingest_result,
+    apply_pdf_split_config_to_job_specs,
+)
 
 logger = logging.getLogger(__name__)
 
 
+def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Compute resident_time entries from entry/exit pairs if not already present.
+
+    This ensures consistency between split jobs (where server computes resident_time)
+    and non-split jobs (where we compute it client-side).
+
+    Parameters
+    ----------
+    trace_dict : Dict[str, Any]
+        Trace dictionary with entry/exit pairs
+
+    Returns
+    -------
+    Dict[str, Any]
+        Trace dictionary with resident_time entries added
+    """
+    if not trace_dict or not isinstance(trace_dict, dict):
+        return trace_dict
+
+    # Check if resident_time already exists (server-computed for split jobs)
+    has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
+    if has_resident:
+        return trace_dict  # Already computed by server
+
+    # Compute resident_time from entry/exit pairs
+    result = dict(trace_dict)
+    stages = set()
+
+    # Find all unique stages
+    for key in trace_dict:
+        if key.startswith("trace::entry::"):
+            stages.add(key.replace("trace::entry::", ""))
+
+    # Compute resident_time for each stage
+    for stage in stages:
+        entry_key = f"trace::entry::{stage}"
+        exit_key = f"trace::exit::{stage}"
+        if entry_key in trace_dict and exit_key in trace_dict:
+            result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
+
+    return result
+
+
 class DataDecodeException(Exception):
     """
     Exception raised for errors in decoding data.
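
To make the new helper's behavior concrete, here is a minimal illustration (the stage name and timestamps are invented for the example):

    trace = {
        "trace::entry::pdf_extractor": 1000.0,
        "trace::exit::pdf_extractor": 1012.5,
    }
    enriched = _compute_resident_times(trace)
    assert enriched["trace::resident_time::pdf_extractor"] == 12.5
    # A dict that already contains any "trace::resident_time::*" key is returned
    # unchanged, since the server pre-computes those values for split jobs.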
@@ -57,15 +108,12 @@ class DataDecodeException(Exception):
 
 class _ConcurrentProcessor:
     """
-    Manages the asynchronous submission and result fetching of jobs using a
-    client's public methods, mirroring the batching structure of the CLI path.
-
-    This processor takes a list of pre-created job indices, submits them in
-    batches via the client's `submit_job_async`, and then fetches results
-    for each batch using `fetch_job_result_async`. It processes results as
-    they become available within the batch using `as_completed`. Retries due
-    to job readiness timeouts are handled by adding the job index to the next
-    processing batch.
+    Manages asynchronous submission and result fetching while keeping a steady
+    pool of up to `batch_size` in-flight jobs:
+    - Retries (202/TimeoutError) are re-queued immediately.
+    - New jobs are submitted as capacity frees up.
+    - Fetches are started for jobs added each cycle.
+    - We always attempt to keep the executor saturated up to `batch_size`.
     """
 
     def __init__(
@@ -76,10 +124,14 @@ class _ConcurrentProcessor:
         batch_size: int,
         timeout: Tuple[int, Union[float, None]],
         max_job_retries: Optional[int],
+        retry_delay: float,
+        initial_fetch_delay: float,
         completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
         fail_on_submit_error: bool,
         stream_to_callback_only: bool,
+        return_full_response: bool,
         verbose: bool = False,
+        return_traces: bool = False,
     ):
         """
         Initializes the concurrent processor.
@@ -113,6 +165,8 @@ class _ConcurrentProcessor:
             initiating job submission or fetching fails for a batch.
         verbose : bool, optional
             If True, enables detailed debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level trace data for each completed job is stored.
 
         Raises
         ------
@@ -128,16 +182,21 @@ class _ConcurrentProcessor:
         self.batch_size = batch_size
         self.timeout = timeout
         self.max_job_retries = max_job_retries
+        self.retry_delay = retry_delay
+        self.initial_fetch_delay = initial_fetch_delay
         self.completion_callback = completion_callback
         self.fail_on_submit_error = fail_on_submit_error
         self.stream_to_callback_only = stream_to_callback_only
+        self.return_full_response = return_full_response
         self.verbose = verbose
+        self.return_traces = return_traces
 
         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)
+        self.traces: List[Optional[Dict[str, Any]]] = []
 
         # --- Initial Checks ---
         if not self.job_queue_id:
@@ -225,13 +284,25 @@ class _ConcurrentProcessor:
 
         is_failed, description = check_ingest_result(result_data)
 
+        if trace_id:
+            self.client.register_parent_trace_id(trace_id)
+
         if is_failed:
             failed_job_spec = self.client._job_index_to_job_spec.get(job_index)
             self.failures.append((f"{job_index}:{failed_job_spec.source_id}", description))
         elif self.stream_to_callback_only:
             self.results.append(job_index)
         else:
-            self.results.append(result_data.get("data"))
+            # When requested, return the full response envelope (includes 'trace' and 'annotations')
+            self.results.append(result_data if self.return_full_response else result_data.get("data"))
+
+        # Extract trace data for all successful (non-failed) jobs
+        if self.return_traces and not is_failed:
+            trace_payload = result_data.get("trace") if result_data else None
+            # Compute resident_time if not already present (for consistency)
+            if trace_payload:
+                trace_payload = _compute_resident_times(trace_payload)
+            self.traces.append(trace_payload if trace_payload else None)
 
         # Cleanup retry count if it exists
        if job_index in self.retry_counts:
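
The success handler above assumes a response envelope shaped roughly as follows; the keys are the ones the code reads, while the values are invented:

    result_data = {
        "data": [{"metadata": {}}],  # payload list; returned when return_full_response is False
        "trace": {"trace::entry::stage_a": 1000.0, "trace::exit::stage_a": 1001.0},
        "annotations": {},
    }
    # With return_full_response=True the whole envelope is appended to results;
    # with return_traces=True the "trace" dict (enriched by _compute_resident_times)
    # is appended to self.traces for each non-failed job.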
@@ -275,21 +346,156 @@ class _ConcurrentProcessor:
         except Exception:
             logger.warning("Could not reliably extract job indices from results for final check.")
 
-        initial_indices = set(self.all_job_indices_list)
-        unaccounted_indices = initial_indices - processed_indices
-
-        if unaccounted_indices:
-            logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
-            # Optionally add them to failures
-            # for idx in unaccounted_indices:
-            #     if not any(f[0] == idx for f in self.failures):
-            #         self.failures.append((idx, "Job lost or unaccounted for at exit"))
-
     # --------------------------------------------------------------------------
-    # Public Methods
+    # Declarative Helper Methods (behavior preserved)
     # --------------------------------------------------------------------------
 
-    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
+    def _collect_retry_jobs_for_batch(self) -> List[str]:
+        """
+        Collect retry jobs for this batch, mirroring handler behavior (no pacing filter).
+
+        Returns
+        -------
+        List[str]
+            The list of job indices that should be retried in this batch.
+        """
+        if not self.retry_job_ids:
+            return []
+
+        # Take all retries this cycle and clear the list (handler resets per-iteration)
+        eligible: List[str] = list(self.retry_job_ids)
+        self.retry_job_ids = []
+        if eligible and self.verbose:
+            logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
+        return eligible
+
+    def _schedule_retry(self, job_index: str) -> None:
+        """
+        Schedule an immediate retry for a job (no pacing), mirroring handler behavior.
+        """
+        if job_index not in self.retry_job_ids:
+            self.retry_job_ids.append(job_index)
+
+    def _select_new_jobs_for_batch(
+        self,
+        submitted_new_indices_count: int,
+        total_jobs: int,
+        already_in_batch: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Determine the slice of new jobs to include in the current batch based on
+        remaining capacity and unsubmitted jobs.
+
+        Note: This does NOT change submitted_new_indices_count. The original code
+        increments that counter only after submission is attempted/handled.
+        """
+        if (already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
+            num_new_to_add = min(self.batch_size - already_in_batch, total_jobs - submitted_new_indices_count)
+            start_idx = submitted_new_indices_count
+            end_idx = submitted_new_indices_count + num_new_to_add
+            new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+
+            if self.verbose:
+                logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")
+
+            return new_job_indices, submitted_new_indices_count
+
+        return [], submitted_new_indices_count
+
+    def _submit_new_jobs_async(
+        self,
+        current_batch_new_job_indices: List[str],
+        current_batch_job_indices: List[str],
+        submitted_new_indices_count: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Initiate asynchronous submission for the new jobs selected for this batch.
+
+        Mirrors the original inline submission block, including error handling and
+        fail_on_submit_error semantics. Returns potentially updated batch indices and
+        submitted count.
+        """
+        if not current_batch_new_job_indices:
+            return current_batch_job_indices, submitted_new_indices_count
+
+        if not self.job_queue_id:
+            error_msg = "Cannot submit new jobs: job_queue_id is not set."
+            logger.error(error_msg)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise ValueError(error_msg)
+            return current_batch_job_indices, submitted_new_indices_count
+
+        try:
+            # Fire-and-forget submission initiation
+            _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
+            # Add successfully initiated jobs to the overall batch list
+            current_batch_job_indices.extend(current_batch_new_job_indices)
+            # Update count of total initiated jobs
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            return current_batch_job_indices, submitted_new_indices_count
+        except Exception as e:
+            error_msg = (
+                f"Batch async submission initiation failed for {len(current_batch_new_job_indices)} new jobs: {e}"
+            )
+            logger.error(error_msg, exc_info=True)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(
+                    job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                )
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise RuntimeError(error_msg) from e
+            return current_batch_job_indices, submitted_new_indices_count
+
+    def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
+        """
+        Initiate fetching for the prepared batch and ensure consistency of returned futures.
+
+        Returns
+        -------
+        batch_futures_dict : Dict[Future, str]
+            Mapping of futures to their associated job indices.
+        normalized_job_indices : List[str]
+            The job indices normalized to those actually returned by the client if a discrepancy occurs.
+        """
+        if self.verbose:
+            logger.debug(f"Calling fetch_job_result_async for {len(current_batch_job_indices)} jobs.")
+        batch_futures_dict: Dict[Future, str] = (
+            self.client.fetch_job_result_async(current_batch_job_indices, data_only=False, timeout=None)
+            if current_batch_job_indices
+            else {}
+        )
+
+        # Check for discrepancies where client might not return all futures
+        if current_batch_job_indices and (len(batch_futures_dict) != len(current_batch_job_indices)):
+            returned_indices = set(batch_futures_dict.values())
+            missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
+            logger.error(
+                f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
+                f"{len(batch_futures_dict)}. Missing: {missing_indices}"
+            )
+            # Fail the missing ones explicitly
+            for missing_idx in missing_indices:
+                self._handle_processing_failure(
+                    missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
+                )
+            if self.fail_on_submit_error:
+                raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
+            # Continue processing only the futures we received
+            normalized_job_indices = list(returned_indices)
+        else:
+            normalized_job_indices = list(current_batch_job_indices)
+
+        return batch_futures_dict, normalized_job_indices
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.
 
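As a quick sanity check of the slice arithmetic in `_select_new_jobs_for_batch`, a hypothetical call (processor construction elided, numbers invented):

    new_jobs, count = processor._select_new_jobs_for_batch(
        submitted_new_indices_count=90, total_jobs=100, already_in_batch=3
    )
    # With batch_size=8: num_new_to_add = min(8 - 3, 100 - 90) = 5, so the slice
    # all_job_indices_list[90:95] is returned and `count` stays 90; the counter
    # is only advanced later by _submit_new_jobs_async.
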
@@ -315,210 +521,124 @@ class _ConcurrentProcessor:
             initiation error occurs.
         """
         total_jobs = len(self.all_job_indices_list)
-        # Tracks indices for which submission has been initiated at least once
-        submitted_new_indices_count = 0
-
-        logger.info(f"Starting batch processing for {total_jobs} jobs with batch " f"size {self.batch_size}.")
-
-        # Main loop: continues as long as there are new jobs to submit
-        # or jobs waiting for retry.
-        while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:
-
-            # --- Determine Jobs for Current Batch ---
-            current_batch_job_indices: List[str] = []
-
-            # Add retries from the previous batch first
-            if self.retry_job_ids:
-                num_retries = len(self.retry_job_ids)
-                current_batch_job_indices.extend(self.retry_job_ids)
-                if self.verbose:
-                    logger.debug(f"Adding {num_retries} retry jobs to current batch.")
-                # Clear the list; retries for *this* batch will be collected later
-                self.retry_job_ids = []
-
-            # Determine and add new jobs to the batch
-            num_already_in_batch = len(current_batch_job_indices)
-            if (num_already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
-                num_new_to_add = min(self.batch_size - num_already_in_batch, total_jobs - submitted_new_indices_count)
-                start_idx = submitted_new_indices_count
-                end_idx = submitted_new_indices_count + num_new_to_add
-                current_batch_new_job_indices = self.all_job_indices_list[start_idx:end_idx]
-
-                if self.verbose:
-                    logger.debug(f"Adding {len(current_batch_new_job_indices)} new " f"jobs to current batch.")
-
-                # Initiate async submission for ONLY the NEW jobs
-                if current_batch_new_job_indices:
-                    if not self.job_queue_id:
-                        error_msg = "Cannot submit new jobs: job_queue_id is not set."
-                        logger.error(error_msg)
-                        # Fail these jobs immediately
-                        for job_index in current_batch_new_job_indices:
-                            self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
-                        # Mark as "submitted" (to prevent reprocessing) but failed
-                        submitted_new_indices_count += len(current_batch_new_job_indices)
-                        if self.fail_on_submit_error:
-                            raise ValueError(error_msg)
-                    else:
-                        try:
-                            # Fire-and-forget submission initiation
-                            _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
-                            # Add successfully initiated jobs to the overall batch list
-                            current_batch_job_indices.extend(current_batch_new_job_indices)
-                            # Update count of total initiated jobs
-                            submitted_new_indices_count += len(current_batch_new_job_indices)
-                        except Exception as e:
-                            error_msg = (
-                                f"Batch async submission initiation failed for "
-                                f"{len(current_batch_new_job_indices)} new jobs: {e}"
+        submitted_new_indices_count = 0  # Tracks indices for which submission has been initiated at least once
+
+        logger.debug(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
+
+        # Keep up to batch_size jobs in-flight at all times
+        inflight_futures: Dict[Future, str] = {}
+
+        while (submitted_new_indices_count < total_jobs) or self.retry_job_ids or inflight_futures:
+            # 1) Top up from retries first
+            capacity = max(0, self.batch_size - len(inflight_futures))
+            to_fetch: List[str] = []
+            if capacity > 0 and self.retry_job_ids:
+                take = min(capacity, len(self.retry_job_ids))
+                retry_now = self.retry_job_ids[:take]
+                self.retry_job_ids = self.retry_job_ids[take:]
+                to_fetch.extend(retry_now)
+                capacity -= len(retry_now)
+
+            # 2) Then add new jobs up to capacity
+            if capacity > 0 and (submitted_new_indices_count < total_jobs):
+                new_count = min(capacity, total_jobs - submitted_new_indices_count)
+                new_job_indices = self.all_job_indices_list[
+                    submitted_new_indices_count : submitted_new_indices_count + new_count
+                ]
+
+                if not self.job_queue_id:
+                    error_msg = "Cannot submit new jobs: job_queue_id is not set."
+                    logger.error(error_msg)
+                    for job_index in new_job_indices:
+                        self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+                    submitted_new_indices_count += len(new_job_indices)
+                    if self.fail_on_submit_error:
+                        raise ValueError(error_msg)
+                else:
+                    try:
+                        _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+                        submitted_new_indices_count += len(new_job_indices)
+                        to_fetch.extend(new_job_indices)
+                    except Exception as e:
+                        error_msg = f"Batch async submission initiation failed for {len(new_job_indices)} new jobs: {e}"
+                        logger.error(error_msg, exc_info=True)
+                        for job_index in new_job_indices:
+                            self._handle_processing_failure(
+                                job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
                             )
-                            logger.error(error_msg, exc_info=True)
-                            # Fail these jobs immediately
-                            for job_index in current_batch_new_job_indices:
-                                self._handle_processing_failure(
-                                    job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
-                                )
-                            # Mark as "submitted" (to prevent reprocessing) but failed
-                            submitted_new_indices_count += len(current_batch_new_job_indices)
-                            if self.fail_on_submit_error:
-                                raise RuntimeError(error_msg) from e
-
-            # If nothing ended up in the batch (e.g., only submission failures)
-            if not current_batch_job_indices:
-                if self.verbose:
-                    logger.debug("No jobs identified for fetching in this batch iteration.")
-                # If there are no retries pending either, break the loop
-                if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
-                    logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
-                    break
-                continue  # Otherwise, proceed to next iteration
-
-            # --- Initiate Fetching for the Current Batch ---
-            try:
-                if self.verbose:
-                    logger.debug(
-                        f"Calling fetch_job_result_async for "
-                        f"{len(current_batch_job_indices)} jobs in current batch."
-                    )
-                # Use data_only=False to get full response for callback/results
-                batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
+                        submitted_new_indices_count += len(new_job_indices)
+                        if self.fail_on_submit_error:
+                            raise RuntimeError(error_msg) from e
 
-            # Check for discrepancies where client might not return all futures
-            if len(batch_futures_dict) != len(current_batch_job_indices):
-                returned_indices = set(batch_futures_dict.values())
-                missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
+            # 3) Launch fetches for the jobs we added to this cycle
+            if to_fetch:
+                try:
+                    new_futures = self.client.fetch_job_result_async(to_fetch, data_only=False, timeout=None)
+                    inflight_futures.update(new_futures)
+                except Exception as fetch_init_err:
                     logger.error(
-                        f"fetch_job_result_async discrepancy: Expected "
-                        f"{len(current_batch_job_indices)}, got "
-                        f"{len(batch_futures_dict)}. Missing: {missing_indices}"
+                        f"fetch_job_result_async failed to start for {len(to_fetch)} jobs: {fetch_init_err}",
+                        exc_info=True,
                     )
-                    # Fail the missing ones explicitly
-                    for missing_idx in missing_indices:
+                    for job_index in to_fetch:
                         self._handle_processing_failure(
-                            missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
+                            job_index, f"Fetch initiation error: {fetch_init_err}", is_submission_failure=True
                         )
                     if self.fail_on_submit_error:
-                        raise RuntimeError("fetch_job_result_async failed to return all " "expected futures.")
-                    # Continue processing only the futures we received
-                    current_batch_job_indices = list(returned_indices)
-
-            except Exception as fetch_init_err:
-                error_msg = (
-                    f"fetch_job_result_async failed for batch "
-                    f"({len(current_batch_job_indices)} jobs): {fetch_init_err}"
-                )
-                logger.error(error_msg, exc_info=True)
-                logger.warning(
-                    f"Marking all {len(current_batch_job_indices)} jobs in " f"failed fetch initiation batch as failed."
+                        raise RuntimeError(
+                            f"Stopping due to fetch initiation failure: {fetch_init_err}"
+                        ) from fetch_init_err
+
+            # 4) If nothing left anywhere, exit
+            if not inflight_futures and not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
+                logger.debug("Exiting loop: No in-flight jobs, no retries, and all jobs submitted.")
+                break
+
+            # 5) Wait for at least one in-flight future to complete, then process done ones
+            if inflight_futures:
+                done, _ = concurrent.futures.wait(
+                    set(inflight_futures.keys()), return_when=concurrent.futures.FIRST_COMPLETED
                 )
-                # Fail all jobs intended for this batch
-                for job_index in current_batch_job_indices:
-                    self._handle_processing_failure(
-                        job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
-                    )
-                if self.fail_on_submit_error:
-                    raise RuntimeError(
-                        f"Stopping due to fetch initiation failure: {fetch_init_err}"
-                    ) from fetch_init_err
-                continue  # Skip processing results for this failed batch
-
-            # --- Process Results for the Current Batch ---
-            if not batch_futures_dict:
-                if self.verbose:
-                    logger.debug("No futures returned/available for processing in this batch.")
-                continue  # Skip processing if no futures
-
-            batch_timeout = 600.0  # Timeout for waiting on the whole batch
-            try:
-                # Process futures as they complete within this batch
-                for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
-                    job_index = batch_futures_dict[future]
+                for future in done:
+                    job_index = inflight_futures.pop(future, None)
+                    if job_index is None:
+                        continue
                     try:
-                        # Expect list with one tuple: [(data, index, trace)]
                         result_list = future.result()
                         if not isinstance(result_list, list) or len(result_list) != 1:
                             raise ValueError(f"Expected list length 1, got {len(result_list)}")
-
                         result_tuple = result_list[0]
                         if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
                             raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
-
                         full_response_dict, fetched_job_index, trace_id = result_tuple
-
                         if fetched_job_index != job_index:
-                            logger.warning(f"Mismatch: Future for {job_index} returned " f"{fetched_job_index}")
-
+                            logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
                         self._handle_processing_success(job_index, full_response_dict, trace_id)
-
                     except TimeoutError:
-                        # Handle job not ready - check retry policy
+                        # Not ready -> immediate retry
                         self.retry_counts[job_index] += 1
                         if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
                             if self.verbose:
                                 logger.info(
-                                    f"Job {job_index} not ready, adding to next "
-                                    f"batch's retry list (Attempt "
-                                    f"{self.retry_counts[job_index]}/"
-                                    f"{self.max_job_retries or 'inf'})."
+                                    f"Job {job_index} not ready, scheduling retry "
+                                    f"(Attempt {self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
                                 )
-                            # Collect for the *next* batch
-                            self.retry_job_ids.append(job_index)
+                            self._schedule_retry(job_index)
                         else:
-                            error_msg = f"Exceeded max fetch retries " f"({self.max_job_retries}) for job {job_index}."
+                            error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
                             logger.error(error_msg)
                             self._handle_processing_failure(job_index, error_msg)
-
                     except (ValueError, RuntimeError) as e:
                         logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
                         self._handle_processing_failure(job_index, f"Error processing result: {e}")
                     except Exception as e:
                         logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
                         self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
-                    # No finally block incrementing count here; tracking is batch-based
-
-            except TimeoutError:
-                # `as_completed` timed out waiting for remaining futures in batch
-                logger.error(
-                    f"Batch processing timed out after {batch_timeout}s waiting "
-                    f"for futures. Some jobs in batch may be lost or incomplete."
-                )
-                # Identify and fail remaining futures
-                remaining_indices_in_batch = []
-                for f, idx in batch_futures_dict.items():
-                    if not f.done():
-                        remaining_indices_in_batch.append(idx)
-                        f.cancel()  # Attempt to cancel underlying task
-                logger.warning(
-                    f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
-                )
-                for idx in remaining_indices_in_batch:
-                    self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
-            # End of processing for this batch cycle
 
         # --- Final Logging ---
         self._log_final_status(total_jobs)
 
-        return self.results, self.failures
+        return self.results, self.failures, self.traces if self.return_traces else []
 
 
 class NvIngestClient:
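
The rewritten loop is an instance of the standard "keep N futures in flight" pattern: capacity freed by one completed job is immediately reused for a retry or a new submission, instead of waiting for a whole fixed batch to drain. A self-contained sketch of the same idea, independent of this client (all names below are invented):

    import concurrent.futures

    def run_saturated(executor, work_items, do_work, max_inflight=8):
        inflight, pending, results = {}, list(work_items), []
        while pending or inflight:
            # Top up to max_inflight in-flight futures
            while pending and len(inflight) < max_inflight:
                item = pending.pop(0)
                inflight[executor.submit(do_work, item)] = item
            # Block until at least one finishes, then harvest everything done
            done, _ = concurrent.futures.wait(inflight, return_when=concurrent.futures.FIRST_COMPLETED)
            for fut in done:
                inflight.pop(fut)
                results.append(fut.result())
        return results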
@@ -547,11 +667,12 @@ class NvIngestClient:
         message_client_port : int, optional
             Port of the REST/message service. Defaults to 7670.
         message_client_kwargs : dict, optional
-            Extra keyword arguments passed to the client allocator.
+            Extra keyword arguments passed to the client allocator. For RestClient,
+            can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
         msg_counter_id : str, optional
             Identifier for message counting. Defaults to "nv-ingest-message-id".
         worker_pool_size : int, optional
-            Number of workers in the thread pool. Defaults to 1.
+            Number of workers in the thread pool. Defaults to 8.
 
         Returns
         -------
@@ -573,10 +694,19 @@ class NvIngestClient:
             **self._message_client_kwargs,
         )
 
-        # Initialize the worker pool with the specified size
+        # Initialize the worker pool with the specified size (used for both submit and fetch)
         self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)
 
+        # Telemetry state and controls
+        self._telemetry_lock = threading.Lock()
+        self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
+        try:
+            self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
+        except ValueError:
+            self._telemetry_max_calls = 10000
         self._telemetry = {}
+        self._completed_parent_trace_ids: List[str] = []  # 1054
+        self.reset_telemetry()
 
     def __str__(self) -> str:
         """
@@ -624,6 +754,106 @@ class NvIngestClient:
 
         return job_state
 
+    # ------------------------------------------------------------------
+    # Telemetry helpers
+    # ------------------------------------------------------------------
+
+    def enable_telemetry(self, enabled: bool) -> None:
+        with self._telemetry_lock:
+            self._telemetry_enabled = bool(enabled)
+
+    def reset_telemetry(self) -> None:
+        with self._telemetry_lock:
+            self._telemetry = {
+                "started_at": time.time(),
+                "submit": {"count": 0, "calls": []},
+                "fetch": {"count": 0, "last_ts": None, "intervals": [], "calls": []},
+                "per_job": {},
+            }
+
+    def _t_per_job(self, job_index: str) -> Dict[str, Any]:
+        pj = self._telemetry["per_job"].get(job_index)
+        if pj is None:
+            pj = {"submits": [], "fetch_attempts": [], "timeouts_202": 0, "failures": 0, "first_success_ts": None}
+            self._telemetry["per_job"][job_index] = pj
+        return pj
+
+    def _t_append_capped(self, arr: List[Any], item: Any) -> None:
+        if len(arr) < self._telemetry_max_calls:
+            arr.append(item)
+
+    def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["submit"]["count"] += 1
+            self._t_append_capped(
+                self._telemetry["submit"]["calls"],
+                {"job": job_index, "status": status, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["submits"], ts)
+
+    def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["fetch"]["count"] += 1
+            last = self._telemetry["fetch"]["last_ts"]
+            if last is not None:
+                delta = ts - float(last)
+                if delta >= 0:
+                    self._t_append_capped(self._telemetry["fetch"]["intervals"], delta)
+            self._telemetry["fetch"]["last_ts"] = ts
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["fetch_attempts"], ts)
+
+    def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._t_append_capped(
+                self._telemetry["fetch"]["calls"],
+                {"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            if code == 2:  # 202 not ready
+                pj["timeouts_202"] += 1
+            if ok and pj["first_success_ts"] is None:
+                pj["first_success_ts"] = ts
+            if not ok and code not in (0, 2):
+                pj["failures"] += 1
+
+    def get_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            return copy.deepcopy(self._telemetry)
+
+    def summarize_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            submit_count = self._telemetry["submit"]["count"]
+            fetch_count = self._telemetry["fetch"]["count"]
+            intervals = list(self._telemetry["fetch"]["intervals"])
+            intervals.sort()
+            avg = mean(intervals) if intervals else 0.0
+            p50 = median(intervals) if intervals else 0.0
+            # p95 via index
+            p95 = intervals[int(0.95 * (len(intervals) - 1))] if intervals else 0.0
+            per_job = self._telemetry["per_job"]
+            # Aggregate per-job stats
+            jobs = len(per_job)
+            total_timeouts = sum(pj.get("timeouts_202", 0) for pj in per_job.values())
+            total_failures = sum(pj.get("failures", 0) for pj in per_job.values())
+            return {
+                "submit_count": submit_count,
+                "fetch_count": fetch_count,
+                "fetch_interval_avg": avg,
+                "fetch_interval_p50": p50,
+                "fetch_interval_p95": p95,
+                "jobs_tracked": jobs,
+                "timeouts_202_total": total_timeouts,
+                "failures_total": total_failures,
+            }
+
     def _get_and_check_job_state(
         self,
         job_index: str,
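
A usage sketch for the accessors above (`client` is assumed to be a constructed NvIngestClient):

    summary = client.summarize_telemetry()
    print(summary["submit_count"], summary["fetch_count"])
    # fetch_interval_p95 is a nearest-rank percentile: after sorting, the value
    # at index int(0.95 * (len(intervals) - 1)) is reported.
    raw = client.get_telemetry()   # deep copy; safe to mutate
    client.reset_telemetry()       # start a fresh measurement window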
@@ -861,6 +1091,8 @@ class NvIngestClient:
         Exception
             For unexpected issues.
         """
+        ts_attempt = time.time()
+        self._t_record_fetch_attempt(job_index, ts_attempt)
         try:
             # Get job state using the client-side index
             job_state = self._get_and_check_job_state(
@@ -901,6 +1133,7 @@ class NvIngestClient:
                 logger.debug(
                     f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
                 )
+                self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
                 return result_data, job_index, job_state.trace_id
 
             except json.JSONDecodeError as err:
@@ -922,6 +1155,7 @@ class NvIngestClient:
             elif response.response_code == 2:  # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
                 # Raise TimeoutError to signal the calling retry loop in fetch_job_result
                 # Do not change job state here, remains SUBMITTED
+                self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
                 raise TimeoutError(f"Job not ready: {response.response_reason}")
 
             else:
@@ -934,6 +1168,7 @@ class NvIngestClient:
                 job_state.state = JobStateEnum.FAILED  # Mark job as failed in the client
                 # Do NOT pop the state for failed jobs here
                 # Raise RuntimeError to indicate a terminal failure for this fetch attempt
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
                 raise RuntimeError(error_msg)
 
         except (TimeoutError, ValueError, RuntimeError):
@@ -945,12 +1180,17 @@ class NvIngestClient:
             # Attempt to mark state as FAILED if possible and state object exists
             if "job_state" in locals() and hasattr(job_state, "state"):
                 job_state.state = JobStateEnum.FAILED
+            try:
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
+            except Exception:
+                pass
             raise  # Re-raise the original exception
 
     def fetch_job_result_cli(
         self,
         job_ids: Union[str, List[str]],
         data_only: bool = False,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
     ) -> List[Tuple[Any, str, Optional[str]]]:
         """
         Fetch job results via CLI semantics (synchronous list return).
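
For reference, the `code` argument passed to `_t_record_fetch_outcome` at the call sites above maps onto fetch outcomes as follows (a documentation-only dict; the name is invented):

    FETCH_OUTCOME_CODES = {
        0: "success (ok=True)",
        2: "job not ready (HTTP 202 / broker r-2) -> per-job 'timeouts_202'",
        1: "terminal failure for the attempt -> per-job 'failures'",
    }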
@@ -970,7 +1210,8 @@ class NvIngestClient:
         if isinstance(job_ids, str):
             job_ids = [job_ids]
 
-        return [self._fetch_job_result(job_id, data_only=data_only) for job_id in job_ids]
+        eff_timeout: Tuple[int, Optional[float]] = timeout if timeout is not None else (100, None)
+        return [self._fetch_job_result(job_id, timeout=eff_timeout, data_only=data_only) for job_id in job_ids]
 
     def _validate_batch_size(self, batch_size: Optional[int]) -> int:
         """
@@ -1019,14 +1260,21 @@ class NvIngestClient:
         concurrency_limit: int = 64,
         timeout: int = 100,
         max_job_retries: Optional[int] = None,
-        retry_delay: float = 5.0,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
         fail_on_submit_error: bool = False,
         completion_callback: Optional[Callable[[Any, str], None]] = None,
         return_failures: bool = False,
         data_only: bool = True,
         stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
         verbose: bool = False,
-    ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
+        return_traces: bool = False,
+    ) -> Union[
+        List[Any],
+        Tuple[List[Any], List[Tuple[str, str]]],
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
+    ]:
         """
         Submit and fetch multiple jobs concurrently.
 
@@ -1056,8 +1304,13 @@ class NvIngestClient:
             If True, return (results, failures). Default is False.
         data_only : bool, optional
             If True, return only payload 'data'. Default is True.
+        return_full_response : bool, optional
+            If True, results contain the full response envelopes (including 'trace' and 'annotations').
+            Ignored when stream_to_callback_only=True. Default is False.
         verbose : bool, optional
             If True, enable debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
 
         Returns
         -------
@@ -1065,6 +1318,9 @@ class NvIngestClient:
             List of successful job results when `return_failures` is False.
         results, failures : tuple
             Tuple of (successful results, failure tuples) when `return_failures` is True.
+        results, failures, traces : tuple
+            Tuple of (successful results, failure tuples, trace dicts) when both
+            `return_failures` and `return_traces` are True.
 
         Raises
         ------
@@ -1077,13 +1333,18 @@ class NvIngestClient:
 
         # Handle empty input
         if not job_indices:
-            return ([], []) if return_failures else []
+            if return_failures and return_traces:
+                return [], [], []
+            elif return_failures:
+                return [], []
+            else:
+                return []
 
         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)
 
-        # Prepare timeout tuple for fetch calls
-        effective_timeout: Tuple[int, None] = (timeout, None)
+        # Prepare timeout tuple to mirror handler behavior: finite connect, unbounded read (long-poll)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
 
         # Delegate to the concurrent processor
         processor = _ConcurrentProcessor(
@@ -1093,16 +1354,24 @@ class NvIngestClient:
             job_queue_id=job_queue_id,
             timeout=effective_timeout,
             max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
             completion_callback=completion_callback,
             fail_on_submit_error=fail_on_submit_error,
             stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
             verbose=verbose,
+            return_traces=return_traces,
         )
 
-        results, failures = processor.run()
+        results, failures, traces = processor.run()
 
-        if return_failures:
+        if return_failures and return_traces:
+            return results, failures, traces
+        elif return_failures:
             return results, failures
+        elif return_traces:
+            return results, traces
 
         if failures:
             logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
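
Putting the flag combinations together, a usage sketch (the enclosing method's name sits outside this hunk; `process_jobs_concurrently` is assumed, and the queue name is invented):

    results, failures, traces = client.process_jobs_concurrently(
        job_indices,
        job_queue_id="ingest_task_queue",
        return_failures=True,
        return_traces=True,
    )
    # return_failures only -> (results, failures); return_traces only -> (results, traces);
    # neither -> just results, per the surrounding code and docstring.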
@@ -1135,7 +1404,12 @@ class NvIngestClient:
             job_state.trace_id = future.result()[0]  # Trace_id from `submit_job` endpoint submission
             job_state.future = None
 
-    def fetch_job_result_async(self, job_ids: Union[str, List[str]], data_only: bool = True) -> Dict[Future, str]:
+    def fetch_job_result_async(
+        self,
+        job_ids: Union[str, List[str]],
+        data_only: bool = True,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
+    ) -> Dict[Future, str]:
         """
         Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.
 
@@ -1156,7 +1430,7 @@ class NvIngestClient:
         future_to_job_id = {}
         for job_id in job_ids:
             job_state = self._get_and_check_job_state(job_id)
-            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only)
+            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only, timeout)
             job_state.future = future
             future_to_job_id[future] = job_id
 
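Consuming the returned mapping looks like this (the job ids are assumed to be already submitted):

    from concurrent.futures import as_completed

    futures = client.fetch_job_result_async(job_ids, data_only=False, timeout=(100, None))
    for fut in as_completed(futures):
        job_id = futures[fut]
        result_list = fut.result()  # [(full_response, job_index, trace_id)], per the CLI fetch path
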
@@ -1207,12 +1481,19 @@ class NvIngestClient:
             # Free up memory -- payload should never be used again, and we don't want to keep it around.
             job_state.job_spec.payload = None
 
+            try:
+                self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
+            except Exception:
+                pass
             return x_trace_id
         except Exception as err:
             err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
             logger.exception(err_msg)
             job_state.state = JobStateEnum.FAILED
-
+            try:
+                self._t_record_submit(job_index, "fail", time.time(), None)
+            except Exception:
+                pass
             raise
 
     def submit_job(
@@ -1433,7 +1714,9 @@ class NvIngestClient:
 
         return results
 
-    def create_jobs_for_batch(self, files_batch: List[str], tasks: Dict[str, Any]) -> List[str]:
+    def create_jobs_for_batch(
+        self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
+    ) -> List[str]:
         """
         Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
         This function takes a batch of files, processes each file to extract its content and type,
@@ -1449,6 +1732,9 @@ class NvIngestClient:
             A dictionary of tasks to be added to each job. The keys represent task names, and the
             values represent task specifications or configurations. Standard tasks include "split",
             "extract", "store", "caption", "dedup", "filter", "embed".
+        pdf_split_page_count : int, optional
+            Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
+            to the job spec's extended_options for PDF files.
 
         Returns
         -------
@@ -1495,6 +1781,10 @@ class NvIngestClient:
 
         job_specs = create_job_specs_for_batch(files_batch)
 
+        # Apply PDF split config if provided
+        if pdf_split_page_count is not None:
+            apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
+
         job_ids = []
         for job_spec in job_specs:
             logger.debug(f"Tasks: {tasks.keys()}")
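
A usage sketch of the new parameter (file paths invented, task construction elided):

    job_ids = client.create_jobs_for_batch(
        ["report_a.pdf", "report_b.pdf"],
        tasks={"extract": extract_task},  # task objects built elsewhere
        pdf_split_page_count=32,          # each PDF is chunked into 32-page jobs
    )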
@@ -1524,3 +1814,19 @@ class NvIngestClient:
             job_ids.append(job_id)
 
         return job_ids
+
+    def register_parent_trace_id(self, trace_id: Optional[str]) -> None:
+        """Record a parent trace identifier once its aggregation completed."""
+
+        if not trace_id:
+            return
+
+        if trace_id not in self._completed_parent_trace_ids:
+            self._completed_parent_trace_ids.append(trace_id)
+
+    def consume_completed_parent_trace_ids(self) -> List[str]:
+        """Return and clear the set of completed parent trace identifiers."""
+
+        trace_ids = list(self._completed_parent_trace_ids)
+        self._completed_parent_trace_ids.clear()
+        return trace_ids
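
These two methods form a simple record-and-drain pair; a usage sketch:

    # After a batch completes, drain the parent trace ids recorded during processing:
    parent_trace_ids = client.consume_completed_parent_trace_ids()
    # The list is cleared on read, so a second call returns [] until more jobs complete.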