nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +511 -205
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +137 -24
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +28 -4
- nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- nv_ingest_client/primitives/tasks/embed.py +16 -0
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- nv_ingest_client/primitives/tasks/udf.py +24 -27
- nv_ingest_client/util/document_analysis.py +1 -1
- nv_ingest_client/util/util.py +26 -0
- nv_ingest_client/util/vdb/milvus.py +12 -9
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/RECORD +21 -20
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/client.py

@@ -10,6 +10,9 @@ import logging
 import math
 import os
 import time
+import threading
+import copy
+from statistics import mean, median
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
@@ -32,11 +35,59 @@ from nv_ingest_client.primitives.tasks import TaskType
 from nv_ingest_client.primitives.tasks import is_valid_task_type
 from nv_ingest_client.primitives.tasks import task_factory
 from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
-from nv_ingest_client.util.util import …
+from nv_ingest_client.util.util import (
+    create_job_specs_for_batch,
+    check_ingest_result,
+    apply_pdf_split_config_to_job_specs,
+)

 logger = logging.getLogger(__name__)


+def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Compute resident_time entries from entry/exit pairs if not already present.
+
+    This ensures consistency between split jobs (where the server computes resident_time)
+    and non-split jobs (where we compute it client-side).
+
+    Parameters
+    ----------
+    trace_dict : Dict[str, Any]
+        Trace dictionary with entry/exit pairs.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Trace dictionary with resident_time entries added.
+    """
+    if not trace_dict or not isinstance(trace_dict, dict):
+        return trace_dict
+
+    # Check if resident_time already exists (server-computed for split jobs)
+    has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
+    if has_resident:
+        return trace_dict  # Already computed by server
+
+    # Compute resident_time from entry/exit pairs
+    result = dict(trace_dict)
+    stages = set()
+
+    # Find all unique stages
+    for key in trace_dict:
+        if key.startswith("trace::entry::"):
+            stages.add(key.replace("trace::entry::", ""))
+
+    # Compute resident_time for each stage
+    for stage in stages:
+        entry_key = f"trace::entry::{stage}"
+        exit_key = f"trace::exit::{stage}"
+        if entry_key in trace_dict and exit_key in trace_dict:
+            result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
+
+    return result
+
+
 class DataDecodeException(Exception):
     """
     Exception raised for errors in decoding data.
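The helper above only fills in trace::resident_time::<stage> keys that the server has not already computed. A minimal sketch of its behavior on a hypothetical trace payload (the stage name and timestamps are illustrative, not taken from the package):

    # Hypothetical payload following the trace::entry::/trace::exit:: key convention above.
    trace = {
        "trace::entry::pdf_extractor": 1_700_000_000.0,
        "trace::exit::pdf_extractor": 1_700_000_012.5,
    }
    enriched = _compute_resident_times(trace)
    # enriched["trace::resident_time::pdf_extractor"] == 12.5
    # A payload that already carries trace::resident_time:: keys is returned unchanged.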
@@ -57,15 +108,12 @@ class DataDecodeException(Exception):

 class _ConcurrentProcessor:
     """
-    Manages …
-    …
-    they become available within the batch using `as_completed`. Retries due
-    to job readiness timeouts are handled by adding the job index to the next
-    processing batch.
+    Manages asynchronous submission and result fetching while keeping a steady
+    pool of up to `batch_size` in-flight jobs:
+    - Retries (202/TimeoutError) are re-queued immediately.
+    - New jobs are submitted as capacity frees up.
+    - Fetches are started for jobs added each cycle.
+    - We always attempt to keep the executor saturated up to `batch_size`.
     """

     def __init__(
@@ -76,10 +124,14 @@ class _ConcurrentProcessor:
         batch_size: int,
         timeout: Tuple[int, Union[float, None]],
         max_job_retries: Optional[int],
+        retry_delay: float,
+        initial_fetch_delay: float,
         completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
         fail_on_submit_error: bool,
         stream_to_callback_only: bool,
+        return_full_response: bool,
         verbose: bool = False,
+        return_traces: bool = False,
     ):
         """
         Initializes the concurrent processor.
@@ -113,6 +165,8 @@ class _ConcurrentProcessor:
             initiating job submission or fetching fails for a batch.
         verbose : bool, optional
             If True, enables detailed debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level trace data for each completed job is stored.

         Raises
         ------
@@ -128,16 +182,21 @@ class _ConcurrentProcessor:
         self.batch_size = batch_size
         self.timeout = timeout
         self.max_job_retries = max_job_retries
+        self.retry_delay = retry_delay
+        self.initial_fetch_delay = initial_fetch_delay
         self.completion_callback = completion_callback
         self.fail_on_submit_error = fail_on_submit_error
         self.stream_to_callback_only = stream_to_callback_only
+        self.return_full_response = return_full_response
         self.verbose = verbose
+        self.return_traces = return_traces

         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)
+        self.traces: List[Optional[Dict[str, Any]]] = []

         # --- Initial Checks ---
         if not self.job_queue_id:
@@ -225,13 +284,25 @@ class _ConcurrentProcessor:

         is_failed, description = check_ingest_result(result_data)

+        if trace_id:
+            self.client.register_parent_trace_id(trace_id)
+
         if is_failed:
             failed_job_spec = self.client._job_index_to_job_spec.get(job_index)
             self.failures.append((f"{job_index}:{failed_job_spec.source_id}", description))
         elif self.stream_to_callback_only:
             self.results.append(job_index)
         else:
-            …
+            # When requested, return the full response envelope (includes 'trace' and 'annotations')
+            self.results.append(result_data if self.return_full_response else result_data.get("data"))
+
+        # Extract trace data for all successful (non-failed) jobs
+        if self.return_traces and not is_failed:
+            trace_payload = result_data.get("trace") if result_data else None
+            # Compute resident_time if not already present (for consistency)
+            if trace_payload:
+                trace_payload = _compute_resident_times(trace_payload)
+            self.traces.append(trace_payload if trace_payload else None)

         # Cleanup retry count if it exists
         if job_index in self.retry_counts:
@@ -275,21 +346,156 @@ class _ConcurrentProcessor:
         except Exception:
             logger.warning("Could not reliably extract job indices from results for final check.")

-        initial_indices = set(self.all_job_indices_list)
-        unaccounted_indices = initial_indices - processed_indices
-
-        if unaccounted_indices:
-            logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
-            # Optionally add them to failures
-            # for idx in unaccounted_indices:
-            #     if not any(f[0] == idx for f in self.failures):
-            #         self.failures.append((idx, "Job lost or unaccounted for at exit"))
-
     # --------------------------------------------------------------------------
-    # …
+    # Declarative Helper Methods (behavior preserved)
     # --------------------------------------------------------------------------

-    def …
+    def _collect_retry_jobs_for_batch(self) -> List[str]:
+        """
+        Collect retry jobs for this batch, mirroring handler behavior (no pacing filter).
+
+        Returns
+        -------
+        List[str]
+            The list of job indices that should be retried in this batch.
+        """
+        if not self.retry_job_ids:
+            return []
+
+        # Take all retries this cycle and clear the list (handler resets per-iteration)
+        eligible: List[str] = list(self.retry_job_ids)
+        self.retry_job_ids = []
+        if eligible and self.verbose:
+            logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
+        return eligible
+
+    def _schedule_retry(self, job_index: str) -> None:
+        """
+        Schedule an immediate retry for a job (no pacing), mirroring handler behavior.
+        """
+        if job_index not in self.retry_job_ids:
+            self.retry_job_ids.append(job_index)
+
+    def _select_new_jobs_for_batch(
+        self,
+        submitted_new_indices_count: int,
+        total_jobs: int,
+        already_in_batch: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Determine the slice of new jobs to include in the current batch based on
+        remaining capacity and unsubmitted jobs.
+
+        Note: This does NOT change submitted_new_indices_count. The original code
+        increments that counter only after submission is attempted/handled.
+        """
+        if (already_in_batch < self.batch_size) and (submitted_new_indices_count < total_jobs):
+            num_new_to_add = min(self.batch_size - already_in_batch, total_jobs - submitted_new_indices_count)
+            start_idx = submitted_new_indices_count
+            end_idx = submitted_new_indices_count + num_new_to_add
+            new_job_indices = self.all_job_indices_list[start_idx:end_idx]
+
+            if self.verbose:
+                logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")
+
+            return new_job_indices, submitted_new_indices_count
+
+        return [], submitted_new_indices_count
+
+    def _submit_new_jobs_async(
+        self,
+        current_batch_new_job_indices: List[str],
+        current_batch_job_indices: List[str],
+        submitted_new_indices_count: int,
+    ) -> Tuple[List[str], int]:
+        """
+        Initiate asynchronous submission for the new jobs selected for this batch.
+
+        Mirrors the original inline submission block, including error handling and
+        fail_on_submit_error semantics. Returns potentially updated batch indices and
+        submitted count.
+        """
+        if not current_batch_new_job_indices:
+            return current_batch_job_indices, submitted_new_indices_count
+
+        if not self.job_queue_id:
+            error_msg = "Cannot submit new jobs: job_queue_id is not set."
+            logger.error(error_msg)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise ValueError(error_msg)
+            return current_batch_job_indices, submitted_new_indices_count
+
+        try:
+            # Fire-and-forget submission initiation
+            _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
+            # Add successfully initiated jobs to the overall batch list
+            current_batch_job_indices.extend(current_batch_new_job_indices)
+            # Update count of total initiated jobs
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            return current_batch_job_indices, submitted_new_indices_count
+        except Exception as e:
+            error_msg = (
+                f"Batch async submission initiation failed for {len(current_batch_new_job_indices)} new jobs: {e}"
+            )
+            logger.error(error_msg, exc_info=True)
+            # Fail these jobs immediately
+            for job_index in current_batch_new_job_indices:
+                self._handle_processing_failure(
+                    job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                )
+            # Mark as "submitted" (to prevent reprocessing) but failed
+            submitted_new_indices_count += len(current_batch_new_job_indices)
+            if self.fail_on_submit_error:
+                raise RuntimeError(error_msg) from e
+            return current_batch_job_indices, submitted_new_indices_count
+
+    def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
+        """
+        Initiate fetching for the prepared batch and ensure consistency of returned futures.
+
+        Returns
+        -------
+        batch_futures_dict : Dict[Future, str]
+            Mapping of futures to their associated job indices.
+        normalized_job_indices : List[str]
+            The job indices normalized to those actually returned by the client if a discrepancy occurs.
+        """
+        if self.verbose:
+            logger.debug(f"Calling fetch_job_result_async for {len(current_batch_job_indices)} jobs.")
+        batch_futures_dict: Dict[Future, str] = (
+            self.client.fetch_job_result_async(current_batch_job_indices, data_only=False, timeout=None)
+            if current_batch_job_indices
+            else {}
+        )
+
+        # Check for discrepancies where client might not return all futures
+        if current_batch_job_indices and (len(batch_futures_dict) != len(current_batch_job_indices)):
+            returned_indices = set(batch_futures_dict.values())
+            missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
+            logger.error(
+                f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
+                f"{len(batch_futures_dict)}. Missing: {missing_indices}"
+            )
+            # Fail the missing ones explicitly
+            for missing_idx in missing_indices:
+                self._handle_processing_failure(
+                    missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
+                )
+            if self.fail_on_submit_error:
+                raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
+            # Continue processing only the futures we received
+            normalized_job_indices = list(returned_indices)
+        else:
+            normalized_job_indices = list(current_batch_job_indices)
+
+        return batch_futures_dict, normalized_job_indices
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.

@@ -315,210 +521,124 @@ class _ConcurrentProcessor:
             initiation error occurs.
         """
         total_jobs = len(self.all_job_indices_list)
-        # Tracks indices for which submission has been initiated at least once
-        …
-        while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:
-            …
-            else:
-                try:
-                    # Fire-and-forget submission initiation
-                    _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
-                    # Add successfully initiated jobs to the overall batch list
-                    current_batch_job_indices.extend(current_batch_new_job_indices)
-                    # Update count of total initiated jobs
-                    submitted_new_indices_count += len(current_batch_new_job_indices)
-                except Exception as e:
-                    error_msg = (
-                        f"Batch async submission initiation failed for "
-                        f"{len(current_batch_new_job_indices)} new jobs: {e}"
-                    )
-                    …
-                    self._handle_processing_failure(
-                        job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
-                    )
-                    # Mark as "submitted" (to prevent reprocessing) but failed
-                    submitted_new_indices_count += len(current_batch_new_job_indices)
-                    if self.fail_on_submit_error:
-                        raise RuntimeError(error_msg) from e
-
-            # If nothing ended up in the batch (e.g., only submission failures)
-            if not current_batch_job_indices:
-                if self.verbose:
-                    logger.debug("No jobs identified for fetching in this batch iteration.")
-                # If there are no retries pending either, break the loop
-                if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
-                    logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
-                    break
-                continue  # Otherwise, proceed to next iteration
-
-            # --- Initiate Fetching for the Current Batch ---
-            try:
-                if self.verbose:
-                    logger.debug(
-                        f"Calling fetch_job_result_async for "
-                        f"{len(current_batch_job_indices)} jobs in current batch."
-                    )
-                # Use data_only=False to get full response for callback/results
-                batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
-                …
-                logger.error(
-                    f"fetch_job_result_async …
-                    f"{len(batch_futures_dict)}. Missing: {missing_indices}"
-                )
-                for missing_idx in missing_indices:
-                    self._handle_processing_failure(
-                        …
-                    )
-                if self.fail_on_submit_error:
-                    raise RuntimeError(
-                        …
-                    )
-                continue  # Skip processing results for this failed batch
-
-            # --- Process Results for the Current Batch ---
-            if not batch_futures_dict:
-                if self.verbose:
-                    logger.debug("No futures returned/available for processing in this batch.")
-                continue  # Skip processing if no futures
-
-            batch_timeout = 600.0  # Timeout for waiting on the whole batch
-            try:
-                # Process futures as they complete within this batch
-                for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
-                    job_index = batch_futures_dict[future]
+        submitted_new_indices_count = 0  # Tracks indices for which submission has been initiated at least once
+
+        logger.debug(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
+
+        # Keep up to batch_size jobs in-flight at all times
+        inflight_futures: Dict[Future, str] = {}
+
+        while (submitted_new_indices_count < total_jobs) or self.retry_job_ids or inflight_futures:
+            # 1) Top up from retries first
+            capacity = max(0, self.batch_size - len(inflight_futures))
+            to_fetch: List[str] = []
+            if capacity > 0 and self.retry_job_ids:
+                take = min(capacity, len(self.retry_job_ids))
+                retry_now = self.retry_job_ids[:take]
+                self.retry_job_ids = self.retry_job_ids[take:]
+                to_fetch.extend(retry_now)
+                capacity -= len(retry_now)
+
+            # 2) Then add new jobs up to capacity
+            if capacity > 0 and (submitted_new_indices_count < total_jobs):
+                new_count = min(capacity, total_jobs - submitted_new_indices_count)
+                new_job_indices = self.all_job_indices_list[
+                    submitted_new_indices_count : submitted_new_indices_count + new_count
+                ]
+
+                if not self.job_queue_id:
+                    error_msg = "Cannot submit new jobs: job_queue_id is not set."
+                    logger.error(error_msg)
+                    for job_index in new_job_indices:
+                        self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+                    submitted_new_indices_count += len(new_job_indices)
+                    if self.fail_on_submit_error:
+                        raise ValueError(error_msg)
+                else:
+                    try:
+                        _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+                        submitted_new_indices_count += len(new_job_indices)
+                        to_fetch.extend(new_job_indices)
+                    except Exception as e:
+                        error_msg = f"Batch async submission initiation failed for {len(new_job_indices)} new jobs: {e}"
+                        logger.error(error_msg, exc_info=True)
+                        for job_index in new_job_indices:
+                            self._handle_processing_failure(
+                                job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                            )
+                        submitted_new_indices_count += len(new_job_indices)
+                        if self.fail_on_submit_error:
+                            raise RuntimeError(error_msg) from e

+            # 3) Launch fetches for the jobs we added to this cycle
+            if to_fetch:
+                try:
+                    new_futures = self.client.fetch_job_result_async(to_fetch, data_only=False, timeout=None)
+                    inflight_futures.update(new_futures)
+                except Exception as fetch_init_err:
+                    logger.error(
+                        f"fetch_job_result_async failed to start for {len(to_fetch)} jobs: {fetch_init_err}",
+                        exc_info=True,
+                    )
+                    for job_index in to_fetch:
+                        self._handle_processing_failure(
+                            job_index, f"Fetch initiation error: {fetch_init_err}", is_submission_failure=True
+                        )
+                    if self.fail_on_submit_error:
+                        raise RuntimeError(
+                            f"Stopping due to fetch initiation failure: {fetch_init_err}"
+                        ) from fetch_init_err
+
+            # 4) If nothing left anywhere, exit
+            if not inflight_futures and not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
+                logger.debug("Exiting loop: No in-flight jobs, no retries, and all jobs submitted.")
+                break
+
+            # 5) Wait for at least one in-flight future to complete, then process done ones
+            if inflight_futures:
+                done, _ = concurrent.futures.wait(
+                    set(inflight_futures.keys()), return_when=concurrent.futures.FIRST_COMPLETED
+                )
+                for future in done:
+                    job_index = inflight_futures.pop(future, None)
+                    if job_index is None:
+                        continue
                     try:
-                        # Expect list with one tuple: [(data, index, trace)]
                         result_list = future.result()
                         if not isinstance(result_list, list) or len(result_list) != 1:
                             raise ValueError(f"Expected list length 1, got {len(result_list)}")
                         result_tuple = result_list[0]
                         if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
                             raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
                         full_response_dict, fetched_job_index, trace_id = result_tuple
                         if fetched_job_index != job_index:
-                            logger.warning(f"Mismatch: Future for {job_index} returned …")
+                            logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
                         self._handle_processing_success(job_index, full_response_dict, trace_id)
                     except TimeoutError:
-                        # …
+                        # Not ready -> immediate retry
                         self.retry_counts[job_index] += 1
                         if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
                             if self.verbose:
                                 logger.info(
-                                    f"Job {job_index} not ready, …
-                                    f"{self.retry_counts[job_index]}/"
-                                    f"{self.max_job_retries or 'inf'})."
+                                    f"Job {job_index} not ready, scheduling retry "
+                                    f"(Attempt {self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
                                 )
-                            self.retry_job_ids.append(job_index)
+                            self._schedule_retry(job_index)
                         else:
-                            error_msg = f"Exceeded max fetch retries …"
+                            error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
                             logger.error(error_msg)
                             self._handle_processing_failure(job_index, error_msg)
                     except (ValueError, RuntimeError) as e:
                         logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
                         self._handle_processing_failure(job_index, f"Error processing result: {e}")
                     except Exception as e:
                         logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
                         self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
-                    # No finally block incrementing count here; tracking is batch-based
-
-            except TimeoutError:
-                # `as_completed` timed out waiting for remaining futures in batch
-                logger.error(
-                    f"Batch processing timed out after {batch_timeout}s waiting "
-                    f"for futures. Some jobs in batch may be lost or incomplete."
-                )
-                # Identify and fail remaining futures
-                remaining_indices_in_batch = []
-                for f, idx in batch_futures_dict.items():
-                    if not f.done():
-                        remaining_indices_in_batch.append(idx)
-                        f.cancel()  # Attempt to cancel underlying task
-                logger.warning(
-                    f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
-                )
-                for idx in remaining_indices_in_batch:
-                    self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
-            # End of processing for this batch cycle

         # --- Final Logging ---
         self._log_final_status(total_jobs)

-        return self.results, self.failures
+        return self.results, self.failures, self.traces if self.return_traces else []


 class NvIngestClient:
@@ -547,11 +667,12 @@ class NvIngestClient:
         message_client_port : int, optional
             Port of the REST/message service. Defaults to 7670.
         message_client_kwargs : dict, optional
-            Extra keyword arguments passed to the client allocator.
+            Extra keyword arguments passed to the client allocator. For RestClient,
+            can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
         msg_counter_id : str, optional
             Identifier for message counting. Defaults to "nv-ingest-message-id".
         worker_pool_size : int, optional
-            Number of workers in the thread pool. Defaults to …
+            Number of workers in the thread pool. Defaults to 8.

         Returns
         -------
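Given the documented kwarg pass-through, opting into the v2 REST API would look roughly like the sketch below; only parameters named in the docstring above are used, and the port value is the documented default:

    client = NvIngestClient(
        message_client_port=7670,
        message_client_kwargs={"api_version": "v2"},  # RestClient option; defaults to "v1" when omitted
    )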
@@ -573,10 +694,19 @@ class NvIngestClient:
             **self._message_client_kwargs,
         )

-        # Initialize the worker pool with the specified size
+        # Initialize the worker pool with the specified size (used for both submit and fetch)
         self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)

+        # Telemetry state and controls
+        self._telemetry_lock = threading.Lock()
+        self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
+        try:
+            self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
+        except ValueError:
+            self._telemetry_max_calls = 10000
         self._telemetry = {}
+        self._completed_parent_trace_ids: List[str] = []  # 1054
+        self.reset_telemetry()

     def __str__(self) -> str:
         """
@@ -624,6 +754,106 @@ class NvIngestClient:

         return job_state

+    # ------------------------------------------------------------------
+    # Telemetry helpers
+    # ------------------------------------------------------------------
+
+    def enable_telemetry(self, enabled: bool) -> None:
+        with self._telemetry_lock:
+            self._telemetry_enabled = bool(enabled)
+
+    def reset_telemetry(self) -> None:
+        with self._telemetry_lock:
+            self._telemetry = {
+                "started_at": time.time(),
+                "submit": {"count": 0, "calls": []},
+                "fetch": {"count": 0, "last_ts": None, "intervals": [], "calls": []},
+                "per_job": {},
+            }
+
+    def _t_per_job(self, job_index: str) -> Dict[str, Any]:
+        pj = self._telemetry["per_job"].get(job_index)
+        if pj is None:
+            pj = {"submits": [], "fetch_attempts": [], "timeouts_202": 0, "failures": 0, "first_success_ts": None}
+            self._telemetry["per_job"][job_index] = pj
+        return pj
+
+    def _t_append_capped(self, arr: List[Any], item: Any) -> None:
+        if len(arr) < self._telemetry_max_calls:
+            arr.append(item)
+
+    def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["submit"]["count"] += 1
+            self._t_append_capped(
+                self._telemetry["submit"]["calls"],
+                {"job": job_index, "status": status, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["submits"], ts)
+
+    def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._telemetry["fetch"]["count"] += 1
+            last = self._telemetry["fetch"]["last_ts"]
+            if last is not None:
+                delta = ts - float(last)
+                if delta >= 0:
+                    self._t_append_capped(self._telemetry["fetch"]["intervals"], delta)
+            self._telemetry["fetch"]["last_ts"] = ts
+            pj = self._t_per_job(job_index)
+            self._t_append_capped(pj["fetch_attempts"], ts)
+
+    def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
+        if not self._telemetry_enabled:
+            return
+        with self._telemetry_lock:
+            self._t_append_capped(
+                self._telemetry["fetch"]["calls"],
+                {"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id},
+            )
+            pj = self._t_per_job(job_index)
+            if code == 2:  # 202 not ready
+                pj["timeouts_202"] += 1
+            if ok and pj["first_success_ts"] is None:
+                pj["first_success_ts"] = ts
+            if not ok and code not in (0, 2):
+                pj["failures"] += 1
+
+    def get_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            return copy.deepcopy(self._telemetry)
+
+    def summarize_telemetry(self) -> Dict[str, Any]:
+        with self._telemetry_lock:
+            submit_count = self._telemetry["submit"]["count"]
+            fetch_count = self._telemetry["fetch"]["count"]
+            intervals = list(self._telemetry["fetch"]["intervals"])
+            intervals.sort()
+            avg = mean(intervals) if intervals else 0.0
+            p50 = median(intervals) if intervals else 0.0
+            # p95 via index
+            p95 = intervals[int(0.95 * (len(intervals) - 1))] if intervals else 0.0
+            per_job = self._telemetry["per_job"]
+            # Aggregate per-job stats
+            jobs = len(per_job)
+            total_timeouts = sum(pj.get("timeouts_202", 0) for pj in per_job.values())
+            total_failures = sum(pj.get("failures", 0) for pj in per_job.values())
+            return {
+                "submit_count": submit_count,
+                "fetch_count": fetch_count,
+                "fetch_interval_avg": avg,
+                "fetch_interval_p50": p50,
+                "fetch_interval_p95": p95,
+                "jobs_tracked": jobs,
+                "timeouts_202_total": total_timeouts,
+                "failures_total": total_failures,
+            }
+
     def _get_and_check_job_state(
         self,
         job_index: str,
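Taken together, these methods form a small opt-in telemetry API on the client. A hedged usage sketch (the client instance and the job run itself are assumed to exist):

    client.reset_telemetry()       # clear counters before a run
    client.enable_telemetry(True)  # also controlled by the NV_INGEST_CLIENT_TELEMETRY env var

    # ... submit and fetch jobs ...

    summary = client.summarize_telemetry()
    print(summary["fetch_count"], summary["fetch_interval_p95"], summary["timeouts_202_total"])

    raw = client.get_telemetry()   # deep copy; per-call lists are capped at
                                   # NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS entries (default 10000)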
@@ -861,6 +1091,8 @@ class NvIngestClient:
         Exception
             For unexpected issues.
         """
+        ts_attempt = time.time()
+        self._t_record_fetch_attempt(job_index, ts_attempt)
        try:
            # Get job state using the client-side index
            job_state = self._get_and_check_job_state(
@@ -901,6 +1133,7 @@ class NvIngestClient:
                logger.debug(
                    f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
                )
+                self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
                return result_data, job_index, job_state.trace_id

        except json.JSONDecodeError as err:
@@ -922,6 +1155,7 @@ class NvIngestClient:
            elif response.response_code == 2:  # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
                # Raise TimeoutError to signal the calling retry loop in fetch_job_result
                # Do not change job state here, remains SUBMITTED
+                self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
                raise TimeoutError(f"Job not ready: {response.response_reason}")

            else:
@@ -934,6 +1168,7 @@ class NvIngestClient:
                job_state.state = JobStateEnum.FAILED  # Mark job as failed in the client
                # Do NOT pop the state for failed jobs here
                # Raise RuntimeError to indicate a terminal failure for this fetch attempt
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
                raise RuntimeError(error_msg)

        except (TimeoutError, ValueError, RuntimeError):
@@ -945,12 +1180,17 @@ class NvIngestClient:
            # Attempt to mark state as FAILED if possible and state object exists
            if "job_state" in locals() and hasattr(job_state, "state"):
                job_state.state = JobStateEnum.FAILED
+            try:
+                self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
+            except Exception:
+                pass
            raise  # Re-raise the original exception

    def fetch_job_result_cli(
        self,
        job_ids: Union[str, List[str]],
        data_only: bool = False,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
    ) -> List[Tuple[Any, str, Optional[str]]]:
        """
        Fetch job results via CLI semantics (synchronous list return).
@@ -970,7 +1210,8 @@ class NvIngestClient:
        if isinstance(job_ids, str):
            job_ids = [job_ids]

-        …
+        eff_timeout: Tuple[int, Optional[float]] = timeout if timeout is not None else (100, None)
+        return [self._fetch_job_result(job_id, timeout=eff_timeout, data_only=data_only) for job_id in job_ids]

    def _validate_batch_size(self, batch_size: Optional[int]) -> int:
        """
@@ -1019,14 +1260,21 @@ class NvIngestClient:
        concurrency_limit: int = 64,
        timeout: int = 100,
        max_job_retries: Optional[int] = None,
-        retry_delay: float = 5,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
        fail_on_submit_error: bool = False,
        completion_callback: Optional[Callable[[Any, str], None]] = None,
        return_failures: bool = False,
        data_only: bool = True,
        stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
        verbose: bool = False,
-    ) -> …
+        return_traces: bool = False,
+    ) -> Union[
+        List[Any],
+        Tuple[List[Any], List[Tuple[str, str]]],
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
+    ]:
        """
        Submit and fetch multiple jobs concurrently.

@@ -1056,8 +1304,13 @@ class NvIngestClient:
            If True, return (results, failures). Default is False.
        data_only : bool, optional
            If True, return only payload 'data'. Default is True.
+        return_full_response : bool, optional
+            If True, results contain the full response envelopes (including 'trace' and 'annotations').
+            Ignored when stream_to_callback_only=True. Default is False.
        verbose : bool, optional
            If True, enable debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level aggregated trace metrics are extracted and returned. Default is False.

        Returns
        -------
@@ -1065,6 +1318,9 @@ class NvIngestClient:
            List of successful job results when `return_failures` is False.
        results, failures : tuple
            Tuple of (successful results, failure tuples) when `return_failures` is True.
+        results, failures, traces : tuple
+            Tuple of (successful results, failure tuples, trace dicts) when both
+            `return_failures` and `return_traces` are True.

        Raises
        ------
@@ -1077,13 +1333,18 @@ class NvIngestClient:

        # Handle empty input
        if not job_indices:
-            …
+            if return_failures and return_traces:
+                return [], [], []
+            elif return_failures:
+                return [], []
+            else:
+                return []

        # Validate and set batch_size
        validated_batch_size = self._validate_batch_size(batch_size)

-        # Prepare timeout tuple
-        effective_timeout: Tuple[int, …
+        # Prepare timeout tuple to mirror handler behavior: finite connect, unbounded read (long-poll)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)

        # Delegate to the concurrent processor
        processor = _ConcurrentProcessor(
@@ -1093,16 +1354,24 @@ class NvIngestClient:
            job_queue_id=job_queue_id,
            timeout=effective_timeout,
            max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
            completion_callback=completion_callback,
            fail_on_submit_error=fail_on_submit_error,
            stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
            verbose=verbose,
+            return_traces=return_traces,
        )

-        results, failures = processor.run()
+        results, failures, traces = processor.run()

-        if return_failures:
+        if return_failures and return_traces:
+            return results, failures, traces
+        elif return_failures:
            return results, failures
+        elif return_traces:
+            return results, traces

        if failures:
            logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
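The return shape now depends on the flag combination. A sketch of the three-tuple form (the job indices and queue name are placeholders):

    results, failures, traces = client.process_jobs_concurrently(
        job_indices,                       # client-side job indices created earlier (placeholder)
        job_queue_id="ingest_task_queue",  # placeholder queue name
        return_failures=True,
        return_traces=True,
    )
    for trace in traces:  # one entry per successful job; None when no trace payload was returned
        if trace:
            resident = {k: v for k, v in trace.items() if k.startswith("trace::resident_time::")}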
@@ -1135,7 +1404,12 @@ class NvIngestClient:
        job_state.trace_id = future.result()[0]  # Trace_id from `submit_job` endpoint submission
        job_state.future = None

-    def fetch_job_result_async(…
+    def fetch_job_result_async(
+        self,
+        job_ids: Union[str, List[str]],
+        data_only: bool = True,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
+    ) -> Dict[Future, str]:
        """
        Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.

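The new timeout is the same (connect, read) tuple used elsewhere; a None read component long-polls, as in the handler-mirroring comment above. A sketch that also shows the per-future result shape the processor relies on:

    futures = client.fetch_job_result_async(job_ids, data_only=False, timeout=(100, None))
    for future, job_index in futures.items():
        result_list = future.result()  # [(full_response_dict, job_index, trace_id)], length 1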
@@ -1156,7 +1430,7 @@ class NvIngestClient:
        future_to_job_id = {}
        for job_id in job_ids:
            job_state = self._get_and_check_job_state(job_id)
-            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only)
+            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only, timeout)
            job_state.future = future
            future_to_job_id[future] = job_id

@@ -1207,12 +1481,19 @@ class NvIngestClient:
            # Free up memory -- payload should never be used again, and we don't want to keep it around.
            job_state.job_spec.payload = None

+            try:
+                self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
+            except Exception:
+                pass
            return x_trace_id
        except Exception as err:
            err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
            logger.exception(err_msg)
            job_state.state = JobStateEnum.FAILED
-            …
+            try:
+                self._t_record_submit(job_index, "fail", time.time(), None)
+            except Exception:
+                pass
            raise

    def submit_job(
@@ -1433,7 +1714,9 @@ class NvIngestClient:

        return results

-    def create_jobs_for_batch(…
+    def create_jobs_for_batch(
+        self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
+    ) -> List[str]:
        """
        Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
        This function takes a batch of files, processes each file to extract its content and type,
@@ -1449,6 +1732,9 @@ class NvIngestClient:
            A dictionary of tasks to be added to each job. The keys represent task names, and the
            values represent task specifications or configurations. Standard tasks include "split",
            "extract", "store", "caption", "dedup", "filter", "embed".
+        pdf_split_page_count : int, optional
+            Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
+            to the job spec's extended_options for PDF files.

        Returns
        -------
@@ -1495,6 +1781,10 @@ class NvIngestClient:

        job_specs = create_job_specs_for_batch(files_batch)

+        # Apply PDF split config if provided
+        if pdf_split_page_count is not None:
+            apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
+
        job_ids = []
        for job_spec in job_specs:
            logger.debug(f"Tasks: {tasks.keys()}")
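A sketch of batch creation with the new splitting option (file paths and the task dict are placeholders):

    job_ids = client.create_jobs_for_batch(
        files_batch=["a.pdf", "b.pdf"],   # placeholder paths
        tasks={"extract": extract_task},  # placeholder task configuration
        pdf_split_page_count=32,          # 1-128 pages per chunk, stored in extended_options
    )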
@@ -1524,3 +1814,19 @@ class NvIngestClient:
            job_ids.append(job_id)

        return job_ids
+
+    def register_parent_trace_id(self, trace_id: Optional[str]) -> None:
+        """Record a parent trace identifier once its aggregation completed."""
+
+        if not trace_id:
+            return
+
+        if trace_id not in self._completed_parent_trace_ids:
+            self._completed_parent_trace_ids.append(trace_id)
+
+    def consume_completed_parent_trace_ids(self) -> List[str]:
+        """Return and clear the set of completed parent trace identifiers."""
+
+        trace_ids = list(self._completed_parent_trace_ids)
+        self._completed_parent_trace_ids.clear()
+        return trace_ids