nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
|
@@ -8,7 +8,11 @@ import concurrent
|
|
|
8
8
|
import json
|
|
9
9
|
import logging
|
|
10
10
|
import math
|
|
11
|
+
import os
|
|
11
12
|
import time
|
|
13
|
+
import threading
|
|
14
|
+
import copy
|
|
15
|
+
from statistics import mean, median
|
|
12
16
|
from collections import defaultdict
|
|
13
17
|
from concurrent.futures import Future
|
|
14
18
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -31,11 +35,59 @@ from nv_ingest_client.primitives.tasks import TaskType
|
|
|
31
35
|
from nv_ingest_client.primitives.tasks import is_valid_task_type
|
|
32
36
|
from nv_ingest_client.primitives.tasks import task_factory
|
|
33
37
|
from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
|
|
34
|
-
from nv_ingest_client.util.util import
|
|
38
|
+
from nv_ingest_client.util.util import (
|
|
39
|
+
create_job_specs_for_batch,
|
|
40
|
+
check_ingest_result,
|
|
41
|
+
apply_pdf_split_config_to_job_specs,
|
|
42
|
+
)
|
|
35
43
|
|
|
36
44
|
logger = logging.getLogger(__name__)
|
|
37
45
|
|
|
38
46
|
|
|
47
|
+
def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compute resident_time entries from entry/exit pairs if not already present.

    This ensures consistency between split jobs (where server computes resident_time)
    and non-split jobs (where we compute it client-side).

    Parameters
    ----------
    trace_dict : Dict[str, Any]
        Trace dictionary with ``trace::entry::<stage>`` / ``trace::exit::<stage>`` pairs.

    Returns
    -------
    Dict[str, Any]
        The input object itself when it is empty, not a dict, or already contains a
        ``trace::resident_time::`` key. Otherwise a shallow copy with one
        ``trace::resident_time::<stage>`` entry (exit minus entry) for every stage
        that has both an entry and an exit value that can be subtracted.
    """
    if not trace_dict or not isinstance(trace_dict, dict):
        return trace_dict

    entry_prefix = "trace::entry::"
    exit_prefix = "trace::exit::"
    resident_prefix = "trace::resident_time::"

    # Only string keys can be trace markers; skipping the others avoids an
    # AttributeError from calling .startswith on e.g. an int key.
    str_keys = [key for key in trace_dict if isinstance(key, str)]

    # Check if resident_time already exists (server-computed for split jobs)
    if any(key.startswith(resident_prefix) for key in str_keys):
        return trace_dict  # Already computed by server

    result = dict(trace_dict)

    # Walk the keys in their original order so the added entries are deterministic.
    for entry_key in str_keys:
        if not entry_key.startswith(entry_prefix):
            continue

        # Strip only the leading prefix; str.replace would also remove any later
        # occurrence of the prefix inside the stage name.
        stage = entry_key[len(entry_prefix) :]
        exit_key = exit_prefix + stage
        if exit_key not in trace_dict:
            continue

        try:
            result[resident_prefix + stage] = trace_dict[exit_key] - trace_dict[entry_key]
        except TypeError:
            # A missing (None) or non-numeric timestamp: leave this stage without
            # a resident_time rather than failing the whole trace.
            continue

    return result
|
|
89
|
+
|
|
90
|
+
|
|
39
91
|
class DataDecodeException(Exception):
|
|
40
92
|
"""
|
|
41
93
|
Exception raised for errors in decoding data.
|
|
@@ -56,15 +108,12 @@ class DataDecodeException(Exception):
|
|
|
56
108
|
|
|
57
109
|
class _ConcurrentProcessor:
|
|
58
110
|
"""
|
|
59
|
-
Manages
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
they become available within the batch using `as_completed`. Retries due
|
|
66
|
-
to job readiness timeouts are handled by adding the job index to the next
|
|
67
|
-
processing batch.
|
|
111
|
+
Manages asynchronous submission and result fetching while keeping a steady
|
|
112
|
+
pool of up to `batch_size` in-flight jobs:
|
|
113
|
+
- Retries (202/TimeoutError) are re-queued immediately.
|
|
114
|
+
- New jobs are submitted as capacity frees up.
|
|
115
|
+
- Fetches are started for jobs added each cycle.
|
|
116
|
+
- We always attempt to keep the executor saturated up to `batch_size`.
|
|
68
117
|
"""
|
|
69
118
|
|
|
70
119
|
def __init__(
|
|
@@ -75,10 +124,14 @@ class _ConcurrentProcessor:
|
|
|
75
124
|
batch_size: int,
|
|
76
125
|
timeout: Tuple[int, Union[float, None]],
|
|
77
126
|
max_job_retries: Optional[int],
|
|
127
|
+
retry_delay: float,
|
|
128
|
+
initial_fetch_delay: float,
|
|
78
129
|
completion_callback: Optional[Callable[[Dict[str, Any], str], None]],
|
|
79
130
|
fail_on_submit_error: bool,
|
|
80
131
|
stream_to_callback_only: bool,
|
|
132
|
+
return_full_response: bool,
|
|
81
133
|
verbose: bool = False,
|
|
134
|
+
return_traces: bool = False,
|
|
82
135
|
):
|
|
83
136
|
"""
|
|
84
137
|
Initializes the concurrent processor.
|
|
@@ -112,6 +165,8 @@ class _ConcurrentProcessor:
|
|
|
112
165
|
initiating job submission or fetching fails for a batch.
|
|
113
166
|
verbose : bool, optional
|
|
114
167
|
If True, enables detailed debug logging. Default is False.
|
|
168
|
+
return_traces : bool, optional
|
|
169
|
+
If True, parent-level trace data for each completed job is stored.
|
|
115
170
|
|
|
116
171
|
Raises
|
|
117
172
|
------
|
|
@@ -127,16 +182,21 @@ class _ConcurrentProcessor:
|
|
|
127
182
|
self.batch_size = batch_size
|
|
128
183
|
self.timeout = timeout
|
|
129
184
|
self.max_job_retries = max_job_retries
|
|
185
|
+
self.retry_delay = retry_delay
|
|
186
|
+
self.initial_fetch_delay = initial_fetch_delay
|
|
130
187
|
self.completion_callback = completion_callback
|
|
131
188
|
self.fail_on_submit_error = fail_on_submit_error
|
|
132
189
|
self.stream_to_callback_only = stream_to_callback_only
|
|
190
|
+
self.return_full_response = return_full_response
|
|
133
191
|
self.verbose = verbose
|
|
192
|
+
self.return_traces = return_traces
|
|
134
193
|
|
|
135
194
|
# State variables managed across batch cycles
|
|
136
195
|
self.retry_job_ids: List[str] = []
|
|
137
196
|
self.retry_counts: Dict[str, int] = defaultdict(int)
|
|
138
197
|
self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
|
|
139
198
|
self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
|
|
199
|
+
self.traces: List[Optional[Dict[str, Any]]] = []
|
|
140
200
|
|
|
141
201
|
# --- Initial Checks ---
|
|
142
202
|
if not self.job_queue_id:
|
|
@@ -224,13 +284,25 @@ class _ConcurrentProcessor:
|
|
|
224
284
|
|
|
225
285
|
is_failed, description = check_ingest_result(result_data)
|
|
226
286
|
|
|
287
|
+
if trace_id:
|
|
288
|
+
self.client.register_parent_trace_id(trace_id)
|
|
289
|
+
|
|
227
290
|
if is_failed:
|
|
228
291
|
failed_job_spec = self.client._job_index_to_job_spec.get(job_index)
|
|
229
292
|
self.failures.append((f"{job_index}:{failed_job_spec.source_id}", description))
|
|
230
293
|
elif self.stream_to_callback_only:
|
|
231
294
|
self.results.append(job_index)
|
|
232
295
|
else:
|
|
233
|
-
|
|
296
|
+
# When requested, return the full response envelope (includes 'trace' and 'annotations')
|
|
297
|
+
self.results.append(result_data if self.return_full_response else result_data.get("data"))
|
|
298
|
+
|
|
299
|
+
# Extract trace data for all successful (non-failed) jobs
|
|
300
|
+
if self.return_traces and not is_failed:
|
|
301
|
+
trace_payload = result_data.get("trace") if result_data else None
|
|
302
|
+
# Compute resident_time if not already present (for consistency)
|
|
303
|
+
if trace_payload:
|
|
304
|
+
trace_payload = _compute_resident_times(trace_payload)
|
|
305
|
+
self.traces.append(trace_payload if trace_payload else None)
|
|
234
306
|
|
|
235
307
|
# Cleanup retry count if it exists
|
|
236
308
|
if job_index in self.retry_counts:
|
|
@@ -274,21 +346,156 @@ class _ConcurrentProcessor:
|
|
|
274
346
|
except Exception:
|
|
275
347
|
logger.warning("Could not reliably extract job indices from results for final check.")
|
|
276
348
|
|
|
277
|
-
initial_indices = set(self.all_job_indices_list)
|
|
278
|
-
unaccounted_indices = initial_indices - processed_indices
|
|
279
|
-
|
|
280
|
-
if unaccounted_indices:
|
|
281
|
-
logger.warning(f"Potentially unaccounted for jobs: {unaccounted_indices}")
|
|
282
|
-
# Optionally add them to failures
|
|
283
|
-
# for idx in unaccounted_indices:
|
|
284
|
-
# if not any(f[0] == idx for f in self.failures):
|
|
285
|
-
# self.failures.append((idx, "Job lost or unaccounted for at exit"))
|
|
286
|
-
|
|
287
349
|
# --------------------------------------------------------------------------
|
|
288
|
-
#
|
|
350
|
+
# Declarative Helper Methods (behavior preserved)
|
|
289
351
|
# --------------------------------------------------------------------------
|
|
290
352
|
|
|
291
|
-
def
|
|
353
|
+
def _collect_retry_jobs_for_batch(self) -> List[str]:
    """
    Drain the pending-retry list for the current batch (no pacing filter).

    Returns
    -------
    List[str]
        A copy of every job index that was queued for retry; empty when nothing
        is pending. ``self.retry_job_ids`` is reset to a new empty list whenever
        a non-empty list is returned.
    """
    pending = self.retry_job_ids
    if not pending:
        return []

    # Hand out a copy and start a fresh list so later retries accumulate separately.
    drained: List[str] = list(pending)
    self.retry_job_ids = []

    if self.verbose and drained:
        logger.debug(f"Adding {len(drained)} retry jobs to current batch.")

    return drained
|
|
371
|
+
|
|
372
|
+
def _schedule_retry(self, job_index: str) -> None:
    """
    Queue ``job_index`` for an immediate retry (no pacing).

    The index is appended to ``self.retry_job_ids`` only if it is not already
    queued, so a job is never listed twice.
    """
    queued = self.retry_job_ids
    if job_index in queued:
        return
    queued.append(job_index)
|
|
378
|
+
|
|
379
|
+
def _select_new_jobs_for_batch(
    self,
    submitted_new_indices_count: int,
    total_jobs: int,
    already_in_batch: int,
) -> Tuple[List[str], int]:
    """
    Pick the next slice of not-yet-submitted jobs that fits in the current batch.

    Parameters
    ----------
    submitted_new_indices_count : int
        Offset into ``self.all_job_indices_list`` of the first unsubmitted job.
    total_jobs : int
        Total number of jobs to process.
    already_in_batch : int
        Number of jobs already placed in the current batch.

    Returns
    -------
    Tuple[List[str], int]
        The selected job indices (possibly empty) and ``submitted_new_indices_count``
        exactly as passed in; this method never advances the counter.
    """
    free_slots = self.batch_size - already_in_batch
    unsubmitted = total_jobs - submitted_new_indices_count

    # Nothing to add when the batch is full or every job has been submitted.
    if free_slots <= 0 or unsubmitted <= 0:
        return [], submitted_new_indices_count

    window_end = submitted_new_indices_count + min(free_slots, unsubmitted)
    new_job_indices = self.all_job_indices_list[submitted_new_indices_count:window_end]

    if self.verbose:
        logger.debug(f"Adding {len(new_job_indices)} new jobs to current batch.")

    return new_job_indices, submitted_new_indices_count
|
|
404
|
+
|
|
405
|
+
def _submit_new_jobs_async(
    self,
    current_batch_new_job_indices: List[str],
    current_batch_job_indices: List[str],
    submitted_new_indices_count: int,
) -> Tuple[List[str], int]:
    """
    Start asynchronous submission of the new jobs chosen for this batch.

    Parameters
    ----------
    current_batch_new_job_indices : List[str]
        Job indices to submit now.
    current_batch_job_indices : List[str]
        Batch list; extended in place with the new indices when submission
        initiation succeeds.
    submitted_new_indices_count : int
        Running count of jobs for which submission has been attempted.

    Returns
    -------
    Tuple[List[str], int]
        The (possibly extended) batch list and the updated count. The count is
        advanced by the number of new jobs on success and on failure alike, so
        failed jobs are not selected again.

    Raises
    ------
    ValueError
        If ``job_queue_id`` is unset and ``fail_on_submit_error`` is True.
    RuntimeError
        If submission initiation raises and ``fail_on_submit_error`` is True.
    """
    new_ids = current_batch_new_job_indices
    if not new_ids:
        return current_batch_job_indices, submitted_new_indices_count

    if not self.job_queue_id:
        error_msg = "Cannot submit new jobs: job_queue_id is not set."
        logger.error(error_msg)
        # Without a queue nothing can be sent: record each job as a submission failure.
        for failed_index in new_ids:
            self._handle_processing_failure(failed_index, error_msg, is_submission_failure=True)
        # Count them as handled so they are not picked up again.
        submitted_new_indices_count += len(new_ids)
        if self.fail_on_submit_error:
            raise ValueError(error_msg)
        return current_batch_job_indices, submitted_new_indices_count

    try:
        # Fire-and-forget: the return value of the submit call is not used.
        self.client.submit_job_async(new_ids, self.job_queue_id)
        current_batch_job_indices.extend(new_ids)
        submitted_new_indices_count += len(new_ids)
    except Exception as e:
        error_msg = f"Batch async submission initiation failed for {len(new_ids)} new jobs: {e}"
        logger.error(error_msg, exc_info=True)
        for failed_index in new_ids:
            self._handle_processing_failure(
                failed_index, f"Batch submission initiation error: {e}", is_submission_failure=True
            )
        # Count them as handled so they are not picked up again.
        submitted_new_indices_count += len(new_ids)
        if self.fail_on_submit_error:
            raise RuntimeError(error_msg) from e

    return current_batch_job_indices, submitted_new_indices_count
|
|
456
|
+
|
|
457
|
+
def _initiate_fetch_for_batch(self, current_batch_job_indices: List[str]) -> Tuple[Dict[Future, str], List[str]]:
    """
    Initiate fetching for the prepared batch and ensure consistency of returned futures.

    Parameters
    ----------
    current_batch_job_indices : List[str]
        Job indices to fetch. When empty, the client is not called.

    Returns
    -------
    batch_futures_dict : Dict[Future, str]
        Mapping of futures to their associated job indices.
    normalized_job_indices : List[str]
        A copy of the requested indices when the client returned one future per
        job. On a count mismatch, the distinct indices the client actually
        returned, in the order they appear in ``batch_futures_dict``.

    Raises
    ------
    RuntimeError
        If the number of futures differs from the number of requested jobs and
        ``fail_on_submit_error`` is True.
    """
    if self.verbose:
        logger.debug(f"Calling fetch_job_result_async for {len(current_batch_job_indices)} jobs.")

    batch_futures_dict: Dict[Future, str] = (
        self.client.fetch_job_result_async(current_batch_job_indices, data_only=False, timeout=None)
        if current_batch_job_indices
        else {}
    )

    # Check for discrepancies where client might not return all futures
    if current_batch_job_indices and (len(batch_futures_dict) != len(current_batch_job_indices)):
        returned_indices = set(batch_futures_dict.values())
        missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
        logger.error(
            f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
            f"{len(batch_futures_dict)}. Missing: {missing_indices}"
        )
        # Fail the missing ones explicitly
        for missing_idx in missing_indices:
            self._handle_processing_failure(
                missing_idx, "Future not returned by fetch_job_result_async", is_submission_failure=True
            )
        if self.fail_on_submit_error:
            raise RuntimeError("fetch_job_result_async failed to return all expected futures.")
        # Continue processing only the futures we received. dict.fromkeys removes
        # duplicates while keeping first-seen order, so the result is deterministic
        # (building the list from a set would give an arbitrary order).
        normalized_job_indices = list(dict.fromkeys(batch_futures_dict.values()))
    else:
        normalized_job_indices = list(current_batch_job_indices)

    return batch_futures_dict, normalized_job_indices
|
|
497
|
+
|
|
498
|
+
def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
|
|
292
499
|
"""
|
|
293
500
|
Executes the main processing loop in batches.
|
|
294
501
|
|
|
@@ -314,210 +521,124 @@ class _ConcurrentProcessor:
|
|
|
314
521
|
initiation error occurs.
|
|
315
522
|
"""
|
|
316
523
|
total_jobs = len(self.all_job_indices_list)
|
|
317
|
-
# Tracks indices for which submission has been initiated at least once
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
while (submitted_new_indices_count < total_jobs) or self.retry_job_ids:
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
else:
|
|
362
|
-
try:
|
|
363
|
-
# Fire-and-forget submission initiation
|
|
364
|
-
_ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
|
|
365
|
-
# Add successfully initiated jobs to the overall batch list
|
|
366
|
-
current_batch_job_indices.extend(current_batch_new_job_indices)
|
|
367
|
-
# Update count of total initiated jobs
|
|
368
|
-
submitted_new_indices_count += len(current_batch_new_job_indices)
|
|
369
|
-
except Exception as e:
|
|
370
|
-
error_msg = (
|
|
371
|
-
f"Batch async submission initiation failed for "
|
|
372
|
-
f"{len(current_batch_new_job_indices)} new jobs: {e}"
|
|
524
|
+
submitted_new_indices_count = 0 # Tracks indices for which submission has been initiated at least once
|
|
525
|
+
|
|
526
|
+
logger.debug(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
|
|
527
|
+
|
|
528
|
+
# Keep up to batch_size jobs in-flight at all times
|
|
529
|
+
inflight_futures: Dict[Future, str] = {}
|
|
530
|
+
|
|
531
|
+
while (submitted_new_indices_count < total_jobs) or self.retry_job_ids or inflight_futures:
|
|
532
|
+
# 1) Top up from retries first
|
|
533
|
+
capacity = max(0, self.batch_size - len(inflight_futures))
|
|
534
|
+
to_fetch: List[str] = []
|
|
535
|
+
if capacity > 0 and self.retry_job_ids:
|
|
536
|
+
take = min(capacity, len(self.retry_job_ids))
|
|
537
|
+
retry_now = self.retry_job_ids[:take]
|
|
538
|
+
self.retry_job_ids = self.retry_job_ids[take:]
|
|
539
|
+
to_fetch.extend(retry_now)
|
|
540
|
+
capacity -= len(retry_now)
|
|
541
|
+
|
|
542
|
+
# 2) Then add new jobs up to capacity
|
|
543
|
+
if capacity > 0 and (submitted_new_indices_count < total_jobs):
|
|
544
|
+
new_count = min(capacity, total_jobs - submitted_new_indices_count)
|
|
545
|
+
new_job_indices = self.all_job_indices_list[
|
|
546
|
+
submitted_new_indices_count : submitted_new_indices_count + new_count
|
|
547
|
+
]
|
|
548
|
+
|
|
549
|
+
if not self.job_queue_id:
|
|
550
|
+
error_msg = "Cannot submit new jobs: job_queue_id is not set."
|
|
551
|
+
logger.error(error_msg)
|
|
552
|
+
for job_index in new_job_indices:
|
|
553
|
+
self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
|
|
554
|
+
submitted_new_indices_count += len(new_job_indices)
|
|
555
|
+
if self.fail_on_submit_error:
|
|
556
|
+
raise ValueError(error_msg)
|
|
557
|
+
else:
|
|
558
|
+
try:
|
|
559
|
+
_ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
|
|
560
|
+
submitted_new_indices_count += len(new_job_indices)
|
|
561
|
+
to_fetch.extend(new_job_indices)
|
|
562
|
+
except Exception as e:
|
|
563
|
+
error_msg = f"Batch async submission initiation failed for {len(new_job_indices)} new jobs: {e}"
|
|
564
|
+
logger.error(error_msg, exc_info=True)
|
|
565
|
+
for job_index in new_job_indices:
|
|
566
|
+
self._handle_processing_failure(
|
|
567
|
+
job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
|
|
373
568
|
)
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
self._handle_processing_failure(
|
|
378
|
-
job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
|
|
379
|
-
)
|
|
380
|
-
# Mark as "submitted" (to prevent reprocessing) but failed
|
|
381
|
-
submitted_new_indices_count += len(current_batch_new_job_indices)
|
|
382
|
-
if self.fail_on_submit_error:
|
|
383
|
-
raise RuntimeError(error_msg) from e
|
|
384
|
-
|
|
385
|
-
# If nothing ended up in the batch (e.g., only submission failures)
|
|
386
|
-
if not current_batch_job_indices:
|
|
387
|
-
if self.verbose:
|
|
388
|
-
logger.debug("No jobs identified for fetching in this batch iteration.")
|
|
389
|
-
# If there are no retries pending either, break the loop
|
|
390
|
-
if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
|
|
391
|
-
logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
|
|
392
|
-
break
|
|
393
|
-
continue # Otherwise, proceed to next iteration
|
|
394
|
-
|
|
395
|
-
# --- Initiate Fetching for the Current Batch ---
|
|
396
|
-
try:
|
|
397
|
-
if self.verbose:
|
|
398
|
-
logger.debug(
|
|
399
|
-
f"Calling fetch_job_result_async for "
|
|
400
|
-
f"{len(current_batch_job_indices)} jobs in current batch."
|
|
401
|
-
)
|
|
402
|
-
# Use data_only=False to get full response for callback/results
|
|
403
|
-
batch_futures_dict = self.client.fetch_job_result_async(current_batch_job_indices, data_only=False)
|
|
569
|
+
submitted_new_indices_count += len(new_job_indices)
|
|
570
|
+
if self.fail_on_submit_error:
|
|
571
|
+
raise RuntimeError(error_msg) from e
|
|
404
572
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
573
|
+
# 3) Launch fetches for the jobs we added to this cycle
|
|
574
|
+
if to_fetch:
|
|
575
|
+
try:
|
|
576
|
+
new_futures = self.client.fetch_job_result_async(to_fetch, data_only=False, timeout=None)
|
|
577
|
+
inflight_futures.update(new_futures)
|
|
578
|
+
except Exception as fetch_init_err:
|
|
409
579
|
logger.error(
|
|
410
|
-
f"fetch_job_result_async
|
|
411
|
-
|
|
412
|
-
f"{len(batch_futures_dict)}. Missing: {missing_indices}"
|
|
580
|
+
f"fetch_job_result_async failed to start for {len(to_fetch)} jobs: {fetch_init_err}",
|
|
581
|
+
exc_info=True,
|
|
413
582
|
)
|
|
414
|
-
|
|
415
|
-
for missing_idx in missing_indices:
|
|
583
|
+
for job_index in to_fetch:
|
|
416
584
|
self._handle_processing_failure(
|
|
417
|
-
|
|
585
|
+
job_index, f"Fetch initiation error: {fetch_init_err}", is_submission_failure=True
|
|
418
586
|
)
|
|
419
587
|
if self.fail_on_submit_error:
|
|
420
|
-
raise RuntimeError(
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
588
|
+
raise RuntimeError(
|
|
589
|
+
f"Stopping due to fetch initiation failure: {fetch_init_err}"
|
|
590
|
+
) from fetch_init_err
|
|
591
|
+
|
|
592
|
+
# 4) If nothing left anywhere, exit
|
|
593
|
+
if not inflight_futures and not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
|
|
594
|
+
logger.debug("Exiting loop: No in-flight jobs, no retries, and all jobs submitted.")
|
|
595
|
+
break
|
|
596
|
+
|
|
597
|
+
# 5) Wait for at least one in-flight future to complete, then process done ones
|
|
598
|
+
if inflight_futures:
|
|
599
|
+
done, _ = concurrent.futures.wait(
|
|
600
|
+
set(inflight_futures.keys()), return_when=concurrent.futures.FIRST_COMPLETED
|
|
428
601
|
)
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
# Fail all jobs intended for this batch
|
|
434
|
-
for job_index in current_batch_job_indices:
|
|
435
|
-
self._handle_processing_failure(
|
|
436
|
-
job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
|
|
437
|
-
)
|
|
438
|
-
if self.fail_on_submit_error:
|
|
439
|
-
raise RuntimeError(
|
|
440
|
-
f"Stopping due to fetch initiation failure: {fetch_init_err}"
|
|
441
|
-
) from fetch_init_err
|
|
442
|
-
continue # Skip processing results for this failed batch
|
|
443
|
-
|
|
444
|
-
# --- Process Results for the Current Batch ---
|
|
445
|
-
if not batch_futures_dict:
|
|
446
|
-
if self.verbose:
|
|
447
|
-
logger.debug("No futures returned/available for processing in this batch.")
|
|
448
|
-
continue # Skip processing if no futures
|
|
449
|
-
|
|
450
|
-
batch_timeout = 600.0 # Timeout for waiting on the whole batch
|
|
451
|
-
try:
|
|
452
|
-
# Process futures as they complete within this batch
|
|
453
|
-
for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
|
|
454
|
-
job_index = batch_futures_dict[future]
|
|
602
|
+
for future in done:
|
|
603
|
+
job_index = inflight_futures.pop(future, None)
|
|
604
|
+
if job_index is None:
|
|
605
|
+
continue
|
|
455
606
|
try:
|
|
456
|
-
# Expect list with one tuple: [(data, index, trace)]
|
|
457
607
|
result_list = future.result()
|
|
458
608
|
if not isinstance(result_list, list) or len(result_list) != 1:
|
|
459
609
|
raise ValueError(f"Expected list length 1, got {len(result_list)}")
|
|
460
|
-
|
|
461
610
|
result_tuple = result_list[0]
|
|
462
611
|
if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
|
|
463
612
|
raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
|
|
464
|
-
|
|
465
613
|
full_response_dict, fetched_job_index, trace_id = result_tuple
|
|
466
|
-
|
|
467
614
|
if fetched_job_index != job_index:
|
|
468
|
-
logger.warning(f"Mismatch: Future for {job_index} returned
|
|
469
|
-
|
|
615
|
+
logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
|
|
470
616
|
self._handle_processing_success(job_index, full_response_dict, trace_id)
|
|
471
|
-
|
|
472
617
|
except TimeoutError:
|
|
473
|
-
#
|
|
618
|
+
# Not ready -> immediate retry
|
|
474
619
|
self.retry_counts[job_index] += 1
|
|
475
620
|
if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
|
|
476
621
|
if self.verbose:
|
|
477
622
|
logger.info(
|
|
478
|
-
f"Job {job_index} not ready,
|
|
479
|
-
f"
|
|
480
|
-
f"{self.retry_counts[job_index]}/"
|
|
481
|
-
f"{self.max_job_retries or 'inf'})."
|
|
623
|
+
f"Job {job_index} not ready, scheduling retry "
|
|
624
|
+
f"(Attempt {self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
|
|
482
625
|
)
|
|
483
|
-
|
|
484
|
-
self.retry_job_ids.append(job_index)
|
|
626
|
+
self._schedule_retry(job_index)
|
|
485
627
|
else:
|
|
486
|
-
error_msg = f"Exceeded max fetch retries
|
|
628
|
+
error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
|
|
487
629
|
logger.error(error_msg)
|
|
488
630
|
self._handle_processing_failure(job_index, error_msg)
|
|
489
|
-
|
|
490
631
|
except (ValueError, RuntimeError) as e:
|
|
491
632
|
logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
|
|
492
633
|
self._handle_processing_failure(job_index, f"Error processing result: {e}")
|
|
493
634
|
except Exception as e:
|
|
494
635
|
logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
|
|
495
636
|
self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
|
|
496
|
-
# No finally block incrementing count here; tracking is batch-based
|
|
497
|
-
|
|
498
|
-
except TimeoutError:
|
|
499
|
-
# `as_completed` timed out waiting for remaining futures in batch
|
|
500
|
-
logger.error(
|
|
501
|
-
f"Batch processing timed out after {batch_timeout}s waiting "
|
|
502
|
-
f"for futures. Some jobs in batch may be lost or incomplete."
|
|
503
|
-
)
|
|
504
|
-
# Identify and fail remaining futures
|
|
505
|
-
remaining_indices_in_batch = []
|
|
506
|
-
for f, idx in batch_futures_dict.items():
|
|
507
|
-
if not f.done():
|
|
508
|
-
remaining_indices_in_batch.append(idx)
|
|
509
|
-
f.cancel() # Attempt to cancel underlying task
|
|
510
|
-
logger.warning(
|
|
511
|
-
f"Jobs potentially lost/cancelled due to batch timeout: " f"{remaining_indices_in_batch}"
|
|
512
|
-
)
|
|
513
|
-
for idx in remaining_indices_in_batch:
|
|
514
|
-
self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
|
|
515
|
-
# End of processing for this batch cycle
|
|
516
637
|
|
|
517
638
|
# --- Final Logging ---
|
|
518
639
|
self._log_final_status(total_jobs)
|
|
519
640
|
|
|
520
|
-
return self.results, self.failures
|
|
641
|
+
return self.results, self.failures, self.traces if self.return_traces else []
|
|
521
642
|
|
|
522
643
|
|
|
523
644
|
class NvIngestClient:
|
|
@@ -546,11 +667,12 @@ class NvIngestClient:
|
|
|
546
667
|
message_client_port : int, optional
|
|
547
668
|
Port of the REST/message service. Defaults to 7670.
|
|
548
669
|
message_client_kwargs : dict, optional
|
|
549
|
-
Extra keyword arguments passed to the client allocator.
|
|
670
|
+
Extra keyword arguments passed to the client allocator. For RestClient,
|
|
671
|
+
can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
|
|
550
672
|
msg_counter_id : str, optional
|
|
551
673
|
Identifier for message counting. Defaults to "nv-ingest-message-id".
|
|
552
674
|
worker_pool_size : int, optional
|
|
553
|
-
Number of workers in the thread pool. Defaults to
|
|
675
|
+
Number of workers in the thread pool. Defaults to 8.
|
|
554
676
|
|
|
555
677
|
Returns
|
|
556
678
|
-------
|
|
@@ -572,10 +694,19 @@ class NvIngestClient:
|
|
|
572
694
|
**self._message_client_kwargs,
|
|
573
695
|
)
|
|
574
696
|
|
|
575
|
-
# Initialize the worker pool with the specified size
|
|
697
|
+
# Initialize the worker pool with the specified size (used for both submit and fetch)
|
|
576
698
|
self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)
|
|
577
699
|
|
|
700
|
+
# Telemetry state and controls
|
|
701
|
+
self._telemetry_lock = threading.Lock()
|
|
702
|
+
self._telemetry_enabled: bool = bool(int(os.getenv("NV_INGEST_CLIENT_TELEMETRY", "1")))
|
|
703
|
+
try:
|
|
704
|
+
self._telemetry_max_calls: int = int(os.getenv("NV_INGEST_CLIENT_TELEMETRY_MAX_CALLS", "10000"))
|
|
705
|
+
except ValueError:
|
|
706
|
+
self._telemetry_max_calls = 10000
|
|
578
707
|
self._telemetry = {}
|
|
708
|
+
self._completed_parent_trace_ids: List[str] = [] # 1054
|
|
709
|
+
self.reset_telemetry()
|
|
579
710
|
|
|
580
711
|
def __str__(self) -> str:
|
|
581
712
|
"""
|
|
@@ -623,6 +754,106 @@ class NvIngestClient:
|
|
|
623
754
|
|
|
624
755
|
return job_state
|
|
625
756
|
|
|
757
|
+
# ------------------------------------------------------------------
|
|
758
|
+
# Telemetry helpers
|
|
759
|
+
# ------------------------------------------------------------------
|
|
760
|
+
|
|
761
|
+
def enable_telemetry(self, enabled: bool) -> None:
    """
    Switch telemetry recording on or off.

    Parameters
    ----------
    enabled : bool
        Desired state. The value is coerced with ``bool()`` before it is stored.
    """
    flag = bool(enabled)
    # The flag is written under the telemetry lock.
    with self._telemetry_lock:
        self._telemetry_enabled = flag
|
|
764
|
+
|
|
765
|
+
def reset_telemetry(self) -> None:
    """
    Replace the telemetry state with a fresh, empty structure.

    The new state records the current wall-clock time under ``"started_at"`` and
    holds empty ``"submit"``, ``"fetch"`` and ``"per_job"`` sections.
    """
    submit_section = {"count": 0, "calls": []}
    fetch_section = {"count": 0, "last_ts": None, "intervals": [], "calls": []}
    fresh_state = {
        "started_at": time.time(),
        "submit": submit_section,
        "fetch": fetch_section,
        "per_job": {},
    }
    # Swap the whole structure in one assignment while holding the lock.
    with self._telemetry_lock:
        self._telemetry = fresh_state
|
|
773
|
+
|
|
774
|
+
def _t_per_job(self, job_index: str) -> Dict[str, Any]:
|
|
775
|
+
pj = self._telemetry["per_job"].get(job_index)
|
|
776
|
+
if pj is None:
|
|
777
|
+
pj = {"submits": [], "fetch_attempts": [], "timeouts_202": 0, "failures": 0, "first_success_ts": None}
|
|
778
|
+
self._telemetry["per_job"][job_index] = pj
|
|
779
|
+
return pj
|
|
780
|
+
|
|
781
|
+
def _t_append_capped(self, arr: List[Any], item: Any) -> None:
|
|
782
|
+
if len(arr) < self._telemetry_max_calls:
|
|
783
|
+
arr.append(item)
|
|
784
|
+
|
|
785
|
+
def _t_record_submit(self, job_index: str, status: str, ts: float, trace_id: Optional[str]) -> None:
|
|
786
|
+
if not self._telemetry_enabled:
|
|
787
|
+
return
|
|
788
|
+
with self._telemetry_lock:
|
|
789
|
+
self._telemetry["submit"]["count"] += 1
|
|
790
|
+
self._t_append_capped(
|
|
791
|
+
self._telemetry["submit"]["calls"],
|
|
792
|
+
{"job": job_index, "status": status, "ts": ts, "trace": trace_id},
|
|
793
|
+
)
|
|
794
|
+
pj = self._t_per_job(job_index)
|
|
795
|
+
self._t_append_capped(pj["submits"], ts)
|
|
796
|
+
|
|
797
|
+
def _t_record_fetch_attempt(self, job_index: str, ts: float) -> None:
|
|
798
|
+
if not self._telemetry_enabled:
|
|
799
|
+
return
|
|
800
|
+
with self._telemetry_lock:
|
|
801
|
+
self._telemetry["fetch"]["count"] += 1
|
|
802
|
+
last = self._telemetry["fetch"]["last_ts"]
|
|
803
|
+
if last is not None:
|
|
804
|
+
delta = ts - float(last)
|
|
805
|
+
if delta >= 0:
|
|
806
|
+
self._t_append_capped(self._telemetry["fetch"]["intervals"], delta)
|
|
807
|
+
self._telemetry["fetch"]["last_ts"] = ts
|
|
808
|
+
pj = self._t_per_job(job_index)
|
|
809
|
+
self._t_append_capped(pj["fetch_attempts"], ts)
|
|
810
|
+
|
|
811
|
+
def _t_record_fetch_outcome(self, job_index: str, code: int, ts: float, ok: bool, trace_id: Optional[str]) -> None:
|
|
812
|
+
if not self._telemetry_enabled:
|
|
813
|
+
return
|
|
814
|
+
with self._telemetry_lock:
|
|
815
|
+
self._t_append_capped(
|
|
816
|
+
self._telemetry["fetch"]["calls"],
|
|
817
|
+
{"job": job_index, "code": code, "ok": ok, "ts": ts, "trace": trace_id},
|
|
818
|
+
)
|
|
819
|
+
pj = self._t_per_job(job_index)
|
|
820
|
+
if code == 2: # 202 not ready
|
|
821
|
+
pj["timeouts_202"] += 1
|
|
822
|
+
if ok and pj["first_success_ts"] is None:
|
|
823
|
+
pj["first_success_ts"] = ts
|
|
824
|
+
if not ok and code not in (0, 2):
|
|
825
|
+
pj["failures"] += 1
|
|
826
|
+
|
|
827
|
+
def get_telemetry(self) -> Dict[str, Any]:
    """
    Return a deep copy of the current telemetry state.

    Returns
    -------
    Dict[str, Any]
        Independent snapshot; mutating it does not affect the client's state.
    """
    with self._telemetry_lock:
        snapshot = copy.deepcopy(self._telemetry)
    return snapshot
|
|
830
|
+
|
|
831
|
+
def summarize_telemetry(self) -> Dict[str, Any]:
    """
    Condense the telemetry state into a flat dictionary of aggregate figures.

    Returns
    -------
    Dict[str, Any]
        Submit and fetch counters, mean / median / 95th-percentile of the recorded
        fetch intervals (all 0.0 when no interval was recorded), the number of
        tracked jobs, and the summed per-job 202 and failure counters.
    """
    with self._telemetry_lock:
        fetch_stats = self._telemetry["fetch"]
        job_records = self._telemetry["per_job"]

        # Work on a sorted copy so the stored intervals keep their recording order.
        gaps = sorted(fetch_stats["intervals"])
        if gaps:
            gap_avg = mean(gaps)
            gap_p50 = median(gaps)
            # p95 via index into the sorted copy (index is rounded down).
            gap_p95 = gaps[int(0.95 * (len(gaps) - 1))]
        else:
            gap_avg = 0.0
            gap_p50 = 0.0
            gap_p95 = 0.0

        return {
            "submit_count": self._telemetry["submit"]["count"],
            "fetch_count": fetch_stats["count"],
            "fetch_interval_avg": gap_avg,
            "fetch_interval_p50": gap_p50,
            "fetch_interval_p95": gap_p95,
            "jobs_tracked": len(job_records),
            "timeouts_202_total": sum(rec.get("timeouts_202", 0) for rec in job_records.values()),
            "failures_total": sum(rec.get("failures", 0) for rec in job_records.values()),
        }
|
|
856
|
+
|
|
626
857
|
def _get_and_check_job_state(
|
|
627
858
|
self,
|
|
628
859
|
job_index: str,
|
|
@@ -860,6 +1091,8 @@ class NvIngestClient:
|
|
|
860
1091
|
Exception
|
|
861
1092
|
For unexpected issues.
|
|
862
1093
|
"""
|
|
1094
|
+
ts_attempt = time.time()
|
|
1095
|
+
self._t_record_fetch_attempt(job_index, ts_attempt)
|
|
863
1096
|
try:
|
|
864
1097
|
# Get job state using the client-side index
|
|
865
1098
|
job_state = self._get_and_check_job_state(
|
|
@@ -900,6 +1133,7 @@ class NvIngestClient:
|
|
|
900
1133
|
logger.debug(
|
|
901
1134
|
f"Successfully processed and removed job index {job_index} (Server ID: {server_job_id})"
|
|
902
1135
|
)
|
|
1136
|
+
self._t_record_fetch_outcome(job_index, 0, time.time(), ok=True, trace_id=job_state.trace_id)
|
|
903
1137
|
return result_data, job_index, job_state.trace_id
|
|
904
1138
|
|
|
905
1139
|
except json.JSONDecodeError as err:
|
|
@@ -921,6 +1155,7 @@ class NvIngestClient:
|
|
|
921
1155
|
elif response.response_code == 2: # Job Not Ready (e.g., HTTP 202, or r-2 from SimpleBroker)
|
|
922
1156
|
# Raise TimeoutError to signal the calling retry loop in fetch_job_result
|
|
923
1157
|
# Do not change job state here, remains SUBMITTED
|
|
1158
|
+
self._t_record_fetch_outcome(job_index, 2, time.time(), ok=False, trace_id=job_state.trace_id)
|
|
924
1159
|
raise TimeoutError(f"Job not ready: {response.response_reason}")
|
|
925
1160
|
|
|
926
1161
|
else:
|
|
@@ -933,6 +1168,7 @@ class NvIngestClient:
|
|
|
933
1168
|
job_state.state = JobStateEnum.FAILED # Mark job as failed in the client
|
|
934
1169
|
# Do NOT pop the state for failed jobs here
|
|
935
1170
|
# Raise RuntimeError to indicate a terminal failure for this fetch attempt
|
|
1171
|
+
self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=job_state.trace_id)
|
|
936
1172
|
raise RuntimeError(error_msg)
|
|
937
1173
|
|
|
938
1174
|
except (TimeoutError, ValueError, RuntimeError):
|
|
@@ -944,12 +1180,17 @@ class NvIngestClient:
|
|
|
944
1180
|
# Attempt to mark state as FAILED if possible and state object exists
|
|
945
1181
|
if "job_state" in locals() and hasattr(job_state, "state"):
|
|
946
1182
|
job_state.state = JobStateEnum.FAILED
|
|
1183
|
+
try:
|
|
1184
|
+
self._t_record_fetch_outcome(job_index, 1, time.time(), ok=False, trace_id=None)
|
|
1185
|
+
except Exception:
|
|
1186
|
+
pass
|
|
947
1187
|
raise # Re-raise the original exception
|
|
948
1188
|
|
|
949
1189
|
def fetch_job_result_cli(
|
|
950
1190
|
self,
|
|
951
1191
|
job_ids: Union[str, List[str]],
|
|
952
1192
|
data_only: bool = False,
|
|
1193
|
+
timeout: Optional[Tuple[int, Optional[float]]] = None,
|
|
953
1194
|
) -> List[Tuple[Any, str, Optional[str]]]:
|
|
954
1195
|
"""
|
|
955
1196
|
Fetch job results via CLI semantics (synchronous list return).
|
|
@@ -969,23 +1210,71 @@ class NvIngestClient:
|
|
|
969
1210
|
if isinstance(job_ids, str):
|
|
970
1211
|
job_ids = [job_ids]
|
|
971
1212
|
|
|
972
|
-
|
|
1213
|
+
eff_timeout: Tuple[int, Optional[float]] = timeout if timeout is not None else (100, None)
|
|
1214
|
+
return [self._fetch_job_result(job_id, timeout=eff_timeout, data_only=data_only) for job_id in job_ids]
|
|
1215
|
+
|
|
1216
|
+
def _validate_batch_size(self, batch_size: Optional[int]) -> int:
|
|
1217
|
+
"""
|
|
1218
|
+
Validates and returns a sanitized batch_size value.
|
|
1219
|
+
|
|
1220
|
+
Parameters
|
|
1221
|
+
----------
|
|
1222
|
+
batch_size : Optional[int]
|
|
1223
|
+
The batch_size value to validate. None uses value from
|
|
1224
|
+
NV_INGEST_BATCH_SIZE environment variable or default 32.
|
|
1225
|
+
|
|
1226
|
+
Returns
|
|
1227
|
+
-------
|
|
1228
|
+
int
|
|
1229
|
+
Validated batch_size value.
|
|
1230
|
+
"""
|
|
1231
|
+
# Handle None/default case
|
|
1232
|
+
if batch_size is None:
|
|
1233
|
+
try:
|
|
1234
|
+
batch_size = int(os.getenv("NV_INGEST_CLIENT_BATCH_SIZE", "32"))
|
|
1235
|
+
except ValueError:
|
|
1236
|
+
batch_size = 32
|
|
1237
|
+
|
|
1238
|
+
# Validate type and range
|
|
1239
|
+
if not isinstance(batch_size, int):
|
|
1240
|
+
logger.warning(f"batch_size must be an integer, got {type(batch_size).__name__}. Using default 32.")
|
|
1241
|
+
return 32
|
|
1242
|
+
|
|
1243
|
+
if batch_size < 1:
|
|
1244
|
+
logger.warning(f"batch_size must be >= 1, got {batch_size}. Using default 32.")
|
|
1245
|
+
return 32
|
|
1246
|
+
|
|
1247
|
+
# Performance guidance warnings
|
|
1248
|
+
if batch_size < 8:
|
|
1249
|
+
logger.warning(f"batch_size {batch_size} is very small and may impact performance.")
|
|
1250
|
+
elif batch_size > 128:
|
|
1251
|
+
logger.warning(f"batch_size {batch_size} is large and may increase memory usage.")
|
|
1252
|
+
|
|
1253
|
+
return batch_size
|
|
973
1254
|
|
|
974
1255
|
def process_jobs_concurrently(
|
|
975
1256
|
self,
|
|
976
1257
|
job_indices: Union[str, List[str]],
|
|
977
1258
|
job_queue_id: Optional[str] = None,
|
|
1259
|
+
batch_size: Optional[int] = None,
|
|
978
1260
|
concurrency_limit: int = 64,
|
|
979
1261
|
timeout: int = 100,
|
|
980
1262
|
max_job_retries: Optional[int] = None,
|
|
981
|
-
retry_delay: float = 5
|
|
1263
|
+
retry_delay: float = 0.5,
|
|
1264
|
+
initial_fetch_delay: float = 0.3,
|
|
982
1265
|
fail_on_submit_error: bool = False,
|
|
983
1266
|
completion_callback: Optional[Callable[[Any, str], None]] = None,
|
|
984
1267
|
return_failures: bool = False,
|
|
985
1268
|
data_only: bool = True,
|
|
986
1269
|
stream_to_callback_only: bool = False,
|
|
1270
|
+
return_full_response: bool = False,
|
|
987
1271
|
verbose: bool = False,
|
|
988
|
-
|
|
1272
|
+
return_traces: bool = False,
|
|
1273
|
+
) -> Union[
|
|
1274
|
+
List[Any],
|
|
1275
|
+
Tuple[List[Any], List[Tuple[str, str]]],
|
|
1276
|
+
Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
|
|
1277
|
+
]:
|
|
989
1278
|
"""
|
|
990
1279
|
Submit and fetch multiple jobs concurrently.
|
|
991
1280
|
|
|
@@ -995,8 +1284,12 @@ class NvIngestClient:
|
|
|
995
1284
|
Single or multiple job indices to process.
|
|
996
1285
|
job_queue_id : str, optional
|
|
997
1286
|
Queue identifier for submission.
|
|
1287
|
+
batch_size : int, optional
|
|
1288
|
+
Maximum number of jobs to process in each internal batch.
|
|
1289
|
+
Higher values may improve throughput but increase memory usage.
|
|
1290
|
+
Must be >= 1. Default is 32.
|
|
998
1291
|
concurrency_limit : int, optional
|
|
999
|
-
Max number of simultaneous in-flight jobs. Default is
|
|
1292
|
+
Max number of simultaneous in-flight jobs. Default is 64.
|
|
1000
1293
|
timeout : int, optional
|
|
1001
1294
|
Timeout in seconds per fetch attempt. Default is 100.
|
|
1002
1295
|
max_job_retries : int, optional
|
|
@@ -1011,8 +1304,13 @@ class NvIngestClient:
|
|
|
1011
1304
|
If True, return (results, failures). Default is False.
|
|
1012
1305
|
data_only : bool, optional
|
|
1013
1306
|
If True, return only payload 'data'. Default is True.
|
|
1307
|
+
return_full_response : bool, optional
|
|
1308
|
+
If True, results contain the full response envelopes (including 'trace' and 'annotations').
|
|
1309
|
+
Ignored when stream_to_callback_only=True. Default is False.
|
|
1014
1310
|
verbose : bool, optional
|
|
1015
1311
|
If True, enable debug logging. Default is False.
|
|
1312
|
+
return_traces : bool, optional
|
|
1313
|
+
If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
|
|
1016
1314
|
|
|
1017
1315
|
Returns
|
|
1018
1316
|
-------
|
|
@@ -1020,6 +1318,9 @@ class NvIngestClient:
|
|
|
1020
1318
|
List of successful job results when `return_failures` is False.
|
|
1021
1319
|
results, failures : tuple
|
|
1022
1320
|
Tuple of (successful results, failure tuples) when `return_failures` is True.
|
|
1321
|
+
results, failures, traces : tuple
|
|
1322
|
+
Tuple of (successful results, failure tuples, trace dicts) when both
|
|
1323
|
+
`return_failures` and `return_traces` are True.
|
|
1023
1324
|
|
|
1024
1325
|
Raises
|
|
1025
1326
|
------
|
|
@@ -1032,29 +1333,45 @@ class NvIngestClient:
|
|
|
1032
1333
|
|
|
1033
1334
|
# Handle empty input
|
|
1034
1335
|
if not job_indices:
|
|
1035
|
-
|
|
1336
|
+
if return_failures and return_traces:
|
|
1337
|
+
return [], [], []
|
|
1338
|
+
elif return_failures:
|
|
1339
|
+
return [], []
|
|
1340
|
+
else:
|
|
1341
|
+
return []
|
|
1036
1342
|
|
|
1037
|
-
#
|
|
1038
|
-
|
|
1343
|
+
# Validate and set batch_size
|
|
1344
|
+
validated_batch_size = self._validate_batch_size(batch_size)
|
|
1345
|
+
|
|
1346
|
+
# Prepare timeout tuple to mirror handler behavior: finite connect, unbounded read (long-poll)
|
|
1347
|
+
effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
|
|
1039
1348
|
|
|
1040
1349
|
# Delegate to the concurrent processor
|
|
1041
1350
|
processor = _ConcurrentProcessor(
|
|
1042
1351
|
client=self,
|
|
1043
|
-
batch_size=
|
|
1352
|
+
batch_size=validated_batch_size,
|
|
1044
1353
|
job_indices=job_indices,
|
|
1045
1354
|
job_queue_id=job_queue_id,
|
|
1046
1355
|
timeout=effective_timeout,
|
|
1047
1356
|
max_job_retries=max_job_retries,
|
|
1357
|
+
retry_delay=retry_delay,
|
|
1358
|
+
initial_fetch_delay=initial_fetch_delay,
|
|
1048
1359
|
completion_callback=completion_callback,
|
|
1049
1360
|
fail_on_submit_error=fail_on_submit_error,
|
|
1050
1361
|
stream_to_callback_only=stream_to_callback_only,
|
|
1362
|
+
return_full_response=return_full_response,
|
|
1051
1363
|
verbose=verbose,
|
|
1364
|
+
return_traces=return_traces,
|
|
1052
1365
|
)
|
|
1053
1366
|
|
|
1054
|
-
results, failures = processor.run()
|
|
1367
|
+
results, failures, traces = processor.run()
|
|
1055
1368
|
|
|
1056
|
-
if return_failures:
|
|
1369
|
+
if return_failures and return_traces:
|
|
1370
|
+
return results, failures, traces
|
|
1371
|
+
elif return_failures:
|
|
1057
1372
|
return results, failures
|
|
1373
|
+
elif return_traces:
|
|
1374
|
+
return results, traces
|
|
1058
1375
|
|
|
1059
1376
|
if failures:
|
|
1060
1377
|
logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
|
|
@@ -1087,7 +1404,12 @@ class NvIngestClient:
|
|
|
1087
1404
|
job_state.trace_id = future.result()[0] # Trace_id from `submit_job` endpoint submission
|
|
1088
1405
|
job_state.future = None
|
|
1089
1406
|
|
|
1090
|
-
def fetch_job_result_async(
|
|
1407
|
+
def fetch_job_result_async(
|
|
1408
|
+
self,
|
|
1409
|
+
job_ids: Union[str, List[str]],
|
|
1410
|
+
data_only: bool = True,
|
|
1411
|
+
timeout: Optional[Tuple[int, Optional[float]]] = None,
|
|
1412
|
+
) -> Dict[Future, str]:
|
|
1091
1413
|
"""
|
|
1092
1414
|
Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.
|
|
1093
1415
|
|
|
@@ -1108,7 +1430,7 @@ class NvIngestClient:
|
|
|
1108
1430
|
future_to_job_id = {}
|
|
1109
1431
|
for job_id in job_ids:
|
|
1110
1432
|
job_state = self._get_and_check_job_state(job_id)
|
|
1111
|
-
future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only)
|
|
1433
|
+
future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only, timeout)
|
|
1112
1434
|
job_state.future = future
|
|
1113
1435
|
future_to_job_id[future] = job_id
|
|
1114
1436
|
|
|
@@ -1159,12 +1481,19 @@ class NvIngestClient:
|
|
|
1159
1481
|
# Free up memory -- payload should never be used again, and we don't want to keep it around.
|
|
1160
1482
|
job_state.job_spec.payload = None
|
|
1161
1483
|
|
|
1484
|
+
try:
|
|
1485
|
+
self._t_record_submit(job_index, "ok", time.time(), x_trace_id)
|
|
1486
|
+
except Exception:
|
|
1487
|
+
pass
|
|
1162
1488
|
return x_trace_id
|
|
1163
1489
|
except Exception as err:
|
|
1164
1490
|
err_msg = f"Failed to submit job {job_index} to queue {job_queue_id}: {err}"
|
|
1165
1491
|
logger.exception(err_msg)
|
|
1166
1492
|
job_state.state = JobStateEnum.FAILED
|
|
1167
|
-
|
|
1493
|
+
try:
|
|
1494
|
+
self._t_record_submit(job_index, "fail", time.time(), None)
|
|
1495
|
+
except Exception:
|
|
1496
|
+
pass
|
|
1168
1497
|
raise
|
|
1169
1498
|
|
|
1170
1499
|
def submit_job(
|
|
@@ -1385,7 +1714,9 @@ class NvIngestClient:
|
|
|
1385
1714
|
|
|
1386
1715
|
return results
|
|
1387
1716
|
|
|
1388
|
-
def create_jobs_for_batch(
|
|
1717
|
+
def create_jobs_for_batch(
|
|
1718
|
+
self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
|
|
1719
|
+
) -> List[str]:
|
|
1389
1720
|
"""
|
|
1390
1721
|
Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
|
|
1391
1722
|
This function takes a batch of files, processes each file to extract its content and type,
|
|
@@ -1401,6 +1732,9 @@ class NvIngestClient:
|
|
|
1401
1732
|
A dictionary of tasks to be added to each job. The keys represent task names, and the
|
|
1402
1733
|
values represent task specifications or configurations. Standard tasks include "split",
|
|
1403
1734
|
"extract", "store", "caption", "dedup", "filter", "embed".
|
|
1735
|
+
pdf_split_page_count : int, optional
|
|
1736
|
+
Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
|
|
1737
|
+
to the job spec's extended_options for PDF files.
|
|
1404
1738
|
|
|
1405
1739
|
Returns
|
|
1406
1740
|
-------
|
|
@@ -1447,6 +1781,10 @@ class NvIngestClient:
|
|
|
1447
1781
|
|
|
1448
1782
|
job_specs = create_job_specs_for_batch(files_batch)
|
|
1449
1783
|
|
|
1784
|
+
# Apply PDF split config if provided
|
|
1785
|
+
if pdf_split_page_count is not None:
|
|
1786
|
+
apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
|
|
1787
|
+
|
|
1450
1788
|
job_ids = []
|
|
1451
1789
|
for job_spec in job_specs:
|
|
1452
1790
|
logger.debug(f"Tasks: {tasks.keys()}")
|
|
@@ -1476,3 +1814,19 @@ class NvIngestClient:
|
|
|
1476
1814
|
job_ids.append(job_id)
|
|
1477
1815
|
|
|
1478
1816
|
return job_ids
|
|
1817
|
+
|
|
1818
|
+
def register_parent_trace_id(self, trace_id: Optional[str]) -> None:
    """
    Remember a parent trace identifier whose aggregation has completed.

    Parameters
    ----------
    trace_id : Optional[str]
        Identifier to record. Falsy values (``None``, ``""``) and identifiers that
        are already recorded are ignored.
    """
    if trace_id and trace_id not in self._completed_parent_trace_ids:
        self._completed_parent_trace_ids.append(trace_id)
|
|
1826
|
+
|
|
1827
|
+
def consume_completed_parent_trace_ids(self) -> List[str]:
    """
    Return the recorded parent trace identifiers and empty the internal list.

    Returns
    -------
    List[str]
        A new list with the identifiers in the order they were registered.
    """
    registry = self._completed_parent_trace_ids
    drained = list(registry)
    # Empty the existing list in place rather than rebinding the attribute.
    registry.clear()
    return drained
|