nv-ingest-client 2025.10.18.dev20251018__tar.gz → 2025.12.14.dev20251214__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.14.dev20251214}/PKG-INFO +2 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/pyproject.toml +1 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/client.py +194 -10
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/ingest_job_handler.py +28 -6
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/interface.py +425 -108
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_spec.py +29 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/extract.py +50 -2
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/extract.py +27 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/util.py +34 -1
- nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
- nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/milvus.py +78 -31
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/requires.txt +1 -0
- nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/LICENSE +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/README.md +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/version.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest-client
|
|
3
|
-
Version: 2025.10.18.dev20251018
|
|
3
|
+
Version: 2025.12.14.dev20251214
|
|
4
4
|
Summary: Python client for the nv-ingest service
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
|
|
|
223
223
|
Requires-Dist: requests>=2.28.2
|
|
224
224
|
Requires-Dist: setuptools>=78.1.1
|
|
225
225
|
Requires-Dist: tqdm>=4.67.1
|
|
226
|
+
Requires-Dist: lancedb>=0.25.3
|
|
226
227
|
Provides-Extra: milvus
|
|
227
228
|
Requires-Dist: pymilvus==2.5.10; extra == "milvus"
|
|
228
229
|
Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
|
|
@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
|
|
|
44
44
|
logger = logging.getLogger(__name__)
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
48
|
+
"""
|
|
49
|
+
Compute resident_time entries from entry/exit pairs if not already present.
|
|
50
|
+
|
|
51
|
+
This ensures consistency between split jobs (where server computes resident_time)
|
|
52
|
+
and non-split jobs (where we compute it client-side).
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
trace_dict : Dict[str, Any]
|
|
57
|
+
Trace dictionary with entry/exit pairs
|
|
58
|
+
|
|
59
|
+
Returns
|
|
60
|
+
-------
|
|
61
|
+
Dict[str, Any]
|
|
62
|
+
Trace dictionary with resident_time entries added
|
|
63
|
+
"""
|
|
64
|
+
if not trace_dict or not isinstance(trace_dict, dict):
|
|
65
|
+
return trace_dict
|
|
66
|
+
|
|
67
|
+
# Check if resident_time already exists (server-computed for split jobs)
|
|
68
|
+
has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
|
|
69
|
+
if has_resident:
|
|
70
|
+
return trace_dict # Already computed by server
|
|
71
|
+
|
|
72
|
+
# Compute resident_time from entry/exit pairs
|
|
73
|
+
result = dict(trace_dict)
|
|
74
|
+
stages = set()
|
|
75
|
+
|
|
76
|
+
# Find all unique stages
|
|
77
|
+
for key in trace_dict:
|
|
78
|
+
if key.startswith("trace::entry::"):
|
|
79
|
+
stages.add(key.replace("trace::entry::", ""))
|
|
80
|
+
|
|
81
|
+
# Compute resident_time for each stage
|
|
82
|
+
for stage in stages:
|
|
83
|
+
entry_key = f"trace::entry::{stage}"
|
|
84
|
+
exit_key = f"trace::exit::{stage}"
|
|
85
|
+
if entry_key in trace_dict and exit_key in trace_dict:
|
|
86
|
+
result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
|
|
87
|
+
|
|
88
|
+
return result
|
|
89
|
+
|
|
90
|
+
|
|
47
91
|
class DataDecodeException(Exception):
|
|
48
92
|
"""
|
|
49
93
|
Exception raised for errors in decoding data.
|
|
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
|
|
|
87
131
|
stream_to_callback_only: bool,
|
|
88
132
|
return_full_response: bool,
|
|
89
133
|
verbose: bool = False,
|
|
134
|
+
return_traces: bool = False,
|
|
90
135
|
):
|
|
91
136
|
"""
|
|
92
137
|
Initializes the concurrent processor.
|
|
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
|
|
|
120
165
|
initiating job submission or fetching fails for a batch.
|
|
121
166
|
verbose : bool, optional
|
|
122
167
|
If True, enables detailed debug logging. Default is False.
|
|
168
|
+
return_traces : bool, optional
|
|
169
|
+
If True, parent-level trace data for each completed job is stored.
|
|
123
170
|
|
|
124
171
|
Raises
|
|
125
172
|
------
|
|
@@ -142,17 +189,26 @@ class _ConcurrentProcessor:
|
|
|
142
189
|
self.stream_to_callback_only = stream_to_callback_only
|
|
143
190
|
self.return_full_response = return_full_response
|
|
144
191
|
self.verbose = verbose
|
|
192
|
+
self.return_traces = return_traces
|
|
145
193
|
|
|
146
194
|
# State variables managed across batch cycles
|
|
147
195
|
self.retry_job_ids: List[str] = []
|
|
148
196
|
self.retry_counts: Dict[str, int] = defaultdict(int)
|
|
149
197
|
self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
|
|
150
198
|
self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
|
|
199
|
+
self.traces: List[Optional[Dict[str, Any]]] = []
|
|
151
200
|
|
|
152
201
|
# --- Initial Checks ---
|
|
153
202
|
if not self.job_queue_id:
|
|
154
203
|
logger.warning("job_queue_id is not set; submission of new jobs will fail.")
|
|
155
204
|
|
|
205
|
+
# Executor check required for run_async
|
|
206
|
+
if not hasattr(client, "_worker_pool"):
|
|
207
|
+
raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
|
|
208
|
+
if not isinstance(client._worker_pool, ThreadPoolExecutor):
|
|
209
|
+
raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
|
|
210
|
+
self._executor = client._worker_pool
|
|
211
|
+
|
|
156
212
|
# --------------------------------------------------------------------------
|
|
157
213
|
# Private Methods
|
|
158
214
|
# --------------------------------------------------------------------------
|
|
@@ -197,7 +253,7 @@ class _ConcurrentProcessor:
|
|
|
197
253
|
# Attempt to mark state as FAILED locally in the client (best effort)
|
|
198
254
|
try:
|
|
199
255
|
# Use a method assumed to safely get the state object
|
|
200
|
-
job_state = self.client.
|
|
256
|
+
job_state = self.client._get_and_check_job_state(job_index)
|
|
201
257
|
# Check state exists and is not already terminal before updating
|
|
202
258
|
if (
|
|
203
259
|
job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
|
|
@@ -247,6 +303,14 @@ class _ConcurrentProcessor:
|
|
|
247
303
|
# When requested, return the full response envelope (includes 'trace' and 'annotations')
|
|
248
304
|
self.results.append(result_data if self.return_full_response else result_data.get("data"))
|
|
249
305
|
|
|
306
|
+
# Extract trace data for all successful (non-failed) jobs
|
|
307
|
+
if self.return_traces and not is_failed:
|
|
308
|
+
trace_payload = result_data.get("trace") if result_data else None
|
|
309
|
+
# Compute resident_time if not already present (for consistency)
|
|
310
|
+
if trace_payload:
|
|
311
|
+
trace_payload = _compute_resident_times(trace_payload)
|
|
312
|
+
self.traces.append(trace_payload if trace_payload else None)
|
|
313
|
+
|
|
250
314
|
# Cleanup retry count if it exists
|
|
251
315
|
if job_index in self.retry_counts:
|
|
252
316
|
del self.retry_counts[job_index]
|
|
@@ -438,7 +502,10 @@ class _ConcurrentProcessor:
|
|
|
438
502
|
|
|
439
503
|
return batch_futures_dict, normalized_job_indices
|
|
440
504
|
|
|
441
|
-
|
|
505
|
+
# --------------------------------------------------------------------------
|
|
506
|
+
# Core Processing Logic
|
|
507
|
+
# --------------------------------------------------------------------------
|
|
508
|
+
def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
|
|
442
509
|
"""
|
|
443
510
|
Executes the main processing loop in batches.
|
|
444
511
|
|
|
@@ -581,7 +648,45 @@ class _ConcurrentProcessor:
|
|
|
581
648
|
# --- Final Logging ---
|
|
582
649
|
self._log_final_status(total_jobs)
|
|
583
650
|
|
|
584
|
-
return self.results, self.failures
|
|
651
|
+
return self.results, self.failures, self.traces if self.return_traces else []
|
|
652
|
+
|
|
653
|
+
# --------------------------------------------------------------------------
|
|
654
|
+
# Public Methods
|
|
655
|
+
# --------------------------------------------------------------------------
|
|
656
|
+
|
|
657
|
+
def run(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
    """
    Executes the main processing loop synchronously.

    Delegates to `_process_all_jobs` on the calling thread and therefore
    blocks until that call returns.

    Returns
    -------
    Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
        A tuple containing:
        1. A list of successfully fetched job results.
        2. A list of tuples for failed jobs (job_index, error_message).
        3. A list of trace dictionaries if `return_traces` was True,
           otherwise an empty list.
    """
    # The first element is annotated List[Any] (not List[Dict[str, Any]]) to
    # match `_process_all_jobs` and `run_async`: results hold either the full
    # response envelope or just its "data" payload.
    return self._process_all_jobs()
|
|
674
|
+
|
|
675
|
+
def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
    """
    Executes the main processing loop asynchronously.

    Hands `_process_all_jobs` to the executor stored on this processor
    (the client's worker pool) and returns immediately.

    Returns
    -------
    Future
        Handle for the background execution. Its result() is the
        (results, failures, traces) tuple produced by `_process_all_jobs`.
    """
    pending = self._executor.submit(self._process_all_jobs)
    return pending
|
|
585
690
|
|
|
586
691
|
|
|
587
692
|
class NvIngestClient:
|
|
@@ -1212,7 +1317,12 @@ class NvIngestClient:
|
|
|
1212
1317
|
stream_to_callback_only: bool = False,
|
|
1213
1318
|
return_full_response: bool = False,
|
|
1214
1319
|
verbose: bool = False,
|
|
1215
|
-
|
|
1320
|
+
return_traces: bool = False,
|
|
1321
|
+
) -> Union[
|
|
1322
|
+
List[Any],
|
|
1323
|
+
Tuple[List[Any], List[Tuple[str, str]]],
|
|
1324
|
+
Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
|
|
1325
|
+
]:
|
|
1216
1326
|
"""
|
|
1217
1327
|
Submit and fetch multiple jobs concurrently.
|
|
1218
1328
|
|
|
@@ -1247,6 +1357,8 @@ class NvIngestClient:
|
|
|
1247
1357
|
Ignored when stream_to_callback_only=True. Default is False.
|
|
1248
1358
|
verbose : bool, optional
|
|
1249
1359
|
If True, enable debug logging. Default is False.
|
|
1360
|
+
return_traces : bool, optional
|
|
1361
|
+
If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
|
|
1250
1362
|
|
|
1251
1363
|
Returns
|
|
1252
1364
|
-------
|
|
@@ -1254,6 +1366,9 @@ class NvIngestClient:
|
|
|
1254
1366
|
List of successful job results when `return_failures` is False.
|
|
1255
1367
|
results, failures : tuple
|
|
1256
1368
|
Tuple of (successful results, failure tuples) when `return_failures` is True.
|
|
1369
|
+
results, failures, traces : tuple
|
|
1370
|
+
Tuple of (successful results, failure tuples, trace dicts) when both
|
|
1371
|
+
`return_failures` and `return_traces` are True.
|
|
1257
1372
|
|
|
1258
1373
|
Raises
|
|
1259
1374
|
------
|
|
@@ -1266,7 +1381,12 @@ class NvIngestClient:
|
|
|
1266
1381
|
|
|
1267
1382
|
# Handle empty input
|
|
1268
1383
|
if not job_indices:
|
|
1269
|
-
|
|
1384
|
+
if return_failures and return_traces:
|
|
1385
|
+
return [], [], []
|
|
1386
|
+
elif return_failures:
|
|
1387
|
+
return [], []
|
|
1388
|
+
else:
|
|
1389
|
+
return []
|
|
1270
1390
|
|
|
1271
1391
|
# Validate and set batch_size
|
|
1272
1392
|
validated_batch_size = self._validate_batch_size(batch_size)
|
|
@@ -1289,17 +1409,84 @@ class NvIngestClient:
|
|
|
1289
1409
|
stream_to_callback_only=stream_to_callback_only,
|
|
1290
1410
|
return_full_response=return_full_response,
|
|
1291
1411
|
verbose=verbose,
|
|
1412
|
+
return_traces=return_traces,
|
|
1292
1413
|
)
|
|
1293
1414
|
|
|
1294
|
-
results, failures = processor.run()
|
|
1415
|
+
results, failures, traces = processor.run()
|
|
1295
1416
|
|
|
1296
|
-
if return_failures:
|
|
1417
|
+
if return_failures and return_traces:
|
|
1418
|
+
return results, failures, traces
|
|
1419
|
+
elif return_failures:
|
|
1297
1420
|
return results, failures
|
|
1421
|
+
elif return_traces:
|
|
1422
|
+
return results, traces
|
|
1298
1423
|
|
|
1299
1424
|
if failures:
|
|
1300
1425
|
logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
|
|
1301
1426
|
return results
|
|
1302
1427
|
|
|
1428
|
+
def process_jobs_concurrently_async(
    self,
    job_indices: Union[str, List[str]],
    job_queue_id: Optional[str] = None,
    batch_size: Optional[int] = None,
    timeout: int = 100,
    max_job_retries: Optional[int] = None,
    retry_delay: float = 0.5,
    initial_fetch_delay: float = 0.3,
    fail_on_submit_error: bool = False,
    completion_callback: Optional[Callable[[Any, str], None]] = None,
    stream_to_callback_only: bool = False,
    return_full_response: bool = False,
    verbose: bool = False,
    return_traces: bool = False,
) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
    """
    Submit and fetch multiple jobs concurrently without blocking the caller.

    A `_ConcurrentProcessor` is configured from the arguments and started via
    its `run_async()`; the resulting Future is returned right away.

    Parameters mirror those of `process_jobs_concurrently`, except that there
    is no `return_failures` switch: the Future always resolves to the fixed
    3-part tuple `(results, failures, traces)`.

    Returns
    -------
    Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
        Completes once all jobs are done with
        (successful_results, failures, traces). For empty `job_indices` an
        already-resolved Future holding `([], [], [])` is returned.
    """
    # A lone job index is treated as a one-element list
    indices = [job_indices] if isinstance(job_indices, str) else job_indices

    # Nothing to process: hand back a Future that is already complete
    if not indices:
        resolved: Future = Future()
        resolved.set_result(([], [], []))
        return resolved

    # Keyword arguments are evaluated in order, so batch-size validation still
    # happens before the timeout conversion.
    return _ConcurrentProcessor(
        client=self,
        batch_size=self._validate_batch_size(batch_size),
        job_indices=indices,
        job_queue_id=job_queue_id,
        timeout=(int(timeout), None),
        max_job_retries=max_job_retries,
        retry_delay=retry_delay,
        initial_fetch_delay=initial_fetch_delay,
        completion_callback=completion_callback,
        fail_on_submit_error=fail_on_submit_error,
        stream_to_callback_only=stream_to_callback_only,
        return_full_response=return_full_response,
        verbose=verbose,
        return_traces=return_traces,
    ).run_async()
|
|
1489
|
+
|
|
1303
1490
|
def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
|
|
1304
1491
|
"""
|
|
1305
1492
|
Block until all specified jobs have been marked submitted.
|
|
@@ -1628,9 +1815,6 @@ class NvIngestClient:
|
|
|
1628
1815
|
)
|
|
1629
1816
|
logger.error(error_msg)
|
|
1630
1817
|
failures.append((self._job_index_to_job_spec[job_id].source_id, str(e)))
|
|
1631
|
-
finally:
|
|
1632
|
-
# Clean up the job spec mapping
|
|
1633
|
-
del self._job_index_to_job_spec[job_id]
|
|
1634
1818
|
|
|
1635
1819
|
if return_failures:
|
|
1636
1820
|
return results, failures
|
|
@@ -323,18 +323,40 @@ class IngestJobHandler:
|
|
|
323
323
|
|
|
324
324
|
futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
|
|
325
325
|
for future in as_completed(futures_dict.keys()):
|
|
326
|
+
pages_per_sec = None
|
|
326
327
|
try:
|
|
327
328
|
# Block as each future completes; this mirrors CLI behavior
|
|
328
329
|
future_response, trace_id = self._handle_future_result(future)
|
|
329
330
|
job_id: str = futures_dict[future]
|
|
330
331
|
trace_ids[job_id_map[job_id]] = trace_id
|
|
331
332
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
333
|
+
# Extract page count: prefer V2 metadata location, fall back to V1
|
|
334
|
+
page_count = None
|
|
335
|
+
source_name = None
|
|
336
|
+
|
|
337
|
+
# Try V2 metadata location first (top-level metadata.total_pages)
|
|
338
|
+
if "metadata" in future_response and future_response["metadata"]:
|
|
339
|
+
response_metadata = future_response["metadata"]
|
|
340
|
+
page_count = response_metadata.get("total_pages")
|
|
341
|
+
source_name = response_metadata.get("original_source_name")
|
|
342
|
+
|
|
343
|
+
# Fall back to V1 location (first data element's hierarchy.page_count)
|
|
344
|
+
if page_count is None and future_response.get("data"):
|
|
345
|
+
try:
|
|
346
|
+
first_page_metadata = future_response["data"][0]["metadata"]
|
|
347
|
+
page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
|
|
348
|
+
source_name = first_page_metadata["source_metadata"]["source_name"]
|
|
349
|
+
except (KeyError, IndexError, TypeError):
|
|
350
|
+
# If we can't extract from V1 location, use defaults
|
|
351
|
+
pass
|
|
352
|
+
|
|
353
|
+
# Use extracted values or defaults
|
|
354
|
+
if page_count is None:
|
|
355
|
+
page_count = 0 # Default if not found
|
|
356
|
+
if source_name is None:
|
|
357
|
+
source_name = "unknown_source"
|
|
358
|
+
|
|
359
|
+
file_page_counts: Dict[str, int] = {source_name: page_count}
|
|
338
360
|
|
|
339
361
|
if self.output_directory:
|
|
340
362
|
self._save_response_data(
|