nv-ingest-client 2025.10.18.dev20251018__tar.gz → 2025.12.14.dev20251214__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.14.dev20251214}/PKG-INFO +2 -1
  2. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/pyproject.toml +1 -0
  3. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/client.py +194 -10
  4. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/ingest_job_handler.py +28 -6
  5. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/interface.py +425 -108
  6. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
  7. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_spec.py +29 -1
  8. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
  9. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
  10. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/extract.py +50 -2
  11. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
  12. nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
  13. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
  14. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
  15. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/extract.py +27 -0
  16. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/util.py +34 -1
  17. nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
  18. nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
  19. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/milvus.py +78 -31
  20. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
  21. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
  22. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/requires.txt +1 -0
  23. nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
  24. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/LICENSE +0 -0
  25. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/MANIFEST.in +0 -0
  26. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/README.md +0 -0
  27. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/setup.cfg +0 -0
  28. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/__init__.py +0 -0
  29. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/__init__.py +0 -0
  30. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  31. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/click.py +0 -0
  32. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/processing.py +0 -0
  33. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/system.py +0 -0
  34. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/__init__.py +0 -0
  35. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/util/processing.py +0 -0
  36. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/__init__.py +0 -0
  37. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  38. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  39. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  40. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  41. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  42. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  43. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  44. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  45. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  46. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  47. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  48. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  49. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/__init__.py +0 -0
  50. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/dataset.py +0 -0
  51. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/document_analysis.py +0 -0
  52. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  53. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  54. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/milvus.py +0 -0
  55. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/process_json_files.py +0 -0
  56. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/processing.py +0 -0
  57. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/system.py +0 -0
  58. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/transport.py +0 -0
  59. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  60. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  61. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/zipkin.py +0 -0
  62. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  63. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  64. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  65. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.12.14.dev20251214}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.10.18.dev20251018
3
+ Version: 2025.12.14.dev20251214
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
223
223
  Requires-Dist: requests>=2.28.2
224
224
  Requires-Dist: setuptools>=78.1.1
225
225
  Requires-Dist: tqdm>=4.67.1
226
+ Requires-Dist: lancedb>=0.25.3
226
227
  Provides-Extra: milvus
227
228
  Requires-Dist: pymilvus==2.5.10; extra == "milvus"
228
229
  Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
@@ -30,6 +30,7 @@ dependencies = [
30
30
  "requests>=2.28.2",
31
31
  "setuptools>=78.1.1",
32
32
  "tqdm>=4.67.1",
33
+ "lancedb>=0.25.3",
33
34
  ]
34
35
 
35
36
  [project.optional-dependencies]
@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
44
44
  logger = logging.getLogger(__name__)
45
45
 
46
46
 
47
+ def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
48
+ """
49
+ Compute resident_time entries from entry/exit pairs if not already present.
50
+
51
+ This ensures consistency between split jobs (where server computes resident_time)
52
+ and non-split jobs (where we compute it client-side).
53
+
54
+ Parameters
55
+ ----------
56
+ trace_dict : Dict[str, Any]
57
+ Trace dictionary with entry/exit pairs
58
+
59
+ Returns
60
+ -------
61
+ Dict[str, Any]
62
+ Trace dictionary with resident_time entries added
63
+ """
64
+ if not trace_dict or not isinstance(trace_dict, dict):
65
+ return trace_dict
66
+
67
+ # Check if resident_time already exists (server-computed for split jobs)
68
+ has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
69
+ if has_resident:
70
+ return trace_dict # Already computed by server
71
+
72
+ # Compute resident_time from entry/exit pairs
73
+ result = dict(trace_dict)
74
+ stages = set()
75
+
76
+ # Find all unique stages
77
+ for key in trace_dict:
78
+ if key.startswith("trace::entry::"):
79
+ stages.add(key.replace("trace::entry::", ""))
80
+
81
+ # Compute resident_time for each stage
82
+ for stage in stages:
83
+ entry_key = f"trace::entry::{stage}"
84
+ exit_key = f"trace::exit::{stage}"
85
+ if entry_key in trace_dict and exit_key in trace_dict:
86
+ result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
87
+
88
+ return result
89
+
90
+
47
91
  class DataDecodeException(Exception):
48
92
  """
49
93
  Exception raised for errors in decoding data.
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
87
131
  stream_to_callback_only: bool,
88
132
  return_full_response: bool,
89
133
  verbose: bool = False,
134
+ return_traces: bool = False,
90
135
  ):
91
136
  """
92
137
  Initializes the concurrent processor.
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
120
165
  initiating job submission or fetching fails for a batch.
121
166
  verbose : bool, optional
122
167
  If True, enables detailed debug logging. Default is False.
168
+ return_traces : bool, optional
169
+ If True, parent-level trace data for each completed job is stored.
123
170
 
124
171
  Raises
125
172
  ------
@@ -142,17 +189,26 @@ class _ConcurrentProcessor:
142
189
  self.stream_to_callback_only = stream_to_callback_only
143
190
  self.return_full_response = return_full_response
144
191
  self.verbose = verbose
192
+ self.return_traces = return_traces
145
193
 
146
194
  # State variables managed across batch cycles
147
195
  self.retry_job_ids: List[str] = []
148
196
  self.retry_counts: Dict[str, int] = defaultdict(int)
149
197
  self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
150
198
  self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
199
+ self.traces: List[Optional[Dict[str, Any]]] = []
151
200
 
152
201
  # --- Initial Checks ---
153
202
  if not self.job_queue_id:
154
203
  logger.warning("job_queue_id is not set; submission of new jobs will fail.")
155
204
 
205
+ # Executor check required for run_async
206
+ if not hasattr(client, "_worker_pool"):
207
+ raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
208
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
209
+ raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
210
+ self._executor = client._worker_pool
211
+
156
212
  # --------------------------------------------------------------------------
157
213
  # Private Methods
158
214
  # --------------------------------------------------------------------------
@@ -197,7 +253,7 @@ class _ConcurrentProcessor:
197
253
  # Attempt to mark state as FAILED locally in the client (best effort)
198
254
  try:
199
255
  # Use a method assumed to safely get the state object
200
- job_state = self.client._get_job_state_object(job_index)
256
+ job_state = self.client._get_and_check_job_state(job_index)
201
257
  # Check state exists and is not already terminal before updating
202
258
  if (
203
259
  job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -247,6 +303,14 @@ class _ConcurrentProcessor:
247
303
  # When requested, return the full response envelope (includes 'trace' and 'annotations')
248
304
  self.results.append(result_data if self.return_full_response else result_data.get("data"))
249
305
 
306
+ # Extract trace data for all successful (non-failed) jobs
307
+ if self.return_traces and not is_failed:
308
+ trace_payload = result_data.get("trace") if result_data else None
309
+ # Compute resident_time if not already present (for consistency)
310
+ if trace_payload:
311
+ trace_payload = _compute_resident_times(trace_payload)
312
+ self.traces.append(trace_payload if trace_payload else None)
313
+
250
314
  # Cleanup retry count if it exists
251
315
  if job_index in self.retry_counts:
252
316
  del self.retry_counts[job_index]
@@ -438,7 +502,10 @@ class _ConcurrentProcessor:
438
502
 
439
503
  return batch_futures_dict, normalized_job_indices
440
504
 
441
- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
505
+ # --------------------------------------------------------------------------
506
+ # Core Processing Logic
507
+ # --------------------------------------------------------------------------
508
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
442
509
  """
443
510
  Executes the main processing loop in batches.
444
511
 
@@ -581,7 +648,45 @@ class _ConcurrentProcessor:
581
648
  # --- Final Logging ---
582
649
  self._log_final_status(total_jobs)
583
650
 
584
- return self.results, self.failures
651
+ return self.results, self.failures, self.traces if self.return_traces else []
652
+
653
+ # --------------------------------------------------------------------------
654
+ # Public Methods
655
+ # --------------------------------------------------------------------------
656
+
657
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
658
+ """
659
+ Executes the main processing loop synchronously.
660
+
661
+ This method orchestrates the job processing by maintaining a constant
662
+ pool of in-flight jobs, handling submissions, fetches, and retries until
663
+ all jobs are complete. It blocks until all jobs are processed.
664
+
665
+ Returns
666
+ -------
667
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
668
+ A tuple containing:
669
+ 1. A list of successfully fetched job results.
670
+ 2. A list of tuples for failed jobs (job_index, error_message).
671
+ 3. A list of trace dictionaries if `return_traces` was True.
672
+ """
673
+ return self._process_all_jobs()
674
+
675
+ def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
676
+ """
677
+ Executes the main processing loop asynchronously.
678
+
679
+ Submits the entire processing logic to the client's background
680
+ thread pool and returns a Future that resolves with the final
681
+ results, failures, and traces once all jobs are complete.
682
+
683
+ Returns
684
+ -------
685
+ Future
686
+ A future representing the asynchronous execution. Its result()
687
+ will be a tuple containing (results, failures, traces).
688
+ """
689
+ return self._executor.submit(self._process_all_jobs)
585
690
 
586
691
 
587
692
  class NvIngestClient:
@@ -1212,7 +1317,12 @@ class NvIngestClient:
1212
1317
  stream_to_callback_only: bool = False,
1213
1318
  return_full_response: bool = False,
1214
1319
  verbose: bool = False,
1215
- ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
1320
+ return_traces: bool = False,
1321
+ ) -> Union[
1322
+ List[Any],
1323
+ Tuple[List[Any], List[Tuple[str, str]]],
1324
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
1325
+ ]:
1216
1326
  """
1217
1327
  Submit and fetch multiple jobs concurrently.
1218
1328
 
@@ -1247,6 +1357,8 @@ class NvIngestClient:
1247
1357
  Ignored when stream_to_callback_only=True. Default is False.
1248
1358
  verbose : bool, optional
1249
1359
  If True, enable debug logging. Default is False.
1360
+ return_traces : bool, optional
1361
+ If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
1250
1362
 
1251
1363
  Returns
1252
1364
  -------
@@ -1254,6 +1366,9 @@ class NvIngestClient:
1254
1366
  List of successful job results when `return_failures` is False.
1255
1367
  results, failures : tuple
1256
1368
  Tuple of (successful results, failure tuples) when `return_failures` is True.
1369
+ results, failures, traces : tuple
1370
+ Tuple of (successful results, failure tuples, trace dicts) when both
1371
+ `return_failures` and `return_traces` are True.
1257
1372
 
1258
1373
  Raises
1259
1374
  ------
@@ -1266,7 +1381,12 @@ class NvIngestClient:
1266
1381
 
1267
1382
  # Handle empty input
1268
1383
  if not job_indices:
1269
- return ([], []) if return_failures else []
1384
+ if return_failures and return_traces:
1385
+ return [], [], []
1386
+ elif return_failures:
1387
+ return [], []
1388
+ else:
1389
+ return []
1270
1390
 
1271
1391
  # Validate and set batch_size
1272
1392
  validated_batch_size = self._validate_batch_size(batch_size)
@@ -1289,17 +1409,84 @@ class NvIngestClient:
1289
1409
  stream_to_callback_only=stream_to_callback_only,
1290
1410
  return_full_response=return_full_response,
1291
1411
  verbose=verbose,
1412
+ return_traces=return_traces,
1292
1413
  )
1293
1414
 
1294
- results, failures = processor.run()
1415
+ results, failures, traces = processor.run()
1295
1416
 
1296
- if return_failures:
1417
+ if return_failures and return_traces:
1418
+ return results, failures, traces
1419
+ elif return_failures:
1297
1420
  return results, failures
1421
+ elif return_traces:
1422
+ return results, traces
1298
1423
 
1299
1424
  if failures:
1300
1425
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
1301
1426
  return results
1302
1427
 
1428
+ def process_jobs_concurrently_async(
1429
+ self,
1430
+ job_indices: Union[str, List[str]],
1431
+ job_queue_id: Optional[str] = None,
1432
+ batch_size: Optional[int] = None,
1433
+ timeout: int = 100,
1434
+ max_job_retries: Optional[int] = None,
1435
+ retry_delay: float = 0.5,
1436
+ initial_fetch_delay: float = 0.3,
1437
+ fail_on_submit_error: bool = False,
1438
+ completion_callback: Optional[Callable[[Any, str], None]] = None,
1439
+ stream_to_callback_only: bool = False,
1440
+ return_full_response: bool = False,
1441
+ verbose: bool = False,
1442
+ return_traces: bool = False,
1443
+ ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
1444
+ """
1445
+ Submit and fetch multiple jobs concurrently and asynchronously.
1446
+
1447
+ This method initializes the processing and returns a Future immediately. The Future
1448
+ will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
1449
+ jobs have completed.
1450
+
1451
+ Parameters are identical to `process_jobs_concurrently`.
1452
+
1453
+ Returns
1454
+ -------
1455
+ Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
1456
+ A future that completes when all jobs are done. Its result is a tuple
1457
+ containing (successful_results, failures, traces).
1458
+ """
1459
+ if isinstance(job_indices, str):
1460
+ job_indices = [job_indices]
1461
+
1462
+ if not job_indices:
1463
+ immediate_future: Future = Future()
1464
+ immediate_future.set_result(([], [], []))
1465
+ return immediate_future
1466
+
1467
+ validated_batch_size = self._validate_batch_size(batch_size)
1468
+ effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
1469
+
1470
+ processor = _ConcurrentProcessor(
1471
+ client=self,
1472
+ batch_size=validated_batch_size,
1473
+ job_indices=job_indices,
1474
+ job_queue_id=job_queue_id,
1475
+ timeout=effective_timeout,
1476
+ max_job_retries=max_job_retries,
1477
+ retry_delay=retry_delay,
1478
+ initial_fetch_delay=initial_fetch_delay,
1479
+ completion_callback=completion_callback,
1480
+ fail_on_submit_error=fail_on_submit_error,
1481
+ stream_to_callback_only=stream_to_callback_only,
1482
+ return_full_response=return_full_response,
1483
+ verbose=verbose,
1484
+ return_traces=return_traces,
1485
+ )
1486
+
1487
+ # Asynchronous call
1488
+ return processor.run_async()
1489
+
1303
1490
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
1304
1491
  """
1305
1492
  Block until all specified jobs have been marked submitted.
@@ -1628,9 +1815,6 @@ class NvIngestClient:
1628
1815
  )
1629
1816
  logger.error(error_msg)
1630
1817
  failures.append((self._job_index_to_job_spec[job_id].source_id, str(e)))
1631
- finally:
1632
- # Clean up the job spec mapping
1633
- del self._job_index_to_job_spec[job_id]
1634
1818
 
1635
1819
  if return_failures:
1636
1820
  return results, failures
@@ -323,18 +323,40 @@ class IngestJobHandler:
323
323
 
324
324
  futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
325
325
  for future in as_completed(futures_dict.keys()):
326
+ pages_per_sec = None
326
327
  try:
327
328
  # Block as each future completes; this mirrors CLI behavior
328
329
  future_response, trace_id = self._handle_future_result(future)
329
330
  job_id: str = futures_dict[future]
330
331
  trace_ids[job_id_map[job_id]] = trace_id
331
332
 
332
- first_page_metadata = future_response["data"][0]["metadata"]
333
- file_page_counts: Dict[str, int] = {
334
- first_page_metadata["source_metadata"]["source_name"]: first_page_metadata[
335
- "content_metadata"
336
- ]["hierarchy"]["page_count"]
337
- }
333
+ # Extract page count: prefer V2 metadata location, fall back to V1
334
+ page_count = None
335
+ source_name = None
336
+
337
+ # Try V2 metadata location first (top-level metadata.total_pages)
338
+ if "metadata" in future_response and future_response["metadata"]:
339
+ response_metadata = future_response["metadata"]
340
+ page_count = response_metadata.get("total_pages")
341
+ source_name = response_metadata.get("original_source_name")
342
+
343
+ # Fall back to V1 location (first data element's hierarchy.page_count)
344
+ if page_count is None and future_response.get("data"):
345
+ try:
346
+ first_page_metadata = future_response["data"][0]["metadata"]
347
+ page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
348
+ source_name = first_page_metadata["source_metadata"]["source_name"]
349
+ except (KeyError, IndexError, TypeError):
350
+ # If we can't extract from V1 location, use defaults
351
+ pass
352
+
353
+ # Use extracted values or defaults
354
+ if page_count is None:
355
+ page_count = 0 # Default if not found
356
+ if source_name is None:
357
+ source_name = "unknown_source"
358
+
359
+ file_page_counts: Dict[str, int] = {source_name: page_count}
338
360
 
339
361
  if self.output_directory:
340
362
  self._save_response_data(