nv-ingest-client 2025.11.14.dev20251114.tar.gz → 2025.12.14.dev20251214.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.14.dev20251214}/PKG-INFO +2 -1
  2. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/pyproject.toml +1 -0
  3. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/client.py +112 -2
  4. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/interface.py +301 -83
  5. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
  6. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_spec.py +27 -2
  7. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
  8. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/extract.py +50 -2
  9. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
  10. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/extract.py +23 -0
  11. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/util.py +34 -1
  12. nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
  13. nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
  14. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/milvus.py +44 -21
  15. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
  16. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
  17. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/requires.txt +1 -0
  18. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
  19. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/LICENSE +0 -0
  20. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/MANIFEST.in +0 -0
  21. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/README.md +0 -0
  22. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/setup.cfg +0 -0
  23. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/__init__.py +0 -0
  24. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/__init__.py +0 -0
  25. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  26. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/click.py +0 -0
  27. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/processing.py +0 -0
  28. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/system.py +0 -0
  29. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/__init__.py +0 -0
  30. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/ingest_job_handler.py +0 -0
  31. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/util/processing.py +0 -0
  32. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/__init__.py +0 -0
  33. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  34. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  35. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  36. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  37. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  38. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  39. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  40. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  41. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  42. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +0 -0
  43. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  44. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  45. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  46. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  47. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  48. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  49. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/__init__.py +0 -0
  50. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/dataset.py +0 -0
  51. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/document_analysis.py +0 -0
  52. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  53. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  54. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/milvus.py +0 -0
  55. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/process_json_files.py +0 -0
  56. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/processing.py +0 -0
  57. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/system.py +0 -0
  58. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/transport.py +0 -0
  59. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  60. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  61. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/zipkin.py +0 -0
  62. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  63. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  64. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  65. {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/version.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nv-ingest-client
- Version: 2025.11.14.dev20251114
+ Version: 2025.12.14.dev20251214
  Summary: Python client for the nv-ingest service
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
  License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
  Requires-Dist: requests>=2.28.2
  Requires-Dist: setuptools>=78.1.1
  Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: lancedb>=0.25.3
  Provides-Extra: milvus
  Requires-Dist: pymilvus==2.5.10; extra == "milvus"
  Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
@@ -30,6 +30,7 @@ dependencies = [
  "requests>=2.28.2",
  "setuptools>=78.1.1",
  "tqdm>=4.67.1",
+ "lancedb>=0.25.3",
  ]

  [project.optional-dependencies]
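
The new lancedb runtime dependency backs the util/vdb/lancedb.py module added in this release (its contents are not shown in this diff). Purely as an illustration of the library the client now pulls in, not of that module's interface, a standalone lancedb round trip looks roughly like this:

    import lancedb

    db = lancedb.connect("./lancedb_demo")  # embedded, file-backed database directory
    table = db.create_table(
        "chunks",
        data=[{"vector": [0.1, 0.2, 0.3], "text": "example chunk", "source": "doc.pdf"}],
        mode="overwrite",
    )
    hits = table.search([0.1, 0.2, 0.3]).limit(5).to_list()  # nearest-neighbour lookup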
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
  if not self.job_queue_id:
  logger.warning("job_queue_id is not set; submission of new jobs will fail.")

+ # Executor check required for run_async
+ if not hasattr(client, "_worker_pool"):
+ raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+ if not isinstance(client._worker_pool, ThreadPoolExecutor):
+ raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+ self._executor = client._worker_pool
+
  # --------------------------------------------------------------------------
  # Private Methods
  # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
  # Attempt to mark state as FAILED locally in the client (best effort)
  try:
  # Use a method assumed to safely get the state object
- job_state = self.client._get_job_state_object(job_index)
+ job_state = self.client._get_and_check_job_state(job_index)
  # Check state exists and is not already terminal before updating
  if (
  job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@ class _ConcurrentProcessor:

  return batch_futures_dict, normalized_job_indices

- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+ # --------------------------------------------------------------------------
+ # Core Processing Logic
+ # --------------------------------------------------------------------------
+ def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
  """
  Executes the main processing loop in batches.

@@ -640,6 +650,44 @@ class _ConcurrentProcessor:

  return self.results, self.failures, self.traces if self.return_traces else []

+ # --------------------------------------------------------------------------
+ # Public Methods
+ # --------------------------------------------------------------------------
+
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+ """
+ Executes the main processing loop synchronously.
+
+ This method orchestrates the job processing by maintaining a constant
+ pool of in-flight jobs, handling submissions, fetches, and retries until
+ all jobs are complete. It blocks until all jobs are processed.
+
+ Returns
+ -------
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+ A tuple containing:
+ 1. A list of successfully fetched job results.
+ 2. A list of tuples for failed jobs (job_index, error_message).
+ 3. A list of trace dictionaries if `return_traces` was True.
+ """
+ return self._process_all_jobs()
+
+ def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+ """
+ Executes the main processing loop asynchronously.
+
+ Submits the entire processing logic to the client's background
+ thread pool and returns a Future that resolves with the final
+ results, failures, and traces once all jobs are complete.
+
+ Returns
+ -------
+ Future
+ A future representing the asynchronous execution. Its result()
+ will be a tuple containing (results, failures, traces).
+ """
+ return self._executor.submit(self._process_all_jobs)
+

  class NvIngestClient:
  """
@@ -1377,6 +1425,68 @@ class NvIngestClient:
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
  return results

+ def process_jobs_concurrently_async(
+ self,
+ job_indices: Union[str, List[str]],
+ job_queue_id: Optional[str] = None,
+ batch_size: Optional[int] = None,
+ timeout: int = 100,
+ max_job_retries: Optional[int] = None,
+ retry_delay: float = 0.5,
+ initial_fetch_delay: float = 0.3,
+ fail_on_submit_error: bool = False,
+ completion_callback: Optional[Callable[[Any, str], None]] = None,
+ stream_to_callback_only: bool = False,
+ return_full_response: bool = False,
+ verbose: bool = False,
+ return_traces: bool = False,
+ ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+ """
+ Submit and fetch multiple jobs concurrently and asynchronously.
+
+ This method initializes the processing and returns a Future immediately. The Future
+ will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+ jobs have completed.
+
+ Parameters are identical to `process_jobs_concurrently`.
+
+ Returns
+ -------
+ Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+ A future that completes when all jobs are done. Its result is a tuple
+ containing (successful_results, failures, traces).
+ """
+ if isinstance(job_indices, str):
+ job_indices = [job_indices]
+
+ if not job_indices:
+ immediate_future: Future = Future()
+ immediate_future.set_result(([], [], []))
+ return immediate_future
+
+ validated_batch_size = self._validate_batch_size(batch_size)
+ effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+ processor = _ConcurrentProcessor(
+ client=self,
+ batch_size=validated_batch_size,
+ job_indices=job_indices,
+ job_queue_id=job_queue_id,
+ timeout=effective_timeout,
+ max_job_retries=max_job_retries,
+ retry_delay=retry_delay,
+ initial_fetch_delay=initial_fetch_delay,
+ completion_callback=completion_callback,
+ fail_on_submit_error=fail_on_submit_error,
+ stream_to_callback_only=stream_to_callback_only,
+ return_full_response=return_full_response,
+ verbose=verbose,
+ return_traces=return_traces,
+ )
+
+ # Asynchronous call
+ return processor.run_async()
+
  def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
  """
  Block until all specified jobs have been marked submitted.
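
Taken at face value, the new method lets a caller overlap other work with ingestion and collect everything at the end. A minimal sketch, assuming `client` is an already-constructed NvIngestClient and `job_ids` came from `client.add_job(...)`; the queue name is illustrative:

    future = client.process_jobs_concurrently_async(
        job_indices=job_ids,
        job_queue_id="ingest_task_queue",  # illustrative queue name
        max_job_retries=3,
        return_traces=True,
    )
    # ... do other work while jobs are submitted and fetched in the background ...
    results, failures, traces = future.result()  # always resolves to the fixed 3-tuple
    for job_index, error_message in failures:
        print(f"job {job_index} failed: {error_message}")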
@@ -13,6 +13,7 @@ import os
  import shutil
  import tempfile
  import threading
+ from io import BytesIO
  from concurrent.futures import Future
  from concurrent.futures import ThreadPoolExecutor
  from concurrent.futures import as_completed
@@ -52,6 +53,7 @@ from nv_ingest_client.primitives.tasks import SplitTask
  from nv_ingest_client.primitives.tasks import StoreTask
  from nv_ingest_client.primitives.tasks import StoreEmbedTask
  from nv_ingest_client.primitives.tasks import UDFTask
+ from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
  from nv_ingest_client.util.processing import check_schema
  from nv_ingest_client.util.system import ensure_directory_with_permissions
  from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
@@ -224,6 +226,7 @@ class Ingestor:
  **kwargs,
  ):
  self._documents = documents or []
+ self._buffers = []
  self._client = client
  self._job_queue_id = job_queue_id
  self._vdb_bulk_upload = None
@@ -352,6 +355,28 @@ class Ingestor:

  return self

+ def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+ """
+ Add buffers for processing.
+
+ Parameters
+ ----------
+ buffers : List[Tuple[str, BytesIO]]
+ List of tuples containing the name of the buffer and the BytesIO object.
+ """
+ if (
+ isinstance(buffers, tuple)
+ and len(buffers) == 2
+ and isinstance(buffers[0], str)
+ and isinstance(buffers[1], BytesIO)
+ ):
+ buffers = [buffers]
+ self._buffers.extend(buffers)
+ self._job_specs = BatchJobSpec(self._buffers)
+ self._all_local = True
+
+ return self
+
  def load(self, **kwargs) -> "Ingestor":
  """
  Ensure all document files are accessible locally, downloading if necessary.
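
buffers() accepts either a single (name, BytesIO) tuple or a list of them and builds the BatchJobSpec directly from in-memory data. A minimal usage sketch, assuming a reachable nv-ingest service and that Ingestor is importable from nv_ingest_client.client; the file name is illustrative:

    from io import BytesIO
    from nv_ingest_client.client import Ingestor

    with open("report.pdf", "rb") as f:  # any in-memory bytes work here
        payload = BytesIO(f.read())

    results = (
        Ingestor()
        .buffers(("report.pdf", payload))  # or a list of (name, BytesIO) tuples
        .extract()
        .ingest()
    )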
@@ -397,6 +422,92 @@ class Ingestor:

  return self

+ def _resolve_source_name(self, job_id: str, results_data: Optional[Union[List, Dict]] = None) -> str:
+ """
+ Resolves the source name for a given job ID using available metadata or fallback options.
+
+ Parameters
+ ----------
+ job_id : str
+ The job identifier.
+ results_data : Any, optional
+ The data associated with the job result, which might contain metadata.
+
+ Returns
+ -------
+ str
+ The resolved source name.
+ """
+ source_name = "unknown_source"
+ job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+ if job_spec:
+ source_name = job_spec.source_name
+ else:
+ try:
+ if results_data:
+ first_item = results_data[0] if isinstance(results_data, list) and results_data else results_data
+ if isinstance(first_item, dict):
+ source_name = first_item.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
+ if not source_name:
+ source_name = f"{job_id}"
+ except (IndexError, KeyError, TypeError):
+ source_name = f"{job_id}"
+
+ return source_name
+
+ def _write_results_to_disk(self, doc_data: Any, source_name: str, job_id: str) -> Optional[LazyLoadedList]:
+ """
+ Writes the results for a single job to a JSONL file and returns a LazyLoadedList.
+
+ Parameters
+ ----------
+ doc_data : Any
+ The result data to save.
+ source_name : str
+ The name of the source document.
+ job_id : str
+ The job identifier.
+
+ Returns
+ -------
+ Optional[LazyLoadedList]
+ A proxy object to the saved file, or None if the save failed.
+ """
+ if not self._output_config:
+ logger.warning("Attempted to write results to disk without output configuration.")
+ return None
+
+ try:
+ output_dir = self._output_config["output_directory"]
+ clean_source_basename = get_valid_filename(os.path.basename(source_name))
+ file_name, file_ext = os.path.splitext(clean_source_basename)
+ file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+ if self._output_config["compression"] == "gzip":
+ file_suffix += ".gz"
+ jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
+
+ data_to_save = doc_data if isinstance(doc_data, list) else [doc_data]
+
+ num_items_saved = save_document_results_to_jsonl(
+ data_to_save,
+ jsonl_filepath,
+ source_name,
+ ensure_parent_dir_exists=False,
+ compression=self._output_config["compression"],
+ )
+
+ if num_items_saved > 0:
+ return LazyLoadedList(
+ jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+ )
+ except Exception as e_save:
+ logger.error(
+ f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
+ exc_info=True,
+ )
+ return None
+
  def ingest(
  self,
  show_progress: bool = False,
@@ -464,52 +575,19 @@ class Ingestor:

  def _perform_save_task(doc_data, job_id, source_name):
  # This function runs in the io_executor
- try:
- output_dir = self._output_config["output_directory"]
- clean_source_basename = get_valid_filename(os.path.basename(source_name))
- file_name, file_ext = os.path.splitext(clean_source_basename)
- file_suffix = f".{file_ext.strip('.')}.results.jsonl"
- if self._output_config["compression"] == "gzip":
- file_suffix += ".gz"
- jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
-
- num_items_saved = save_document_results_to_jsonl(
- doc_data,
- jsonl_filepath,
- source_name,
- ensure_parent_dir_exists=False,
- compression=self._output_config["compression"],
- )
-
- if num_items_saved > 0:
- results = LazyLoadedList(
- jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
- )
- if results_lock:
- with results_lock:
- final_results_payload_list.append(results)
- else: # Should not happen if io_executor is used
+ results = self._write_results_to_disk(doc_data, source_name, job_id)
+ if results:
+ if results_lock:
+ with results_lock:
  final_results_payload_list.append(results)
- except Exception as e_save:
- logger.error(
- f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
- exc_info=True,
- )
+ else: # Should not happen if io_executor is used
+ final_results_payload_list.append(results)

  def _disk_save_callback(
  results_data: Dict[str, Any],
  job_id: str,
  ):
- source_name = "unknown_source_in_callback"
- job_spec = self._client._job_index_to_job_spec.get(job_id)
- if job_spec:
- source_name = job_spec.source_name
- else:
- try:
- if results_data:
- source_name = results_data[0]["metadata"]["source_metadata"]["source_id"]
- except (IndexError, KeyError, TypeError):
- source_name = f"{job_id}"
+ source_name = self._resolve_source_name(job_id, results_data)

  if not results_data:
  logger.warning(f"No data in response for job {job_id} (source: {source_name}). Skipping save.")
@@ -669,57 +747,191 @@ class Ingestor:

  return tuple(returns) if len(returns) > 1 else results

- def ingest_async(self, **kwargs: Any) -> Future:
+ def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
  """
  Asynchronously submits jobs and returns a single future that completes when all jobs have finished.

+ The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+ `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+ upload is configured, the future will complete *after* the VDB upload finishes.
+
  Parameters
  ----------
+ return_failures : bool, optional
+ If True, return a tuple containing failures; otherwise, only return results. Default is False.
+ return_traces : bool, optional
+ If True, return trace metrics alongside results. Default is False.
  kwargs : dict
- Additional parameters for the `submit_job_async` method.
+ Additional parameters passed to the concurrent processor.
+ Optional flags include `include_parent_trace_ids=True` to also return
+ parent job trace identifiers (V2 API only).

  Returns
  -------
- Future
- A future that completes when all submitted jobs have reached a terminal state.
+ Future[Union[List[Any], Tuple[Any, ...]]]
+ A future that completes when all jobs and any subsequent VDB upload
+ have finished. Its result will be one of the following:
+ - Default: list of results
+ - return_failures=True: (results, failures)
+ - return_traces=True: (results, traces)
+ - return_failures=True, return_traces=True: (results, failures, traces)
+
  """
- self._prepare_ingest_run()
+ try:
+ self._prepare_ingest_run()

- self._job_ids = self._client.add_job(self._job_specs)
+ # Add jobs locally first
+ if self._job_specs is None:
+ raise RuntimeError("Job specs missing for ingest_async.")
+ self._job_ids = self._client.add_job(self._job_specs)
+ self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}

- future_to_job_id = self._client.submit_job_async(self._job_ids, self._job_queue_id, **kwargs)
- self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+ proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)

- combined_future = Future()
- submitted_futures = set(future_to_job_id.keys())
- completed_futures = set()
- future_results = []
- vdb_future = None
+ stream_to_callback_only = False
+ completion_callback = None
+ async_results_map = {}

- def _done_callback(future):
- job_id = future_to_job_id[future]
- job_state = self._job_states[job_id]
- try:
- result = self._client.fetch_job_result(job_id)
- if job_state.state != JobStateEnum.COMPLETED:
- job_state.state = JobStateEnum.COMPLETED
- except Exception:
- result = None
- if job_state.state != JobStateEnum.FAILED:
- job_state.state = JobStateEnum.FAILED
- completed_futures.add(future)
- future_results.extend(result)
- if completed_futures == submitted_futures:
- combined_future.set_result(future_results)
+ io_executor = None
+ io_futures = []

- for future in future_to_job_id:
- future.add_done_callback(_done_callback)
+ if self._output_config:
+ stream_to_callback_only = True
+ output_dir = self._output_config["output_directory"]

- if self._vdb_bulk_upload:
- executor = ThreadPoolExecutor(max_workers=1)
- vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
+ os.makedirs(output_dir, exist_ok=True)
+
+ io_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="IngestAsyncIO")
+
+ def _io_task(data: Dict[str, Any], job_id: str):
+ try:
+ source_name = self._resolve_source_name(job_id, data)
+ result = self._write_results_to_disk(data, source_name, job_id)
+ if result:
+ # Store the LazyLoadedList in our map using job_id as key
+ async_results_map[job_id] = result
+ except Exception as e:
+ logger.error(f"Error in async I/O task for job {job_id}: {e}", exc_info=True)
+
+ def _composite_callback(data: Dict[str, Any], job_id: str):
+ """Callback executed by worker threads to save data to disk."""
+ try:
+ future = io_executor.submit(_io_task, data, job_id)
+ io_futures.append(future)
+ except Exception as e:
+ logger.error(f"Error in async callback for job {job_id}: {e}", exc_info=True)
+
+ completion_callback = _composite_callback
+
+ final_future: Future = Future()
+
+ processor_future = self._client.process_jobs_concurrently_async(
+ job_indices=self._job_ids,
+ job_queue_id=self._job_queue_id,
+ return_traces=return_traces,
+ completion_callback=completion_callback,
+ stream_to_callback_only=stream_to_callback_only,
+ **proc_kwargs,
+ )
+
+ include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))

- return combined_future if not vdb_future else vdb_future
+ def _processor_done_callback(proc_future: Future):
+ """Callback to handle completion, VDB upload, and final result setting."""
+ try:
+ if proc_future.cancelled():
+ if not final_future.done():
+ final_future.cancel()
+ return
+ if proc_future.exception():
+ if not final_future.done():
+ final_future.set_exception(proc_future.exception())
+ return
+
+ results, failures, traces_list = proc_future.result()
+
+ if io_executor:
+ for f in as_completed(io_futures):
+ if f.exception():
+ logger.error(f"Async I/O task failed: {f.exception()}")
+ io_executor.shutdown(wait=True)
+
+ final_results_list = []
+ if self._output_config:
+ for item in results:
+ if isinstance(item, str) and item in async_results_map:
+ final_results_list.append(async_results_map[item])
+ else:
+ final_results_list = results
+
+ failed_job_ids = set()
+ for job_id_with_source, error_msg in failures:
+ job_id = job_id_with_source.split(":", 1)[0]
+ if job_id in self._job_states:
+ if self._job_states[job_id].state != JobStateEnum.FAILED:
+ self._job_states[job_id].state = JobStateEnum.FAILED
+ failed_job_ids.add(job_id)
+
+ all_submitted_job_ids = set(self._job_ids)
+ successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+ for job_id in successful_job_ids:
+ if job_id in self._job_states:
+ if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+ self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+ if self._vdb_bulk_upload and final_results_list:
+ with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+ results_future = Future()
+ results_future.set_result(final_results_list)
+ vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+ vdb_future.result()
+
+ if self._purge_results_after_vdb_upload and self._output_config:
+ logger.info("Purging saved results from disk after successful VDB upload.")
+ self._purge_saved_results(final_results_list)
+
+ parent_trace_ids = (
+ self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+ )
+
+ returns = [final_results_list]
+ if return_failures:
+ returns.append(failures)
+ if return_traces:
+ returns.append(traces_list)
+ if include_parent_trace_ids:
+ returns.append(parent_trace_ids)
+
+ final_result = tuple(returns) if len(returns) > 1 else final_results_list
+
+ if not final_future.done():
+ final_future.set_result(final_result)
+
+ except Exception as e:
+ logger.exception("Error in ingest_async processor callback")
+ if not final_future.done():
+ final_future.set_exception(e)
+ finally:
+ final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+ for job_state in self._job_states.values():
+ if (
+ job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+ and job_state.state != final_state
+ ):
+ job_state.state = final_state
+
+ if io_executor:
+ io_executor.shutdown(wait=False)
+
+ processor_future.add_done_callback(_processor_done_callback)
+ return final_future
+
+ except Exception as setup_err:
+ logger.exception("Failed during synchronous setup of ingest_async")
+ error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+ error_future.set_exception(setup_err)
+ return error_future

  @ensure_job_specs
  def _prepare_ingest_run(self):
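
With the reworked ingest_async(), the Future's payload mirrors the flags passed on the call, so the unpacking below matches return_failures=True, return_traces=True. A minimal sketch assuming the usual files()/extract()/embed() chain; the input glob is illustrative:

    ingestor = (
        Ingestor()
        .files("./data/*.pdf")  # illustrative input path
        .extract()
        .embed()
    )
    future = ingestor.ingest_async(return_failures=True, return_traces=True)
    results, failures, traces = future.result()  # shape follows the flags above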
@@ -863,11 +1075,18 @@ class Ingestor:
  **kwargs,
  )

+ api_document_type = EXTENSION_TO_DOCUMENT_TYPE.get(document_type.lower(), document_type)
+
  # Extract method from task_options for API schema
  method = task_options.pop("extract_method", None)
  if method is None:
  # Let ExtractTask constructor handle default method selection
- method = "pdfium" # Default fallback
+ if api_document_type == "docx":
+ method = "python_docx"
+ elif api_document_type == "pptx":
+ method = "python_pptx"
+ else:
+ method = "pdfium" # Default fallback

  # Build params dict for API schema
  params = {k: v for k, v in task_options.items() if k != "document_type"}
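
In practice the new lookup means an extract() call without an explicit extract_method picks a type-appropriate default; for example (file names illustrative, only the three branches shown in the hunk are assumed):

    Ingestor().files("notes.docx").extract()   # method defaults to "python_docx"
    Ingestor().files("slides.pptx").extract()  # method defaults to "python_pptx"
    Ingestor().files("paper.pdf").extract()    # method falls back to "pdfium"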
@@ -988,13 +1207,9 @@ class Ingestor:
  Ingestor
  Returns self for chaining.
  """
- # Handle parameter name mapping: store_method -> method for API schema
- if "store_method" in kwargs:
- kwargs["method"] = kwargs.pop("store_method")
-
- # Provide default method if not specified (matching client StoreTask behavior)
- if "method" not in kwargs:
- kwargs["method"] = "minio"
+ deprecated_method = kwargs.pop("store_method", None)
+ if deprecated_method is not None:
+ logger.warning("`store_method` is deprecated and no longer used. Configure storage_uri instead.")

  task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))

@@ -1002,7 +1217,9 @@ class Ingestor:
  store_params = {
  "structured": task_options.structured,
  "images": task_options.images,
- "store_method": task_options.method, # Map method back to store_method
+ "storage_uri": task_options.storage_uri,
+ "storage_options": task_options.storage_options,
+ "public_base_url": task_options.public_base_url,
  "params": task_options.params,
  }
  store_task = StoreTask(**store_params)
1247
1464
  "api_key": task_options.api_key,
1248
1465
  "endpoint_url": task_options.endpoint_url,
1249
1466
  "prompt": task_options.prompt,
1467
+ "system_prompt": task_options.system_prompt,
1250
1468
  "model_name": task_options.model_name,
1251
1469
  }
1252
1470
  caption_task = CaptionTask(**caption_params)
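
caption() now forwards a system prompt alongside the user prompt. A sketch with illustrative prompt text and model name (only the parameter names come from the hunk above):

    ingestor = (
        Ingestor()
        .files("./data/report.pdf")
        .extract(extract_images=True)
        .caption(
            system_prompt="You write terse, factual image captions.",  # illustrative
            prompt="Caption this image in one sentence.",              # illustrative
            model_name="nvidia/llama-3.1-nemotron-nano-vl-8b-v1",      # illustrative
        )
    )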
@@ -76,7 +76,7 @@ logger = logging.getLogger(__name__)
  @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
  @click.option(
  "--api_version",
- default="v1",
+ default="v2",
  type=click.Choice(["v1", "v2"], case_sensitive=False),
  help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
  )
@@ -120,7 +120,7 @@ Each task must be specified with its type and corresponding options in the '[tas
  Example:
  --task 'split:{"split_by":"page", "split_length":10}'
  --task 'extract:{"document_type":"pdf", "extract_text":true}'
- --task 'extract:{"document_type":"pdf", "extract_method":"nemoretriever_parse"}'
+ --task 'extract:{"document_type":"pdf", "extract_method":"nemotron_parse"}'
  --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
  --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
  --task 'embed'