nv-ingest-client 2025.11.14.dev20251114.tar.gz → 2025.12.14.dev20251214.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info → nv_ingest_client-2025.12.14.dev20251214}/PKG-INFO +2 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/pyproject.toml +1 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/client.py +112 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/interface.py +301 -83
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/nv_ingest_cli.py +2 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_spec.py +27 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/caption.py +12 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/extract.py +50 -2
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/store.py +18 -13
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/extract.py +23 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/util.py +34 -1
- nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/adt_vdb.py +243 -0
- nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client/util/vdb/lancedb.py +276 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/milvus.py +44 -21
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214/src/nv_ingest_client.egg-info}/PKG-INFO +2 -1
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/requires.txt +1 -0
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -27
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/LICENSE +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/README.md +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/setup.cfg +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/ingest_job_handler.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.11.14.dev20251114 → nv_ingest_client-2025.12.14.dev20251214}/src/version.py +0 -0
PKG-INFO:
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.11.14.dev20251114
+Version: 2025.12.14.dev20251214
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
```
```diff
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
 Requires-Dist: requests>=2.28.2
 Requires-Dist: setuptools>=78.1.1
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: lancedb>=0.25.3
 Provides-Extra: milvus
 Requires-Dist: pymilvus==2.5.10; extra == "milvus"
 Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
```
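The new `lancedb>=0.25.3` requirement backs the freshly added `src/nv_ingest_client/util/vdb/lancedb.py` operator (+276 lines, not rendered in this diff). For orientation only, a minimal sketch of the underlying LanceDB library the dependency pulls in; the table name, schema, and path are illustrative, and the nv-ingest wrapper's own API is not shown here:

```python
import lancedb

# Connect to a local LanceDB directory (path is a placeholder).
db = lancedb.connect("/tmp/nv_ingest_lancedb")

# Create a tiny table with a vector column, then run a nearest-neighbour search.
table = db.create_table(
    "demo_chunks",
    data=[
        {"vector": [0.1, 0.2, 0.3], "text": "first chunk"},
        {"vector": [0.2, 0.1, 0.0], "text": "second chunk"},
    ],
)
hits = table.search([0.1, 0.2, 0.25]).limit(1).to_list()
print(hits[0]["text"])
```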
src/nv_ingest_client/client/client.py:
```diff
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
         if not self.job_queue_id:
             logger.warning("job_queue_id is not set; submission of new jobs will fail.")
 
+        # Executor check required for run_async
+        if not hasattr(client, "_worker_pool"):
+            raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+        if not isinstance(client._worker_pool, ThreadPoolExecutor):
+            raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+        self._executor = client._worker_pool
+
     # --------------------------------------------------------------------------
     # Private Methods
     # --------------------------------------------------------------------------
```
```diff
@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
             # Use a method assumed to safely get the state object
-            job_state = self.client.
+            job_state = self.client._get_and_check_job_state(job_index)
             # Check state exists and is not already terminal before updating
             if (
                 job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
```
```diff
@@ -495,7 +502,10 @@
 
         return batch_futures_dict, normalized_job_indices
 
-
+    # --------------------------------------------------------------------------
+    # Core Processing Logic
+    # --------------------------------------------------------------------------
+    def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.
 
```
```diff
@@ -640,6 +650,44 @@ class _ConcurrentProcessor:
 
         return self.results, self.failures, self.traces if self.return_traces else []
 
+    # --------------------------------------------------------------------------
+    # Public Methods
+    # --------------------------------------------------------------------------
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+        """
+        Executes the main processing loop synchronously.
+
+        This method orchestrates the job processing by maintaining a constant
+        pool of in-flight jobs, handling submissions, fetches, and retries until
+        all jobs are complete. It blocks until all jobs are processed.
+
+        Returns
+        -------
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+            A tuple containing:
+            1. A list of successfully fetched job results.
+            2. A list of tuples for failed jobs (job_index, error_message).
+            3. A list of trace dictionaries if `return_traces` was True.
+        """
+        return self._process_all_jobs()
+
+    def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Executes the main processing loop asynchronously.
+
+        Submits the entire processing logic to the client's background
+        thread pool and returns a Future that resolves with the final
+        results, failures, and traces once all jobs are complete.
+
+        Returns
+        -------
+        Future
+            A future representing the asynchronous execution. Its result()
+            will be a tuple containing (results, failures, traces).
+        """
+        return self._executor.submit(self._process_all_jobs)
+
 
 class NvIngestClient:
     """
```
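The new public surface of `_ConcurrentProcessor` is a thin delegation: `run()` executes the loop inline, while `run_async()` submits the same bound method to the client's shared thread pool (the `_worker_pool` validated in the constructor above). A self-contained toy illustration of that pattern, with generic names not taken from this codebase:

```python
from concurrent.futures import Future, ThreadPoolExecutor
from typing import List, Tuple


class ToyProcessor:
    """Stand-in showing the run()/run_async() delegation pattern."""

    def __init__(self, executor: ThreadPoolExecutor):
        self._executor = executor

    def _process_all_jobs(self) -> Tuple[List[str], List[str]]:
        # Pretend to do blocking work, returning (results, failures).
        return (["ok"], [])

    def run(self) -> Tuple[List[str], List[str]]:
        # Synchronous: blocks the caller until the loop finishes.
        return self._process_all_jobs()

    def run_async(self) -> Future:
        # Asynchronous: same work, submitted to a shared pool; caller gets a Future.
        return self._executor.submit(self._process_all_jobs)


pool = ThreadPoolExecutor(max_workers=2)
future = ToyProcessor(pool).run_async()
print(future.result())  # (['ok'], [])
```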
```diff
@@ -1377,6 +1425,68 @@ class NvIngestClient:
         logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
         return results
 
+    def process_jobs_concurrently_async(
+        self,
+        job_indices: Union[str, List[str]],
+        job_queue_id: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        timeout: int = 100,
+        max_job_retries: Optional[int] = None,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
+        fail_on_submit_error: bool = False,
+        completion_callback: Optional[Callable[[Any, str], None]] = None,
+        stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
+        verbose: bool = False,
+        return_traces: bool = False,
+    ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Submit and fetch multiple jobs concurrently and asynchronously.
+
+        This method initializes the processing and returns a Future immediately. The Future
+        will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+        jobs have completed.
+
+        Parameters are identical to `process_jobs_concurrently`.
+
+        Returns
+        -------
+        Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+            A future that completes when all jobs are done. Its result is a tuple
+            containing (successful_results, failures, traces).
+        """
+        if isinstance(job_indices, str):
+            job_indices = [job_indices]
+
+        if not job_indices:
+            immediate_future: Future = Future()
+            immediate_future.set_result(([], [], []))
+            return immediate_future
+
+        validated_batch_size = self._validate_batch_size(batch_size)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+        processor = _ConcurrentProcessor(
+            client=self,
+            batch_size=validated_batch_size,
+            job_indices=job_indices,
+            job_queue_id=job_queue_id,
+            timeout=effective_timeout,
+            max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
+            completion_callback=completion_callback,
+            fail_on_submit_error=fail_on_submit_error,
+            stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
+            verbose=verbose,
+            return_traces=return_traces,
+        )
+
+        # Asynchronous call
+        return processor.run_async()
+
     def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
         """
         Block until all specified jobs have been marked submitted.
```
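A sketch of how the new asynchronous client entry point might be driven; the job-spec preparation and queue id below are placeholders rather than values taken from this diff:

```python
from nv_ingest_client.client import NvIngestClient

client = NvIngestClient()  # connection arguments omitted
job_indices = client.add_job(job_specs)  # `job_specs` assumed to be built elsewhere

# Returns a Future immediately; process_jobs_concurrently would block here instead.
future = client.process_jobs_concurrently_async(
    job_indices,
    job_queue_id="ingest_task_queue",  # placeholder queue id
    return_traces=True,
)

# result() blocks until every job has completed, failed, or exhausted its retries.
results, failures, traces = future.result()
```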
src/nv_ingest_client/client/interface.py:
```diff
@@ -13,6 +13,7 @@ import os
 import shutil
 import tempfile
 import threading
+from io import BytesIO
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
```
```diff
@@ -52,6 +53,7 @@ from nv_ingest_client.primitives.tasks import SplitTask
 from nv_ingest_client.primitives.tasks import StoreTask
 from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
 from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
```
```diff
@@ -224,6 +226,7 @@ class Ingestor:
         **kwargs,
     ):
         self._documents = documents or []
+        self._buffers = []
         self._client = client
         self._job_queue_id = job_queue_id
         self._vdb_bulk_upload = None
```
```diff
@@ -352,6 +355,28 @@ class Ingestor:
 
         return self
 
+    def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+        """
+        Add buffers for processing.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            List of tuples containing the name of the buffer and the BytesIO object.
+        """
+        if (
+            isinstance(buffers, tuple)
+            and len(buffers) == 2
+            and isinstance(buffers[0], str)
+            and isinstance(buffers[1], BytesIO)
+        ):
+            buffers = [buffers]
+        self._buffers.extend(buffers)
+        self._job_specs = BatchJobSpec(self._buffers)
+        self._all_local = True
+
+        return self
+
     def load(self, **kwargs) -> "Ingestor":
         """
         Ensure all document files are accessible locally, downloading if necessary.
```
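With `buffers()`, in-memory documents can be queued without touching disk. A minimal sketch of the intended usage; the file name carries the extension so the document type can be inferred (an assumption based on the accompanying `BatchJobSpec` and `job_spec.py` changes):

```python
from io import BytesIO

from nv_ingest_client.client import Ingestor

# Load a PDF into memory; any bytes source works, the name just needs an extension.
with open("report.pdf", "rb") as f:
    pdf_buffer = BytesIO(f.read())

ingestor = (
    Ingestor()  # client/queue configuration omitted
    .buffers(("report.pdf", pdf_buffer))  # a single (name, BytesIO) tuple or a list of them
    .extract()
    .embed()
)
results = ingestor.ingest()
```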
```diff
@@ -397,6 +422,92 @@ class Ingestor:
 
         return self
 
+    def _resolve_source_name(self, job_id: str, results_data: Optional[Union[List, Dict]] = None) -> str:
+        """
+        Resolves the source name for a given job ID using available metadata or fallback options.
+
+        Parameters
+        ----------
+        job_id : str
+            The job identifier.
+        results_data : Any, optional
+            The data associated with the job result, which might contain metadata.
+
+        Returns
+        -------
+        str
+            The resolved source name.
+        """
+        source_name = "unknown_source"
+        job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+        if job_spec:
+            source_name = job_spec.source_name
+        else:
+            try:
+                if results_data:
+                    first_item = results_data[0] if isinstance(results_data, list) and results_data else results_data
+                    if isinstance(first_item, dict):
+                        source_name = first_item.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
+                if not source_name:
+                    source_name = f"{job_id}"
+            except (IndexError, KeyError, TypeError):
+                source_name = f"{job_id}"
+
+        return source_name
+
+    def _write_results_to_disk(self, doc_data: Any, source_name: str, job_id: str) -> Optional[LazyLoadedList]:
+        """
+        Writes the results for a single job to a JSONL file and returns a LazyLoadedList.
+
+        Parameters
+        ----------
+        doc_data : Any
+            The result data to save.
+        source_name : str
+            The name of the source document.
+        job_id : str
+            The job identifier.
+
+        Returns
+        -------
+        Optional[LazyLoadedList]
+            A proxy object to the saved file, or None if the save failed.
+        """
+        if not self._output_config:
+            logger.warning("Attempted to write results to disk without output configuration.")
+            return None
+
+        try:
+            output_dir = self._output_config["output_directory"]
+            clean_source_basename = get_valid_filename(os.path.basename(source_name))
+            file_name, file_ext = os.path.splitext(clean_source_basename)
+            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
+            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
+
+            data_to_save = doc_data if isinstance(doc_data, list) else [doc_data]
+
+            num_items_saved = save_document_results_to_jsonl(
+                data_to_save,
+                jsonl_filepath,
+                source_name,
+                ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
+            )
+
+            if num_items_saved > 0:
+                return LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
+        except Exception as e_save:
+            logger.error(
+                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
+                exc_info=True,
+            )
+        return None
+
     def ingest(
         self,
         show_progress: bool = False,
```
```diff
@@ -464,52 +575,19 @@ class Ingestor:
 
         def _perform_save_task(doc_data, job_id, source_name):
             # This function runs in the io_executor
-
-
-
-
-            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
-            if self._output_config["compression"] == "gzip":
-                file_suffix += ".gz"
-            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
-
-            num_items_saved = save_document_results_to_jsonl(
-                doc_data,
-                jsonl_filepath,
-                source_name,
-                ensure_parent_dir_exists=False,
-                compression=self._output_config["compression"],
-            )
-
-            if num_items_saved > 0:
-                results = LazyLoadedList(
-                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
-                )
-                if results_lock:
-                    with results_lock:
-                        final_results_payload_list.append(results)
-                else:  # Should not happen if io_executor is used
+            results = self._write_results_to_disk(doc_data, source_name, job_id)
+            if results:
+                if results_lock:
+                    with results_lock:
                         final_results_payload_list.append(results)
-
-
-                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
-                exc_info=True,
-            )
+                else:  # Should not happen if io_executor is used
+                    final_results_payload_list.append(results)
 
         def _disk_save_callback(
             results_data: Dict[str, Any],
             job_id: str,
         ):
-            source_name =
-            job_spec = self._client._job_index_to_job_spec.get(job_id)
-            if job_spec:
-                source_name = job_spec.source_name
-            else:
-                try:
-                    if results_data:
-                        source_name = results_data[0]["metadata"]["source_metadata"]["source_id"]
-                except (IndexError, KeyError, TypeError):
-                    source_name = f"{job_id}"
+            source_name = self._resolve_source_name(job_id, results_data)
 
             if not results_data:
                 logger.warning(f"No data in response for job {job_id} (source: {source_name}). Skipping save.")
```
```diff
@@ -669,57 +747,191 @@ class Ingestor:
 
         return tuple(returns) if len(returns) > 1 else results
 
-    def ingest_async(self, **kwargs: Any) -> Future:
+    def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
         """
         Asynchronously submits jobs and returns a single future that completes when all jobs have finished.
 
+        The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+        `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+        upload is configured, the future will complete *after* the VDB upload finishes.
+
         Parameters
         ----------
+        return_failures : bool, optional
+            If True, return a tuple containing failures; otherwise, only return results. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
         kwargs : dict
-            Additional parameters
+            Additional parameters passed to the concurrent processor.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        Future
-            A future that completes when all
+        Future[Union[List[Any], Tuple[Any, ...]]]
+            A future that completes when all jobs and any subsequent VDB upload
+            have finished. Its result will be one of the following:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+
         """
-
+        try:
+            self._prepare_ingest_run()
 
-
+            # Add jobs locally first
+            if self._job_specs is None:
+                raise RuntimeError("Job specs missing for ingest_async.")
+            self._job_ids = self._client.add_job(self._job_specs)
+            self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
 
-
-        self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+            proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)
 
-
-
-
-        future_results = []
-        vdb_future = None
+            stream_to_callback_only = False
+            completion_callback = None
+            async_results_map = {}
 
-
-
-            job_state = self._job_states[job_id]
-            try:
-                result = self._client.fetch_job_result(job_id)
-                if job_state.state != JobStateEnum.COMPLETED:
-                    job_state.state = JobStateEnum.COMPLETED
-            except Exception:
-                result = None
-                if job_state.state != JobStateEnum.FAILED:
-                    job_state.state = JobStateEnum.FAILED
-            completed_futures.add(future)
-            future_results.extend(result)
-            if completed_futures == submitted_futures:
-                combined_future.set_result(future_results)
+            io_executor = None
+            io_futures = []
 
-
-
+            if self._output_config:
+                stream_to_callback_only = True
+                output_dir = self._output_config["output_directory"]
 
-
-
-
+                os.makedirs(output_dir, exist_ok=True)
+
+                io_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="IngestAsyncIO")
+
+                def _io_task(data: Dict[str, Any], job_id: str):
+                    try:
+                        source_name = self._resolve_source_name(job_id, data)
+                        result = self._write_results_to_disk(data, source_name, job_id)
+                        if result:
+                            # Store the LazyLoadedList in our map using job_id as key
+                            async_results_map[job_id] = result
+                    except Exception as e:
+                        logger.error(f"Error in async I/O task for job {job_id}: {e}", exc_info=True)
+
+                def _composite_callback(data: Dict[str, Any], job_id: str):
+                    """Callback executed by worker threads to save data to disk."""
+                    try:
+                        future = io_executor.submit(_io_task, data, job_id)
+                        io_futures.append(future)
+                    except Exception as e:
+                        logger.error(f"Error in async callback for job {job_id}: {e}", exc_info=True)
+
+                completion_callback = _composite_callback
+
+            final_future: Future = Future()
+
+            processor_future = self._client.process_jobs_concurrently_async(
+                job_indices=self._job_ids,
+                job_queue_id=self._job_queue_id,
+                return_traces=return_traces,
+                completion_callback=completion_callback,
+                stream_to_callback_only=stream_to_callback_only,
+                **proc_kwargs,
+            )
+
+            include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))
 
-
+            def _processor_done_callback(proc_future: Future):
+                """Callback to handle completion, VDB upload, and final result setting."""
+                try:
+                    if proc_future.cancelled():
+                        if not final_future.done():
+                            final_future.cancel()
+                        return
+                    if proc_future.exception():
+                        if not final_future.done():
+                            final_future.set_exception(proc_future.exception())
+                        return
+
+                    results, failures, traces_list = proc_future.result()
+
+                    if io_executor:
+                        for f in as_completed(io_futures):
+                            if f.exception():
+                                logger.error(f"Async I/O task failed: {f.exception()}")
+                        io_executor.shutdown(wait=True)
+
+                    final_results_list = []
+                    if self._output_config:
+                        for item in results:
+                            if isinstance(item, str) and item in async_results_map:
+                                final_results_list.append(async_results_map[item])
+                    else:
+                        final_results_list = results
+
+                    failed_job_ids = set()
+                    for job_id_with_source, error_msg in failures:
+                        job_id = job_id_with_source.split(":", 1)[0]
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.FAILED:
+                                self._job_states[job_id].state = JobStateEnum.FAILED
+                            failed_job_ids.add(job_id)
+
+                    all_submitted_job_ids = set(self._job_ids)
+                    successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+                    for job_id in successful_job_ids:
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+                                self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+                    if self._vdb_bulk_upload and final_results_list:
+                        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+                            results_future = Future()
+                            results_future.set_result(final_results_list)
+                            vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+                            vdb_future.result()
+
+                        if self._purge_results_after_vdb_upload and self._output_config:
+                            logger.info("Purging saved results from disk after successful VDB upload.")
+                            self._purge_saved_results(final_results_list)
+
+                    parent_trace_ids = (
+                        self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+                    )
+
+                    returns = [final_results_list]
+                    if return_failures:
+                        returns.append(failures)
+                    if return_traces:
+                        returns.append(traces_list)
+                    if include_parent_trace_ids:
+                        returns.append(parent_trace_ids)
+
+                    final_result = tuple(returns) if len(returns) > 1 else final_results_list
+
+                    if not final_future.done():
+                        final_future.set_result(final_result)
+
+                except Exception as e:
+                    logger.exception("Error in ingest_async processor callback")
+                    if not final_future.done():
+                        final_future.set_exception(e)
+                finally:
+                    final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+                    for job_state in self._job_states.values():
+                        if (
+                            job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+                            and job_state.state != final_state
+                        ):
+                            job_state.state = final_state
+
+                    if io_executor:
+                        io_executor.shutdown(wait=False)
+
+            processor_future.add_done_callback(_processor_done_callback)
+            return final_future
+
+        except Exception as setup_err:
+            logger.exception("Failed during synchronous setup of ingest_async")
+            error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+            error_future.set_exception(setup_err)
+            return error_future
 
     @ensure_job_specs
     def _prepare_ingest_run(self):
```
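Taken together, the reworked `ingest_async` now returns a Future whose result mirrors the synchronous `ingest()` shape. A minimal sketch of driving it; the file path and task chain are placeholders:

```python
from nv_ingest_client.client import Ingestor

ingestor = Ingestor().files("./data/report.pdf").extract().embed()

# Non-blocking: a concurrent.futures.Future is returned immediately.
future = ingestor.ingest_async(return_failures=True, return_traces=True)

# Resolves only after all jobs (and any configured VDB upload) have finished.
results, failures, traces = future.result()
for job_ref, error in failures:
    print(f"{job_ref} failed: {error}")
```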
```diff
@@ -863,11 +1075,18 @@ class Ingestor:
             **kwargs,
         )
 
+        api_document_type = EXTENSION_TO_DOCUMENT_TYPE.get(document_type.lower(), document_type)
+
         # Extract method from task_options for API schema
         method = task_options.pop("extract_method", None)
         if method is None:
             # Let ExtractTask constructor handle default method selection
-
+            if api_document_type == "docx":
+                method = "python_docx"
+            elif api_document_type == "pptx":
+                method = "python_pptx"
+            else:
+                method = "pdfium"  # Default fallback
 
         # Build params dict for API schema
         params = {k: v for k, v in task_options.items() if k != "document_type"}
```
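In practice the extract task now picks a per-format default when no `extract_method` is supplied. A short illustration; the file paths are placeholders:

```python
from nv_ingest_client.client import Ingestor

# No extract_method given: the default is inferred from the document type,
# python_docx for .docx, python_pptx for .pptx, and pdfium for everything else.
ingestor = Ingestor().files("./data/slides.pptx").extract()

# An explicit extract_method still overrides the inferred default, e.g.:
# Ingestor().files("./data/report.pdf").extract(extract_method="nemotron_parse")
```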
```diff
@@ -988,13 +1207,9 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-
-        if
-
-
-        # Provide default method if not specified (matching client StoreTask behavior)
-        if "method" not in kwargs:
-            kwargs["method"] = "minio"
+        deprecated_method = kwargs.pop("store_method", None)
+        if deprecated_method is not None:
+            logger.warning("`store_method` is deprecated and no longer used. Configure storage_uri instead.")
 
         task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
 
```
```diff
@@ -1002,7 +1217,9 @@ class Ingestor:
         store_params = {
             "structured": task_options.structured,
             "images": task_options.images,
-            "
+            "storage_uri": task_options.storage_uri,
+            "storage_options": task_options.storage_options,
+            "public_base_url": task_options.public_base_url,
             "params": task_options.params,
         }
         store_task = StoreTask(**store_params)
```
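A sketch of how the reworked store task might be configured now that `storage_uri`, `storage_options`, and `public_base_url` replace the old minio-only default. The bucket, credential keys, and URL are illustrative; the exact `storage_options` fields accepted by `IngestTaskStoreSchema` are not shown in this diff:

```python
from nv_ingest_client.client import Ingestor

ingestor = (
    Ingestor()
    .files("./data/*.pdf")
    .extract(extract_images=True)
    .store(
        structured=True,
        images=True,
        storage_uri="s3://my-ingest-artifacts",  # placeholder bucket
        storage_options={"key": "<access-key>", "secret": "<secret-key>"},  # hypothetical option names
        public_base_url="https://cdn.example.com/artifacts",
    )
)
```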
```diff
@@ -1247,6 +1464,7 @@ class Ingestor:
             "api_key": task_options.api_key,
             "endpoint_url": task_options.endpoint_url,
             "prompt": task_options.prompt,
+            "system_prompt": task_options.system_prompt,
             "model_name": task_options.model_name,
         }
         caption_task = CaptionTask(**caption_params)
```
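The caption task now forwards a `system_prompt` in addition to the user prompt. A brief sketch; the prompts and model name are placeholders:

```python
from nv_ingest_client.client import Ingestor

ingestor = (
    Ingestor()
    .files("./data/report.pdf")
    .extract(extract_images=True)
    .caption(
        system_prompt="You are a precise assistant that describes figures for retrieval.",
        prompt="Caption the image in one sentence.",
        model_name="<vlm-model-name>",  # placeholder
    )
)
```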
src/nv_ingest_client/nv_ingest_cli.py:
```diff
@@ -76,7 +76,7 @@ logger = logging.getLogger(__name__)
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
 @click.option(
     "--api_version",
-    default="
+    default="v2",
     type=click.Choice(["v1", "v2"], case_sensitive=False),
     help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
 )
```
```diff
@@ -120,7 +120,7 @@ Each task must be specified with its type and corresponding options in the '[tas
 Example:
     --task 'split:{"split_by":"page", "split_length":10}'
     --task 'extract:{"document_type":"pdf", "extract_text":true}'
-    --task 'extract:{"document_type":"pdf", "extract_method":"
+    --task 'extract:{"document_type":"pdf", "extract_method":"nemotron_parse"}'
     --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
```