nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.11.27.dev20251127__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/client.py +112 -2
- nv_ingest_client/client/interface.py +134 -34
- nv_ingest_client/primitives/jobs/job_spec.py +26 -1
- nv_ingest_client/util/file_processing/extract.py +23 -0
- nv_ingest_client/util/util.py +34 -1
- nv_ingest_client/util/vdb/adt_vdb.py +216 -0
- nv_ingest_client/util/vdb/milvus.py +26 -17
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/RECORD +13 -13
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/top_level.txt +0 -0

nv_ingest_client/client/client.py
CHANGED

@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
         if not self.job_queue_id:
             logger.warning("job_queue_id is not set; submission of new jobs will fail.")

+        # Executor check required for run_async
+        if not hasattr(client, "_worker_pool"):
+            raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+        if not isinstance(client._worker_pool, ThreadPoolExecutor):
+            raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+        self._executor = client._worker_pool
+
     # --------------------------------------------------------------------------
     # Private Methods
     # --------------------------------------------------------------------------

@@ -246,7 +253,7 @@ class _ConcurrentProcessor:
         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
             # Use a method assumed to safely get the state object
-            job_state = self.client.
+            job_state = self.client._get_and_check_job_state(job_index)
             # Check state exists and is not already terminal before updating
             if (
                 job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]

@@ -495,7 +502,10 @@ class _ConcurrentProcessor:

         return batch_futures_dict, normalized_job_indices

-
+    # --------------------------------------------------------------------------
+    # Core Processing Logic
+    # --------------------------------------------------------------------------
+    def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
        """
        Executes the main processing loop in batches.


@@ -640,6 +650,44 @@ class _ConcurrentProcessor:

         return self.results, self.failures, self.traces if self.return_traces else []

+    # --------------------------------------------------------------------------
+    # Public Methods
+    # --------------------------------------------------------------------------
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+        """
+        Executes the main processing loop synchronously.
+
+        This method orchestrates the job processing by maintaining a constant
+        pool of in-flight jobs, handling submissions, fetches, and retries until
+        all jobs are complete. It blocks until all jobs are processed.
+
+        Returns
+        -------
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+            A tuple containing:
+            1. A list of successfully fetched job results.
+            2. A list of tuples for failed jobs (job_index, error_message).
+            3. A list of trace dictionaries if `return_traces` was True.
+        """
+        return self._process_all_jobs()
+
+    def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Executes the main processing loop asynchronously.
+
+        Submits the entire processing logic to the client's background
+        thread pool and returns a Future that resolves with the final
+        results, failures, and traces once all jobs are complete.
+
+        Returns
+        -------
+        Future
+            A future representing the asynchronous execution. Its result()
+            will be a tuple containing (results, failures, traces).
+        """
+        return self._executor.submit(self._process_all_jobs)
+

 class NvIngestClient:
     """

@@ -1377,6 +1425,68 @@ class NvIngestClient:
         logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
         return results

+    def process_jobs_concurrently_async(
+        self,
+        job_indices: Union[str, List[str]],
+        job_queue_id: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        timeout: int = 100,
+        max_job_retries: Optional[int] = None,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
+        fail_on_submit_error: bool = False,
+        completion_callback: Optional[Callable[[Any, str], None]] = None,
+        stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
+        verbose: bool = False,
+        return_traces: bool = False,
+    ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Submit and fetch multiple jobs concurrently and asynchronously.
+
+        This method initializes the processing and returns a Future immediately. The Future
+        will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+        jobs have completed.
+
+        Parameters are identical to `process_jobs_concurrently`.
+
+        Returns
+        -------
+        Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+            A future that completes when all jobs are done. Its result is a tuple
+            containing (successful_results, failures, traces).
+        """
+        if isinstance(job_indices, str):
+            job_indices = [job_indices]
+
+        if not job_indices:
+            immediate_future: Future = Future()
+            immediate_future.set_result(([], [], []))
+            return immediate_future
+
+        validated_batch_size = self._validate_batch_size(batch_size)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+        processor = _ConcurrentProcessor(
+            client=self,
+            batch_size=validated_batch_size,
+            job_indices=job_indices,
+            job_queue_id=job_queue_id,
+            timeout=effective_timeout,
+            max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
+            completion_callback=completion_callback,
+            fail_on_submit_error=fail_on_submit_error,
+            stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
+            verbose=verbose,
+            return_traces=return_traces,
+        )
+
+        # Asynchronous call
+        return processor.run_async()
+
     def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
         """
         Block until all specified jobs have been marked submitted.
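
A usage sketch for the new asynchronous entry point. It assumes jobs were already added with `add_job` and that `NvIngestClient` is importable from `nv_ingest_client.client`; the queue id shown is illustrative:

    from concurrent.futures import Future
    from nv_ingest_client.client import NvIngestClient

    client = NvIngestClient()                      # default connection settings assumed
    job_indices = client.add_job(job_specs)        # job_specs built elsewhere

    # Returns immediately; the Future resolves to a (results, failures, traces) tuple.
    future: Future = client.process_jobs_concurrently_async(
        job_indices=job_indices,
        job_queue_id="ingest_task_queue",          # illustrative queue id
        return_traces=True,
    )
    results, failures, traces = future.result()    # block only when the caller needs the outcome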

nv_ingest_client/client/interface.py
CHANGED

@@ -13,6 +13,7 @@ import os
 import shutil
 import tempfile
 import threading
+from io import BytesIO
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed

@@ -224,6 +225,7 @@ class Ingestor:
         **kwargs,
     ):
         self._documents = documents or []
+        self._buffers = []
         self._client = client
         self._job_queue_id = job_queue_id
         self._vdb_bulk_upload = None

@@ -352,6 +354,28 @@ class Ingestor:

         return self

+    def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+        """
+        Add buffers for processing.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            List of tuples containing the name of the buffer and the BytesIO object.
+        """
+        if (
+            isinstance(buffers, tuple)
+            and len(buffers) == 2
+            and isinstance(buffers[0], str)
+            and isinstance(buffers[1], BytesIO)
+        ):
+            buffers = [buffers]
+        self._buffers.extend(buffers)
+        self._job_specs = BatchJobSpec(self._buffers)
+        self._all_local = True
+
+        return self
+
     def load(self, **kwargs) -> "Ingestor":
         """
         Ensure all document files are accessible locally, downloading if necessary.
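
A sketch of the new `buffers()` builder on `Ingestor`, assuming an in-memory PDF; the import path and the `extract()` task chain are assumptions based on the existing Ingestor API:

    from io import BytesIO
    from nv_ingest_client.client import Ingestor

    with open("report.pdf", "rb") as f:
        pdf_bytes = BytesIO(f.read())

    # A single (name, BytesIO) tuple is promoted to a one-element list internally.
    ingestor = Ingestor().buffers(("report.pdf", pdf_bytes)).extract()
    results = ingestor.ingest()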

@@ -669,57 +693,133 @@ class Ingestor:

         return tuple(returns) if len(returns) > 1 else results

-    def ingest_async(self, **kwargs: Any) -> Future:
+    def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
         """
         Asynchronously submits jobs and returns a single future that completes when all jobs have finished.

+        The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+        `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+        upload is configured, the future will complete *after* the VDB upload finishes.
+
         Parameters
         ----------
+        return_failures : bool, optional
+            If True, return a tuple containing failures; otherwise, only return results. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
         kwargs : dict
-            Additional parameters
+            Additional parameters passed to the concurrent processor.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).

         Returns
         -------
-        Future
-            A future that completes when all
+        Future[Union[List[Any], Tuple[Any, ...]]]
+            A future that completes when all jobs and any subsequent VDB upload
+            have finished. Its result will be one of the following:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+
         """
-
+        try:
+            self._prepare_ingest_run()

-
+            # Add jobs locally first
+            if self._job_specs is None:
+                raise RuntimeError("Job specs missing for ingest_async.")
+            self._job_ids = self._client.add_job(self._job_specs)
+            self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}

-
-        self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+            proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)

-
-        submitted_futures = set(future_to_job_id.keys())
-        completed_futures = set()
-        future_results = []
-        vdb_future = None
+            final_future: Future = Future()

-
-
-
-
-
-
-                    job_state.state = JobStateEnum.COMPLETED
-                except Exception:
-                    result = None
-                    if job_state.state != JobStateEnum.FAILED:
-                        job_state.state = JobStateEnum.FAILED
-                completed_futures.add(future)
-                future_results.extend(result)
-                if completed_futures == submitted_futures:
-                    combined_future.set_result(future_results)
+            processor_future = self._client.process_jobs_concurrently_async(
+                job_indices=self._job_ids,
+                job_queue_id=self._job_queue_id,
+                return_traces=return_traces,
+                **proc_kwargs,
+            )

-
-            future.add_done_callback(_done_callback)
+            include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))

-
-
-
+            def _processor_done_callback(proc_future: Future):
+                """Callback to handle completion, VDB upload, and final result setting."""
+                try:
+                    if proc_future.cancelled():
+                        if not final_future.done():
+                            final_future.cancel()
+                        return
+                    if proc_future.exception():
+                        if not final_future.done():
+                            final_future.set_exception(proc_future.exception())
+                        return
+
+                    results, failures, traces_list = proc_future.result()
+
+                    failed_job_ids = set()
+                    for job_id_with_source, error_msg in failures:
+                        job_id = job_id_with_source.split(":", 1)[0]
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.FAILED:
+                                self._job_states[job_id].state = JobStateEnum.FAILED
+                        failed_job_ids.add(job_id)
+
+                    all_submitted_job_ids = set(self._job_ids)
+                    successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+                    for job_id in successful_job_ids:
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+                                self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+                    if self._vdb_bulk_upload and results:
+                        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+                            results_future = Future()
+                            results_future.set_result(results)
+                            vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+                            vdb_future.result()
+
+                    parent_trace_ids = (
+                        self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+                    )

-
+                    returns = [results]
+                    if return_failures:
+                        returns.append(failures)
+                    if return_traces:
+                        returns.append(traces_list)
+                    if include_parent_trace_ids:
+                        returns.append(parent_trace_ids)
+
+                    final_result = tuple(returns) if len(returns) > 1 else results
+
+                    if not final_future.done():
+                        final_future.set_result(final_result)
+
+                except Exception as e:
+                    logger.exception("Error in ingest_async processor callback")
+                    if not final_future.done():
+                        final_future.set_exception(e)
+                finally:
+                    final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+                    for job_state in self._job_states.values():
+                        if (
+                            job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+                            and job_state.state != final_state
+                        ):
+                            job_state.state = final_state
+
+            processor_future.add_done_callback(_processor_done_callback)
+            return final_future
+
+        except Exception as setup_err:
+            logger.exception("Failed during synchronous setup of ingest_async")
+            error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+            error_future.set_exception(setup_err)
+            return error_future

     @ensure_job_specs
     def _prepare_ingest_run(self):
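
A caller-side sketch of the reworked `ingest_async`. It assumes `ingestor` already has documents or buffers and tasks attached, and the unpacking matches the flag combination documented above:

    # The Future resolves only after all jobs (and any configured VDB upload) finish.
    future = ingestor.ingest_async(return_failures=True, return_traces=True)

    results, failures, traces = future.result()
    for job_id, error_msg in failures:
        print(f"job {job_id} failed: {error_msg}")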

nv_ingest_client/primitives/jobs/job_spec.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Union
+from typing import Tuple
 from uuid import UUID

 from nv_ingest_client.primitives.tasks import Task

@@ -222,7 +223,9 @@ class BatchJobSpec:
         A dictionary that maps document types to a list of `JobSpec` instances.
     """

-    def __init__(
+    def __init__(
+        self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+    ) -> None:
         """
         Initializes the BatchJobSpec instance.


@@ -239,6 +242,13 @@ class BatchJobSpec:
             self.from_job_specs(job_specs_or_files)
         elif isinstance(job_specs_or_files[0], str):
             self.from_files(job_specs_or_files)
+        elif (
+            isinstance(job_specs_or_files[0], tuple)
+            and len(job_specs_or_files[0]) == 2
+            and isinstance(job_specs_or_files[0][0], str)
+            and isinstance(job_specs_or_files[0][1], BytesIO)
+        ):
+            self.from_buffers(job_specs_or_files)
         else:
             raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")


@@ -282,6 +292,21 @@ class BatchJobSpec:
         for job_spec in job_specs:
             self.add_job_spec(job_spec)

+    def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+        """
+        Initializes the batch from a list of buffers.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            A list of tuples containing the name of the buffer and the BytesIO object.
+        """
+        from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+        job_specs = create_job_specs_for_buffers(buffers)
+        for job_spec in job_specs:
+            self.add_job_spec(job_spec)
+
     def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
         """
         Internal method to initialize the batch from a dataset.

nv_ingest_client/util/file_processing/extract.py
CHANGED

@@ -145,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:

     logger.debug(f"Content extracted from '{path}'")
     return content, DocumentTypeEnum(document_type)
+
+
+def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+    """
+    Extracts the content and type from a buffer.
+    """
+    document_type = get_or_infer_file_type(buffer[0])
+    try:
+        if document_type in [
+            DocumentTypeEnum.TXT,
+            DocumentTypeEnum.MD,
+            DocumentTypeEnum.HTML,
+        ]:
+            content = detect_encoding_and_read_text_file(buffer[1])
+        else:
+            content = serialize_to_base64(buffer[1])
+    except Exception as e:
+        logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+        raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+    logger.debug(f"Content extracted from '{buffer[0]}'")
+    return content, DocumentTypeEnum(document_type)
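
The new buffer extractor mirrors `extract_file_content`; a minimal sketch with a Markdown buffer:

    from io import BytesIO
    from nv_ingest_client.util.file_processing.extract import extract_content_from_buffer

    content, doc_type = extract_content_from_buffer(("readme.md", BytesIO(b"# hello")))
    # TXT, MD, and HTML buffers are decoded as text; every other type is base64-serialized.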
nv_ingest_client/util/util.py
CHANGED

@@ -12,10 +12,12 @@ import math
 import heapq
 from typing import Dict
 from typing import List
+from typing import Tuple
+from io import BytesIO

 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 from nv_ingest_client.primitives.jobs.job_spec import JobSpec
-from nv_ingest_client.util.file_processing.extract import extract_file_content
+from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer

 logger = logging.getLogger(__name__)

|
|
@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
|
|
|
350
352
|
return job_specs
|
|
351
353
|
|
|
352
354
|
|
|
355
|
+
def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
|
|
356
|
+
"""
|
|
357
|
+
Create and job specifications (JobSpecs) for a list of buffers.
|
|
358
|
+
This function takes a list of buffers, processes each buffer to extract its content and type,
|
|
359
|
+
creates a job specification (JobSpec) for each buffer.
|
|
360
|
+
|
|
361
|
+
Parameters
|
|
362
|
+
----------
|
|
363
|
+
buffers : List[Tuple[str, BytesIO]]
|
|
364
|
+
A list of tuples containing the name of the buffer and the BytesIO object.
|
|
365
|
+
|
|
366
|
+
Returns
|
|
367
|
+
-------
|
|
368
|
+
List[JobSpec]
|
|
369
|
+
A list of JobSpecs.
|
|
370
|
+
"""
|
|
371
|
+
|
|
372
|
+
job_specs = []
|
|
373
|
+
for name, buffer in buffers:
|
|
374
|
+
content, file_type = extract_content_from_buffer((name, buffer))
|
|
375
|
+
job_spec = JobSpec(
|
|
376
|
+
document_type=file_type,
|
|
377
|
+
payload=content,
|
|
378
|
+
source_id=name,
|
|
379
|
+
source_name=name,
|
|
380
|
+
)
|
|
381
|
+
job_specs.append(job_spec)
|
|
382
|
+
|
|
383
|
+
return job_specs
|
|
384
|
+
|
|
385
|
+
|
|
353
386
|
def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
|
|
354
387
|
"""
|
|
355
388
|
Apply PDF split configuration to a list of JobSpec objects.
|
|
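
A minimal sketch of the new helper in isolation, using a small UTF-8 text buffer:

    from io import BytesIO
    from nv_ingest_client.util.util import create_job_specs_for_buffers

    buffers = [("notes.txt", BytesIO(b"hello nv-ingest"))]
    job_specs = create_job_specs_for_buffers(buffers)   # one JobSpec per buffer, typed from the name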

nv_ingest_client/util/vdb/adt_vdb.py
CHANGED

@@ -1,27 +1,243 @@
 from abc import ABC, abstractmethod


+"""Abstract Vector Database (VDB) operator API.
+
+This module defines the `VDB` abstract base class which specifies the
+interface that custom vector-database operators must implement to integrate
+with NV-Ingest.
+
+The implementation details and an example OpenSearch operator are described
+in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+production-ready OpenSearch implementation is available at
+`client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+Design goals:
+- Provide a small, well-documented interface that supports common vector
+  database operations: index creation, batch ingestion, nearest-neighbor
+  retrieval, and a simple `run` orchestration entry-point used by the
+  NV-Ingest pipeline.
+- Keep the API flexible by accepting `**kwargs` on methods so implementers can
+  pass database-specific options without changing the interface.
+
+Typical implementation notes (inferred from the example OpenSearch operator):
+- Constructor accepts connection and index configuration parameters such as
+  `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+  types (e.g. `enable_text`, `enable_images`).
+- `create_index` should be able to create (and optionally recreate) an
+  index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+- `write_to_index` should accept batches of NV-Ingest records, perform
+  validation/transformation, and write documents into the database efficiently
+  (bulk APIs are recommended).
+- `retrieval` should accept a list of textual queries, convert them to
+  embeddings (by calling an external embedding service or model), perform a
+  vector search (top-k), and return cleaned results (e.g., removing stored
+  dense vectors from returned payloads).
+
+"""
+
+
 class VDB(ABC):
+    """Abstract base class for Vector Database operators.
+
+    Subclasses must implement the abstract methods below. The interface is
+    intentionally small and uses `**kwargs` to allow operator-specific
+    configuration without changing the common API.
+
+    Example (high level):
+
+        class OpenSearch(VDB):
+            def __init__(self, **kwargs):
+                # parse kwargs, initialize client, call super().__init__(**kwargs)
+                ...
+
+            def create_index(self, **kwargs):
+                # create index, mappings, settings
+                ...
+
+            def write_to_index(self, records: list, **kwargs):
+                # transform NV-Ingest records and write to database
+                ...
+
+            def retrieval(self, queries: list, **kwargs):
+                # convert queries to embeddings, k-NN search, format results
+                ...
+
+            def run(self, records):
+                # orchestrate create_index + write_to_index
+                ...
+
+    Notes on recommended constructor parameters (not enforced by this ABC):
+    - host (str): database hostname (default: 'localhost')
+    - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+    - index_name (str): base index name used by the operator
+    - dense_dim (int): dimensionality of stored dense embeddings
+    - enable_text/enable_images/... (bool): content-type toggles used when
+      extracting text from NV-Ingest records before indexing
+
+    The concrete operator may accept additional parameters (username,
+    password, ssl options, client-specific flags). Passing these via
+    `**kwargs` is the intended pattern.
+    """

     @abstractmethod
     def __init__(self, **kwargs):
+        """Initialize the VDB operator.
+
+        Implementations should extract configuration values from `kwargs`
+        (or use defaults) and initialize any client connections required to
+        talk to the target vector database. Implementations are encouraged to
+        call `super().__init__(**kwargs)` only if they want the base-class
+        behavior of storing kwargs on the instance (the base class itself does
+        not require that behavior).
+
+        Parameters (suggested/common):
+        - host (str): database host
+        - port (int): database port
+        - index_name (str): base name for created indices
+        - dense_dim (int): embedding vector dimension
+        - enable_text (bool): whether text content should be extracted/indexed
+        - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+        The constructor should not perform heavy operations (like creating
+        indices) unless explicitly desired; prefer leaving that work to
+        `create_index` to make the operator easier to test.
+        """
         self.__dict__.update(kwargs)

     @abstractmethod
     def create_index(self, **kwargs):
+        """Create and configure the index(es) required by this operator.
+
+        Implementations must ensure an appropriate index (or indices) exist
+        before data ingestion. For vector indexes this typically means
+        creating settings and mappings that enable k-NN/vector search (for
+        example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+        engine-specific parameters).
+
+        Common keyword arguments (operator-specific):
+        - recreate (bool): if True, delete and recreate the index even if it
+          already exists (default: False)
+        - index_name (str): override the operator's configured index name for
+          this call
+
+        Returns:
+            implementation-specific result (e.g., a boolean, the created
+            index name, or the raw response from the database client). There
+            is no strict requirement here because different DB clients return
+            different values; document behavior in concrete implementations.
+        """
         pass

     @abstractmethod
     def write_to_index(self, records: list, **kwargs):
+        """Write a batch of NV-Ingest records to the vector database.
+
+        This method receives `records` formatted as NV-Ingest provides them
+        (commonly a list of record-sets). Implementations are responsible for
+        transforming each record into the target database document format,
+        validating the presence of embeddings and content, and using the most
+        efficient ingestion API available (for example a bulk endpoint).
+
+        Expected behavior:
+        - Iterate over the provided `records` (which can be nested lists of
+          record dictionaries) and transform each record to the DB document
+          structure (fields such as `dense` for the vector, `text` for the
+          content, and `metadata` for auxiliary fields are common in the
+          repository examples).
+        - Skip records missing required fields (for example, missing
+          embeddings) and log or report failures as appropriate.
+        - Use batching / bulk APIs to reduce overhead when writing large
+          volumes of documents.
+
+        Parameters:
+        - records (list): NV-Ingest records (see repository examples for
+          structure)
+        - batch_size (int, optional): how many documents to send per bulk
+          request; database-specific implementations can use this hint
+
+        Returns:
+            implementation-specific result (e.g., number of documents
+            indexed, client response for bulk API). Concrete implementations
+            should document exact return values and failure semantics.
+        """
         pass

     @abstractmethod
     def retrieval(self, queries: list, **kwargs):
+        """Perform similarity search for a list of text queries.
+
+        The typical retrieval flow implemented by operators in this ecosystem
+        is:
+        1. Convert each textual `query` into a dense embedding using an
+           external embedding model or service (the example uses an NVIDIA
+           embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+        2. Issue a vector (k-NN) search to the database using the generated
+           embedding, requesting the top-k (configurable) neighbors.
+        3. Post-process results (for example, remove stored dense vectors
+           from returned documents to reduce payload size) and return a
+           list-of-lists of result documents aligned with the input `queries`.
+
+        Keyword arguments (common):
+        - index_name (str): index to search (default: operator's configured
+          index_name)
+        - top_k (int): number of nearest neighbors to return (default: 10)
+        - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+          when the operator integrates with an external embedding service.
+
+        Parameters:
+        - queries (list[str]): list of text queries to be vectorized and
+          searched
+
+        Returns:
+        - results (list[list[dict]]): for each query, a list of hit documents
+          (concrete implementations should specify the document shape they
+          return). Operators should remove large binary/vector fields from
+          responses where possible.
+        """
         pass

     @abstractmethod
     def run(self, records):
+        """Main entry point used by the NV-Ingest pipeline.
+
+        The `run` method is intended to be a simple orchestration layer that
+        ensures the index exists and then ingests provided records. A minimal
+        recommended implementation is::
+
+            def run(self, records):
+                self.create_index()
+                self.write_to_index(records)
+
+        Implementers can add pre/post hooks, metrics, retries, or error
+        handling as needed for production readiness. Keep `run` simple so the
+        pipeline orchestration remains predictable.
+
+        Parameters:
+        - records: NV-Ingest records to index (format follows repository
+          conventions)
+
+        Returns:
+        - implementation-specific result (for example, a summary dict or
+          boolean success flag).
+        """
         pass

     def reindex(self, records: list, **kwargs):
+        """Optional helper to rebuild or re-populate indexes with new data.
+
+        This non-abstract method is provided as an optional hook that concrete
+        classes may override. A typical reindex implementation will:
+        - optionally delete the existing index and recreate it (via
+          `create_index(recreate=True)`)
+        - call `write_to_index(records)` to populate the new index
+
+        Parameters:
+        - records (list): records used to populate the index
+        - recreate (bool, optional): whether to delete and recreate the
+          index before writing
+
+        Returns:
+        - implementation-specific result
+        """
         pass
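
A skeletal operator written against the contract these docstrings describe. The class below is an illustration only: it stores documents in a Python list instead of a real vector database, and the retrieval stub does not embed or rank anything:

    from nv_ingest_client.util.vdb.adt_vdb import VDB


    class InMemoryVDB(VDB):
        """Toy operator used to show the required surface; not for production."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)                 # base class stores kwargs on the instance
            self.index_name = kwargs.get("index_name", "nv_ingest_demo")
            self._docs = []

        def create_index(self, **kwargs):
            if kwargs.get("recreate", False):
                self._docs = []                        # "recreate" drops any existing documents

        def write_to_index(self, records: list, **kwargs):
            for record_set in records:                 # records arrive as nested lists of dicts
                for record in record_set:
                    self._docs.append(record)

        def retrieval(self, queries: list, **kwargs):
            top_k = kwargs.get("top_k", 10)
            # A real operator embeds each query and runs a k-NN search here.
            return [self._docs[:top_k] for _ in queries]

        def run(self, records):
            self.create_index()
            self.write_to_index(records)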

nv_ingest_client/util/vdb/milvus.py
CHANGED

@@ -892,7 +892,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
     logger.info(f"streamed {count} records")


-def wait_for_index(collection_name: str,
+def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
     """
     This function waits for the index to be built. It checks
     the indexed_rows of the index and waits for it to be equal

@@ -901,32 +901,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
     (refer to MilvusClient.refresh_load for bulk inserts).
     """
     client.flush(collection_name)
-    # index_names = utility.list_indexes(collection_name)
     indexed_rows = 0
     # observe dense_index, all indexes get populated simultaneously
-    for index_name in
-    indexed_rows =
-
-
-    pos_movement =
+    for index_name, rows_expected in expected_rows_dict.items():
+        indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+        while indexed_rows < rows_expected:
+            # 0.5% of rows expected allowed without noticing an increase in indexed_rows
+            pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
             for i in range(20):
-
+                prev_indexed_rows = indexed_rows
+                indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                 time.sleep(1)
-                logger.info(
-
-                )
-                if current_indexed_rows == expected_rows:
-                    indexed_rows = current_indexed_rows
+                logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
+                if indexed_rows == rows_expected:
                     break
                 # check if indexed_rows is staying the same, too many times means something is wrong
-                if
+                if indexed_rows == prev_indexed_rows:
                     pos_movement -= 1
                 else:
-                    pos_movement =
+                    pos_movement = start_pos_movement
                 # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
                 if pos_movement == 0:
                     raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
-    indexed_rows = current_indexed_rows
     return indexed_rows



@@ -1046,6 +1042,13 @@ def write_to_nvingest_collection(
     if num_elements < threshold:
         stream = True
     if stream:
+        # must be accessed/saved before adding new records
+        index_names = utility.list_indexes(collection_name)
+        expected_rows = {}
+        for index_name in index_names:
+            expected_rows[index_name] = (
+                int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
+            )
         stream_insert_milvus(
             cleaned_records,
             client,

@@ -1054,7 +1057,7 @@ def write_to_nvingest_collection(
     if not local_index:
         # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
         # know how long this should take, it is num_elements dependent.
-        wait_for_index(collection_name,
+        wait_for_index(collection_name, expected_rows, client)
     else:
         minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
         bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name

@@ -2005,6 +2008,12 @@ class Milvus(VDB):
         """
         kwargs = locals().copy()
        kwargs.pop("self", None)
+        bucket_name = kwargs.get("bucket_name", None)
+        if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
+            raise ValueError(
+                "You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
+                f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
+            )
         super().__init__(**kwargs)

     def create_index(self, **kwargs):
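
With the new guard, a non-default MinIO bucket must reach the Milvus operator through the environment rather than the constructor. A hedged sketch of that pattern; the collection name and the exact constructor keywords are illustrative:

    import os

    # Set before ClientConfigSchema is evaluated so minio_bucket_name picks it up.
    os.environ["MINIO_BUCKET"] = "my-ingest-bucket"

    from nv_ingest_client.util.vdb.milvus import Milvus

    vdb = Milvus(collection_name="nv_ingest_collection")
    # Passing bucket_name=... directly now raises ValueError unless it matches
    # ClientConfigSchema().minio_bucket_name.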

nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD
CHANGED

@@ -6,13 +6,13 @@ nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T
 nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
 nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
-nv_ingest_client/client/client.py,sha256=
+nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
 nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
-nv_ingest_client/client/interface.py,sha256=
+nv_ingest_client/client/interface.py,sha256=XQ2hHNBsL-Nnsk_w48UMxFqxfkO0CdQ2AOQZEdXU3OA,59990
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
-nv_ingest_client/primitives/jobs/job_spec.py,sha256=
+nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
 nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
 nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
 nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559

@@ -40,17 +40,17 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
 nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
 nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
 nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
-nv_ingest_client/util/util.py,sha256=
+nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
 nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
 nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nv_ingest_client/util/file_processing/extract.py,sha256=
+nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
 nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
-nv_ingest_client/util/vdb/adt_vdb.py,sha256=
-nv_ingest_client/util/vdb/milvus.py,sha256=
+nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
+nv_ingest_client/util/vdb/milvus.py,sha256=jCQyWb6xoQ6utGNccASmN09eJbwF2HlgrGGIkpoUfI8,80792
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
+nv_ingest_client-2025.11.27.dev20251127.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.11.27.dev20251127.dist-info/METADATA,sha256=jul59WHL8-9IYR27iL9ilxkw7IQRnqb7EMqBfJh7IGk,30627
+nv_ingest_client-2025.11.27.dev20251127.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.11.27.dev20251127.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.11.27.dev20251127.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD,,

The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) have no content changes between the two versions.