nv-ingest-client 2025.11.9.dev20251109__py3-none-any.whl → 2025.11.27.dev20251127__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/client.py +112 -2
- nv_ingest_client/client/interface.py +136 -32
- nv_ingest_client/primitives/jobs/job_spec.py +29 -1
- nv_ingest_client/primitives/tasks/extract.py +1 -0
- nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
- nv_ingest_client/util/file_processing/extract.py +27 -0
- nv_ingest_client/util/util.py +34 -1
- nv_ingest_client/util/vdb/adt_vdb.py +216 -0
- nv_ingest_client/util/vdb/milvus.py +56 -23
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/RECORD +15 -14
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/client.py
CHANGED
@@ -202,6 +202,13 @@ class _ConcurrentProcessor:
         if not self.job_queue_id:
             logger.warning("job_queue_id is not set; submission of new jobs will fail.")
 
+        # Executor check required for run_async
+        if not hasattr(client, "_worker_pool"):
+            raise AttributeError("Client object is missing the '_worker_pool' attribute, required for run_async.")
+        if not isinstance(client._worker_pool, ThreadPoolExecutor):
+            raise TypeError("Client's '_worker_pool' must be a ThreadPoolExecutor for run_async.")
+        self._executor = client._worker_pool
+
         # --------------------------------------------------------------------------
         # Private Methods
         # --------------------------------------------------------------------------
@@ -246,7 +253,7 @@
         # Attempt to mark state as FAILED locally in the client (best effort)
         try:
             # Use a method assumed to safely get the state object
-            job_state = self.client.
+            job_state = self.client._get_and_check_job_state(job_index)
             # Check state exists and is not already terminal before updating
             if (
                 job_state and hasattr(job_state, "state") and job_state.state not in ["FAILED", "COMPLETED"]
@@ -495,7 +502,10 @@
 
         return batch_futures_dict, normalized_job_indices
 
-
+    # --------------------------------------------------------------------------
+    # Core Processing Logic
+    # --------------------------------------------------------------------------
+    def _process_all_jobs(self) -> Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.
 
@@ -640,6 +650,44 @@
 
         return self.results, self.failures, self.traces if self.return_traces else []
 
+    # --------------------------------------------------------------------------
+    # Public Methods
+    # --------------------------------------------------------------------------
+
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
+        """
+        Executes the main processing loop synchronously.
+
+        This method orchestrates the job processing by maintaining a constant
+        pool of in-flight jobs, handling submissions, fetches, and retries until
+        all jobs are complete. It blocks until all jobs are processed.
+
+        Returns
+        -------
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]
+            A tuple containing:
+            1. A list of successfully fetched job results.
+            2. A list of tuples for failed jobs (job_index, error_message).
+            3. A list of trace dictionaries if `return_traces` was True.
+        """
+        return self._process_all_jobs()
+
+    def run_async(self) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Executes the main processing loop asynchronously.
+
+        Submits the entire processing logic to the client's background
+        thread pool and returns a Future that resolves with the final
+        results, failures, and traces once all jobs are complete.
+
+        Returns
+        -------
+        Future
+            A future representing the asynchronous execution. Its result()
+            will be a tuple containing (results, failures, traces).
+        """
+        return self._executor.submit(self._process_all_jobs)
+
 
 class NvIngestClient:
     """
@@ -1377,6 +1425,68 @@ class NvIngestClient:
         logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
         return results
 
+    def process_jobs_concurrently_async(
+        self,
+        job_indices: Union[str, List[str]],
+        job_queue_id: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        timeout: int = 100,
+        max_job_retries: Optional[int] = None,
+        retry_delay: float = 0.5,
+        initial_fetch_delay: float = 0.3,
+        fail_on_submit_error: bool = False,
+        completion_callback: Optional[Callable[[Any, str], None]] = None,
+        stream_to_callback_only: bool = False,
+        return_full_response: bool = False,
+        verbose: bool = False,
+        return_traces: bool = False,
+    ) -> Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]:
+        """
+        Submit and fetch multiple jobs concurrently and asynchronously.
+
+        This method initializes the processing and returns a Future immediately. The Future
+        will resolve with a fixed 3-part tuple `(results, failures, traces)` once all
+        jobs have completed.
+
+        Parameters are identical to `process_jobs_concurrently`.
+
+        Returns
+        -------
+        Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]]
+            A future that completes when all jobs are done. Its result is a tuple
+            containing (successful_results, failures, traces).
+        """
+        if isinstance(job_indices, str):
+            job_indices = [job_indices]
+
+        if not job_indices:
+            immediate_future: Future = Future()
+            immediate_future.set_result(([], [], []))
+            return immediate_future
+
+        validated_batch_size = self._validate_batch_size(batch_size)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)
+
+        processor = _ConcurrentProcessor(
+            client=self,
+            batch_size=validated_batch_size,
+            job_indices=job_indices,
+            job_queue_id=job_queue_id,
+            timeout=effective_timeout,
+            max_job_retries=max_job_retries,
+            retry_delay=retry_delay,
+            initial_fetch_delay=initial_fetch_delay,
+            completion_callback=completion_callback,
+            fail_on_submit_error=fail_on_submit_error,
+            stream_to_callback_only=stream_to_callback_only,
+            return_full_response=return_full_response,
+            verbose=verbose,
+            return_traces=return_traces,
+        )
+
+        # Asynchronous call
+        return processor.run_async()
+
     def _ensure_submitted(self, job_ids: Union[str, List[str]]) -> None:
         """
         Block until all specified jobs have been marked submitted.
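Note: the two new entry points compose as follows; a minimal usage sketch, assuming a reachable NV-Ingest service and an already-built job spec batch (the hostname, port, and `job_specs` value below are illustrative, not taken from this diff):

    from nv_ingest_client.client import NvIngestClient

    client = NvIngestClient(message_client_hostname="localhost", message_client_port=7670)
    job_ids = client.add_job(job_specs)  # job_specs: a BatchJobSpec assumed to exist

    # Returns immediately; the Future resolves to the fixed (results, failures, traces) 3-tuple.
    future = client.process_jobs_concurrently_async(job_indices=job_ids, return_traces=True)
    results, failures, traces = future.result()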
nv_ingest_client/client/interface.py
CHANGED
@@ -13,6 +13,7 @@ import os
 import shutil
 import tempfile
 import threading
+from io import BytesIO
 from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
@@ -224,6 +225,7 @@ class Ingestor:
         **kwargs,
     ):
         self._documents = documents or []
+        self._buffers = []
         self._client = client
         self._job_queue_id = job_queue_id
         self._vdb_bulk_upload = None
@@ -352,6 +354,28 @@ class Ingestor:
 
         return self
 
+    def buffers(self, buffers: Union[Tuple[str, BytesIO], List[Tuple[str, BytesIO]]]) -> "Ingestor":
+        """
+        Add buffers for processing.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            List of tuples containing the name of the buffer and the BytesIO object.
+        """
+        if (
+            isinstance(buffers, tuple)
+            and len(buffers) == 2
+            and isinstance(buffers[0], str)
+            and isinstance(buffers[1], BytesIO)
+        ):
+            buffers = [buffers]
+        self._buffers.extend(buffers)
+        self._job_specs = BatchJobSpec(self._buffers)
+        self._all_local = True
+
+        return self
+
     def load(self, **kwargs) -> "Ingestor":
         """
         Ensure all document files are accessible locally, downloading if necessary.
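Note: a short sketch of the new in-memory input path (the source file and task chain are illustrative):

    from io import BytesIO
    from nv_ingest_client.client import Ingestor

    with open("report.pdf", "rb") as f:  # illustrative local file
        pdf_buffer = BytesIO(f.read())

    # A single (name, BytesIO) tuple or a list of such tuples is accepted.
    ingestor = Ingestor().buffers(("report.pdf", pdf_buffer)).extract()
    results = ingestor.ingest()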
@@ -669,55 +693,133 @@ class Ingestor:
 
         return tuple(returns) if len(returns) > 1 else results
 
-    def ingest_async(self, **kwargs: Any) -> Future:
+    def ingest_async(self, *, return_failures: bool = False, return_traces: bool = False, **kwargs: Any) -> Future:
         """
         Asynchronously submits jobs and returns a single future that completes when all jobs have finished.
 
+        The return type of the future's result is dynamic and mirrors the behavior of the synchronous
+        `ingest()` method, controlled by the `return_failures` and `return_traces` flags. If a VDB
+        upload is configured, the future will complete *after* the VDB upload finishes.
+
         Parameters
         ----------
+        return_failures : bool, optional
+            If True, return a tuple containing failures; otherwise, only return results. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
         kwargs : dict
-            Additional parameters
+            Additional parameters passed to the concurrent processor.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        Future
-            A future that completes when all
+        Future[Union[List[Any], Tuple[Any, ...]]]
+            A future that completes when all jobs and any subsequent VDB upload
+            have finished. Its result will be one of the following:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+
         """
-
+        try:
+            self._prepare_ingest_run()
 
-
+            # Add jobs locally first
+            if self._job_specs is None:
+                raise RuntimeError("Job specs missing for ingest_async.")
+            self._job_ids = self._client.add_job(self._job_specs)
+            self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
 
-
-        self._job_states = {job_id: self._client._get_and_check_job_state(job_id) for job_id in self._job_ids}
+            proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)
 
-
-        submitted_futures = set(future_to_job_id.keys())
-        completed_futures = set()
-        future_results = []
+            final_future: Future = Future()
 
-
-
-
-
-
-
-                    job_state.state = JobStateEnum.COMPLETED
-            except Exception:
-                result = None
-                if job_state.state != JobStateEnum.FAILED:
-                    job_state.state = JobStateEnum.FAILED
-            completed_futures.add(future)
-            future_results.extend(result)
-            if completed_futures == submitted_futures:
-                combined_future.set_result(future_results)
+            processor_future = self._client.process_jobs_concurrently_async(
+                job_indices=self._job_ids,
+                job_queue_id=self._job_queue_id,
+                return_traces=return_traces,
+                **proc_kwargs,
+            )
 
-
-            future.add_done_callback(_done_callback)
+            include_parent_trace_ids = bool(kwargs.get("include_parent_trace_ids", False))
 
-
-
+            def _processor_done_callback(proc_future: Future):
+                """Callback to handle completion, VDB upload, and final result setting."""
+                try:
+                    if proc_future.cancelled():
+                        if not final_future.done():
+                            final_future.cancel()
+                        return
+                    if proc_future.exception():
+                        if not final_future.done():
+                            final_future.set_exception(proc_future.exception())
+                        return
+
+                    results, failures, traces_list = proc_future.result()
+
+                    failed_job_ids = set()
+                    for job_id_with_source, error_msg in failures:
+                        job_id = job_id_with_source.split(":", 1)[0]
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.FAILED:
+                                self._job_states[job_id].state = JobStateEnum.FAILED
+                        failed_job_ids.add(job_id)
+
+                    all_submitted_job_ids = set(self._job_ids)
+                    successful_job_ids = all_submitted_job_ids - failed_job_ids
+
+                    for job_id in successful_job_ids:
+                        if job_id in self._job_states:
+                            if self._job_states[job_id].state != JobStateEnum.COMPLETED:
+                                self._job_states[job_id].state = JobStateEnum.COMPLETED
+
+                    if self._vdb_bulk_upload and results:
+                        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
+                            results_future = Future()
+                            results_future.set_result(results)
+                            vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
+                            vdb_future.result()
+
+                    parent_trace_ids = (
+                        self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+                    )
 
-
+                    returns = [results]
+                    if return_failures:
+                        returns.append(failures)
+                    if return_traces:
+                        returns.append(traces_list)
+                    if include_parent_trace_ids:
+                        returns.append(parent_trace_ids)
+
+                    final_result = tuple(returns) if len(returns) > 1 else results
+
+                    if not final_future.done():
+                        final_future.set_result(final_result)
+
+                except Exception as e:
+                    logger.exception("Error in ingest_async processor callback")
+                    if not final_future.done():
+                        final_future.set_exception(e)
+                finally:
+                    final_state = JobStateEnum.CANCELLED if proc_future.cancelled() else JobStateEnum.FAILED
+                    for job_state in self._job_states.values():
+                        if (
+                            job_state.state not in [JobStateEnum.COMPLETED, JobStateEnum.FAILED]
+                            and job_state.state != final_state
+                        ):
+                            job_state.state = final_state
+
+            processor_future.add_done_callback(_processor_done_callback)
+            return final_future
+
+        except Exception as setup_err:
+            logger.exception("Failed during synchronous setup of ingest_async")
+            error_future: Future[Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]] = Future()
+            error_future.set_exception(setup_err)
+            return error_future
 
     @ensure_job_specs
     def _prepare_ingest_run(self):
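Note: a hedged usage sketch for the reworked asynchronous path; only the keyword flags come from this diff, the document set and task chain are illustrative:

    ingestor = Ingestor().files("./data/*.pdf").extract().embed()

    future = ingestor.ingest_async(return_failures=True, return_traces=True)
    # ... other work while the jobs (and any configured VDB upload) run in the background ...
    results, failures, traces = future.result()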
@@ -834,6 +936,7 @@ class Ingestor:
         extract_tables = kwargs.pop("extract_tables", True)
         extract_charts = kwargs.pop("extract_charts", True)
         extract_page_as_image = kwargs.pop("extract_page_as_image", False)
+        table_output_format = kwargs.pop("table_output_format", "markdown")
 
         # Defaulting to False since enabling infographic extraction reduces throughput.
         # Users have to set to True if infographic extraction is required.
@@ -856,6 +959,7 @@
             extract_charts=extract_charts,
             extract_infographics=extract_infographics,
             extract_page_as_image=extract_page_as_image,
+            table_output_format=table_output_format,
             **kwargs,
         )
 
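Note: the new option is read inside `Ingestor.extract()` and forwarded to the extraction task; a minimal sketch ("markdown" is the default shown in the diff, other values depend on what the service supports):

    ingestor = Ingestor().files("./data/*.pdf").extract(table_output_format="markdown")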
nv_ingest_client/primitives/jobs/job_spec.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Union
+from typing import Tuple
 from uuid import UUID
 
 from nv_ingest_client.primitives.tasks import Task
@@ -18,6 +19,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTask
 from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
 from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
 from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
+from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
 
@@ -199,6 +201,8 @@ class JobSpec:
             self._tasks.append(ChartExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_infographics is True):
             self._tasks.append(InfographicExtractionTask())
+        if isinstance(task, ExtractTask) and (task._extract_method in {"pdfium_hybrid", "ocr"}):
+            self._tasks.append(OCRExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
             extract_audio_params = task._extract_audio_params or {}
             self._tasks.append(AudioExtractionTask(**extract_audio_params))
@@ -219,7 +223,9 @@ class BatchJobSpec:
         A dictionary that maps document types to a list of `JobSpec` instances.
     """
 
-    def __init__(
+    def __init__(
+        self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+    ) -> None:
         """
         Initializes the BatchJobSpec instance.
 
@@ -236,6 +242,13 @@ class BatchJobSpec:
             self.from_job_specs(job_specs_or_files)
         elif isinstance(job_specs_or_files[0], str):
             self.from_files(job_specs_or_files)
+        elif (
+            isinstance(job_specs_or_files[0], tuple)
+            and len(job_specs_or_files[0]) == 2
+            and isinstance(job_specs_or_files[0][0], str)
+            and isinstance(job_specs_or_files[0][1], BytesIO)
+        ):
+            self.from_buffers(job_specs_or_files)
         else:
             raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")
 
@@ -279,6 +292,21 @@ class BatchJobSpec:
         for job_spec in job_specs:
             self.add_job_spec(job_spec)
 
+    def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+        """
+        Initializes the batch from a list of buffers.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            A list of tuples containing the name of the buffer and the BytesIO object.
+        """
+        from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+        job_specs = create_job_specs_for_buffers(buffers)
+        for job_spec in job_specs:
+            self.add_job_spec(job_spec)
+
     def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
         """
         Internal method to initialize the batch from a dataset.
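Note: with the new branch in `JobSpec.add_task`, selecting an OCR-backed extract method is enough to pull in the OCR stage automatically; a hedged sketch (the ExtractTask arguments are illustrative, and `job_spec` is an existing JobSpec):

    from nv_ingest_client.primitives.tasks import ExtractTask

    # Appends the ExtractTask and, because extract_method is "pdfium_hybrid",
    # also appends an OCRExtractionTask() behind the scenes.
    job_spec.add_task(ExtractTask(document_type="pdf", extract_method="pdfium_hybrid"))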
nv_ingest_client/primitives/tasks/ocr_extraction.py
ADDED
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# pylint: disable=too-few-public-methods
+# pylint: disable=too-many-arguments
+
+import logging
+from typing import Dict
+
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
+
+logger = logging.getLogger(__name__)
+
+
+class OCRExtractionTask(Task):
+    """
+    Object for ocr extraction task
+    """
+
+    def __init__(self, params: dict = None) -> None:
+        """
+        Setup OCR Extraction Task Config
+        """
+        super().__init__()
+
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+
+        # Use the API schema for validation
+        validated_data = IngestTaskOCRExtraction(params=params)
+
+        self._params = validated_data.params
+
+    def __str__(self) -> str:
+        """
+        Returns a string with the object's config and run time state
+        """
+        info = ""
+        info += "OCR Extraction Task:\n"
+        info += f"  params: {self._params}\n"
+        return info
+
+    def to_dict(self) -> Dict:
+        """
+        Convert to a dict for submission to redis
+        """
+        task_properties = {
+            "params": self._params,
+        }
+
+        return {"type": "ocr_data_extract", "task_properties": task_properties}
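Note: a quick sketch of what the new task serializes to; the `params` payload is illustrative and must satisfy `IngestTaskOCRExtraction`:

    from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask

    task = OCRExtractionTask(params={"lang": "en"})  # illustrative params
    print(task.to_dict())
    # -> {"type": "ocr_data_extract", "task_properties": {"params": {"lang": "en"}}}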
nv_ingest_client/util/file_processing/extract.py
CHANGED
@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
     "txt": DocumentTypeEnum.TXT,
     "mp3": DocumentTypeEnum.MP3,
     "wav": DocumentTypeEnum.WAV,
+    "mp4": DocumentTypeEnum.MP4,
+    "mov": DocumentTypeEnum.MOV,
+    "avi": DocumentTypeEnum.AVI,
+    "mkv": DocumentTypeEnum.MKV,
     # Add more as needed
 }
 
@@ -141,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:
 
     logger.debug(f"Content extracted from '{path}'")
     return content, DocumentTypeEnum(document_type)
+
+
+def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+    """
+    Extracts the content and type from a buffer.
+    """
+    document_type = get_or_infer_file_type(buffer[0])
+    try:
+        if document_type in [
+            DocumentTypeEnum.TXT,
+            DocumentTypeEnum.MD,
+            DocumentTypeEnum.HTML,
+        ]:
+            content = detect_encoding_and_read_text_file(buffer[1])
+        else:
+            content = serialize_to_base64(buffer[1])
+    except Exception as e:
+        logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+        raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+    logger.debug(f"Content extracted from '{buffer[0]}'")
+    return content, DocumentTypeEnum(document_type)
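Note: a minimal sketch of the new buffer helper; the document type is inferred from the buffer name, text-like types are read as text and everything else is base64-serialized:

    from io import BytesIO
    from nv_ingest_client.util.file_processing.extract import extract_content_from_buffer

    content, doc_type = extract_content_from_buffer(("notes.txt", BytesIO(b"hello world")))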
nv_ingest_client/util/util.py
CHANGED
@@ -12,10 +12,12 @@ import math
 import heapq
 from typing import Dict
 from typing import List
+from typing import Tuple
+from io import BytesIO
 
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 from nv_ingest_client.primitives.jobs.job_spec import JobSpec
-from nv_ingest_client.util.file_processing.extract import extract_file_content
+from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer
 
 logger = logging.getLogger(__name__)
 
@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     return job_specs
 
 
+def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
+    """
+    Create and job specifications (JobSpecs) for a list of buffers.
+    This function takes a list of buffers, processes each buffer to extract its content and type,
+    creates a job specification (JobSpec) for each buffer.
+
+    Parameters
+    ----------
+    buffers : List[Tuple[str, BytesIO]]
+        A list of tuples containing the name of the buffer and the BytesIO object.
+
+    Returns
+    -------
+    List[JobSpec]
+        A list of JobSpecs.
+    """
+
+    job_specs = []
+    for name, buffer in buffers:
+        content, file_type = extract_content_from_buffer((name, buffer))
+        job_spec = JobSpec(
+            document_type=file_type,
+            payload=content,
+            source_id=name,
+            source_name=name,
+        )
+        job_specs.append(job_spec)
+
+    return job_specs
+
+
 def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
     """
     Apply PDF split configuration to a list of JobSpec objects.
nv_ingest_client/util/vdb/adt_vdb.py
CHANGED
@@ -1,27 +1,243 @@
 from abc import ABC, abstractmethod
 
 
+"""Abstract Vector Database (VDB) operator API.
+
+This module defines the `VDB` abstract base class which specifies the
+interface that custom vector-database operators must implement to integrate
+with NV-Ingest.
+
+The implementation details and an example OpenSearch operator are described
+in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+production-ready OpenSearch implementation is available at
+`client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+Design goals:
+- Provide a small, well-documented interface that supports common vector
+  database operations: index creation, batch ingestion, nearest-neighbor
+  retrieval, and a simple `run` orchestration entry-point used by the
+  NV-Ingest pipeline.
+- Keep the API flexible by accepting `**kwargs` on methods so implementers can
+  pass database-specific options without changing the interface.
+
+Typical implementation notes (inferred from the example OpenSearch operator):
+- Constructor accepts connection and index configuration parameters such as
+  `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+  types (e.g. `enable_text`, `enable_images`).
+- `create_index` should be able to create (and optionally recreate) an
+  index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+- `write_to_index` should accept batches of NV-Ingest records, perform
+  validation/transformation, and write documents into the database efficiently
+  (bulk APIs are recommended).
+- `retrieval` should accept a list of textual queries, convert them to
+  embeddings (by calling an external embedding service or model), perform a
+  vector search (top-k), and return cleaned results (e.g., removing stored
+  dense vectors from returned payloads).
+
+"""
+
+
 class VDB(ABC):
+    """Abstract base class for Vector Database operators.
+
+    Subclasses must implement the abstract methods below. The interface is
+    intentionally small and uses `**kwargs` to allow operator-specific
+    configuration without changing the common API.
+
+    Example (high level):
+
+        class OpenSearch(VDB):
+            def __init__(self, **kwargs):
+                # parse kwargs, initialize client, call super().__init__(**kwargs)
+                ...
+
+            def create_index(self, **kwargs):
+                # create index, mappings, settings
+                ...
+
+            def write_to_index(self, records: list, **kwargs):
+                # transform NV-Ingest records and write to database
+                ...
+
+            def retrieval(self, queries: list, **kwargs):
+                # convert queries to embeddings, k-NN search, format results
+                ...
+
+            def run(self, records):
+                # orchestrate create_index + write_to_index
+                ...
+
+    Notes on recommended constructor parameters (not enforced by this ABC):
+    - host (str): database hostname (default: 'localhost')
+    - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+    - index_name (str): base index name used by the operator
+    - dense_dim (int): dimensionality of stored dense embeddings
+    - enable_text/enable_images/... (bool): content-type toggles used when
+      extracting text from NV-Ingest records before indexing
+
+    The concrete operator may accept additional parameters (username,
+    password, ssl options, client-specific flags). Passing these via
+    `**kwargs` is the intended pattern.
+    """
 
     @abstractmethod
     def __init__(self, **kwargs):
+        """Initialize the VDB operator.
+
+        Implementations should extract configuration values from `kwargs`
+        (or use defaults) and initialize any client connections required to
+        talk to the target vector database. Implementations are encouraged to
+        call `super().__init__(**kwargs)` only if they want the base-class
+        behavior of storing kwargs on the instance (the base class itself does
+        not require that behavior).
+
+        Parameters (suggested/common):
+        - host (str): database host
+        - port (int): database port
+        - index_name (str): base name for created indices
+        - dense_dim (int): embedding vector dimension
+        - enable_text (bool): whether text content should be extracted/indexed
+        - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+        The constructor should not perform heavy operations (like creating
+        indices) unless explicitly desired; prefer leaving that work to
+        `create_index` to make the operator easier to test.
+        """
         self.__dict__.update(kwargs)
 
     @abstractmethod
     def create_index(self, **kwargs):
+        """Create and configure the index(es) required by this operator.
+
+        Implementations must ensure an appropriate index (or indices) exist
+        before data ingestion. For vector indexes this typically means
+        creating settings and mappings that enable k-NN/vector search (for
+        example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+        engine-specific parameters).
+
+        Common keyword arguments (operator-specific):
+        - recreate (bool): if True, delete and recreate the index even if it
+          already exists (default: False)
+        - index_name (str): override the operator's configured index name for
+          this call
+
+        Returns:
+            implementation-specific result (e.g., a boolean, the created
+            index name, or the raw response from the database client). There
+            is no strict requirement here because different DB clients return
+            different values; document behavior in concrete implementations.
+        """
         pass
 
     @abstractmethod
     def write_to_index(self, records: list, **kwargs):
+        """Write a batch of NV-Ingest records to the vector database.
+
+        This method receives `records` formatted as NV-Ingest provides them
+        (commonly a list of record-sets). Implementations are responsible for
+        transforming each record into the target database document format,
+        validating the presence of embeddings and content, and using the most
+        efficient ingestion API available (for example a bulk endpoint).
+
+        Expected behavior:
+        - Iterate over the provided `records` (which can be nested lists of
+          record dictionaries) and transform each record to the DB document
+          structure (fields such as `dense` for the vector, `text` for the
+          content, and `metadata` for auxiliary fields are common in the
+          repository examples).
+        - Skip records missing required fields (for example, missing
+          embeddings) and log or report failures as appropriate.
+        - Use batching / bulk APIs to reduce overhead when writing large
+          volumes of documents.
+
+        Parameters:
+        - records (list): NV-Ingest records (see repository examples for
+          structure)
+        - batch_size (int, optional): how many documents to send per bulk
+          request; database-specific implementations can use this hint
+
+        Returns:
+            implementation-specific result (e.g., number of documents
+            indexed, client response for bulk API). Concrete implementations
+            should document exact return values and failure semantics.
+        """
         pass
 
     @abstractmethod
     def retrieval(self, queries: list, **kwargs):
+        """Perform similarity search for a list of text queries.
+
+        The typical retrieval flow implemented by operators in this ecosystem
+        is:
+        1. Convert each textual `query` into a dense embedding using an
+           external embedding model or service (the example uses an NVIDIA
+           embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+        2. Issue a vector (k-NN) search to the database using the generated
+           embedding, requesting the top-k (configurable) neighbors.
+        3. Post-process results (for example, remove stored dense vectors
+           from returned documents to reduce payload size) and return a
+           list-of-lists of result documents aligned with the input `queries`.
+
+        Keyword arguments (common):
+        - index_name (str): index to search (default: operator's configured
+          index_name)
+        - top_k (int): number of nearest neighbors to return (default: 10)
+        - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+          when the operator integrates with an external embedding service.
+
+        Parameters:
+        - queries (list[str]): list of text queries to be vectorized and
+          searched
+
+        Returns:
+        - results (list[list[dict]]): for each query, a list of hit documents
+          (concrete implementations should specify the document shape they
+          return). Operators should remove large binary/vector fields from
+          responses where possible.
+        """
         pass
 
     @abstractmethod
     def run(self, records):
+        """Main entry point used by the NV-Ingest pipeline.
+
+        The `run` method is intended to be a simple orchestration layer that
+        ensures the index exists and then ingests provided records. A minimal
+        recommended implementation is::
+
+            def run(self, records):
+                self.create_index()
+                self.write_to_index(records)
+
+        Implementers can add pre/post hooks, metrics, retries, or error
+        handling as needed for production readiness. Keep `run` simple so the
+        pipeline orchestration remains predictable.
+
+        Parameters:
+        - records: NV-Ingest records to index (format follows repository
+          conventions)
+
+        Returns:
+        - implementation-specific result (for example, a summary dict or
+          boolean success flag).
+        """
         pass
 
     def reindex(self, records: list, **kwargs):
+        """Optional helper to rebuild or re-populate indexes with new data.
+
+        This non-abstract method is provided as an optional hook that concrete
+        classes may override. A typical reindex implementation will:
+        - optionally delete the existing index and recreate it (via
+          `create_index(recreate=True)`)
+        - call `write_to_index(records)` to populate the new index
+
+        Parameters:
+        - records (list): records used to populate the index
+        - recreate (bool, optional): whether to delete and recreate the
+          index before writing
+
+        Returns:
+        - implementation-specific result
+        """
         pass
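Note: to make the contract above concrete, here is a self-contained toy operator that keeps documents in a Python list; it only illustrates the ABC and is not a pattern shipped with the package:

    from nv_ingest_client.util.vdb.adt_vdb import VDB

    class InMemoryVDB(VDB):
        """Toy operator: stores records in memory instead of a real vector database."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)  # base class stores kwargs (e.g. index_name) on the instance
            self._docs = []

        def create_index(self, **kwargs):
            if kwargs.get("recreate", False):
                self._docs = []
            return True

        def write_to_index(self, records: list, **kwargs):
            # Flatten nested record-sets and keep only non-empty entries.
            for record_set in records:
                for record in (record_set if isinstance(record_set, list) else [record_set]):
                    if record:
                        self._docs.append(record)
            return len(self._docs)

        def retrieval(self, queries: list, **kwargs):
            top_k = kwargs.get("top_k", 10)
            # No real embeddings here: return the first top_k stored docs for every query.
            return [self._docs[:top_k] for _ in queries]

        def run(self, records):
            self.create_index()
            return self.write_to_index(records)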
nv_ingest_client/util/vdb/milvus.py
CHANGED
@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
 logger = logging.getLogger(__name__)
 
 CONSISTENCY = CONSISTENCY_BOUNDED
+DENSE_INDEX_NAME = "dense_index"
 
 pandas_reader_map = {
     ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
     index_params = MilvusClient.prepare_index_params()
     index_params.add_index(
         field_name="vector",
-        index_name=
+        index_name=DENSE_INDEX_NAME,
         index_type="FLAT",
         metric_type="L2",
     )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
     if local_index:
         index_params.add_index(
             field_name="vector",
-            index_name=
+            index_name=DENSE_INDEX_NAME,
             index_type="FLAT",
             metric_type="L2",
         )
@@ -321,7 +322,7 @@
     if gpu_index:
         index_params.add_index(
             field_name="vector",
-            index_name=
+            index_name=DENSE_INDEX_NAME,
             index_type="GPU_CAGRA",
             metric_type="L2",
             params={
@@ -335,7 +336,7 @@ def create_nvingest_index_params(
     else:
         index_params.add_index(
             field_name="vector",
-            index_name=
+            index_name=DENSE_INDEX_NAME,
             index_type="HNSW",
             metric_type="L2",
             params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
     if isinstance(indexes, dict):
         # Old Milvus behavior (< 2.5.6)
         for k, v in indexes.items():
-            if k[1] ==
+            if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
                 d_idx = v._index_type
             if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
                 s_idx = v._index_type
@@ -504,7 +505,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
             index_name = getattr(idx, "index_name", None)
             index_type = getattr(idx, "index_type", None)
 
-            if index_name ==
+            if index_name == DENSE_INDEX_NAME:
                 d_idx = index_type
             if sparse and index_name == "sparse_index":
                 s_idx = index_type
@@ -891,7 +892,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
     logger.info(f"streamed {count} records")
 
 
-def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
+def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
     """
     This function waits for the index to be built. It checks
     the indexed_rows of the index and waits for it to be equal
@@ -900,30 +901,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
     (refer to MilvusClient.refresh_load for bulk inserts).
     """
     client.flush(collection_name)
-    index_names = utility.list_indexes(collection_name)
     indexed_rows = 0
-
-
-
-
+    # observe dense_index, all indexes get populated simultaneously
+    for index_name, rows_expected in expected_rows_dict.items():
+        indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+        while indexed_rows < rows_expected:
+            # 0.5% of rows expected allowed without noticing an increase in indexed_rows
+            pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
             for i in range(20):
-
+                prev_indexed_rows = indexed_rows
+                indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                 time.sleep(1)
-                logger.info(
-
-                )
-                if new_indexed_rows == num_elements:
-                    indexed_rows = new_indexed_rows
+                logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
+                if indexed_rows == rows_expected:
                     break
                 # check if indexed_rows is staying the same, too many times means something is wrong
-                if
+                if indexed_rows == prev_indexed_rows:
                     pos_movement -= 1
                 else:
-                    pos_movement =
+                    pos_movement = start_pos_movement
                 # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
                 if pos_movement == 0:
-                    raise ValueError("Rows are not getting indexed as expected")
-        indexed_rows = new_indexed_rows
+                    raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
     return indexed_rows
 
 
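Note: a quick worked example of the stall-detection budget above, using the diff's own formula (numbers are illustrative):

    rows_expected = 100_000
    indexed_rows = 0
    # 0.5% of the outstanding rows, but never fewer than 10 no-progress polls
    start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)  # -> 500.0
    # every poll without progress decrements the budget; any progress resets it to start_pos_movement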
@@ -1043,6 +1042,13 @@ def write_to_nvingest_collection(
     if num_elements < threshold:
         stream = True
     if stream:
+        # most be accessed/saved before adding new records
+        index_names = utility.list_indexes(collection_name)
+        expected_rows = {}
+        for index_name in index_names:
+            expected_rows[index_name] = (
+                int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
+            )
         stream_insert_milvus(
             cleaned_records,
             client,
@@ -1051,7 +1057,7 @@ def write_to_nvingest_collection(
         if not local_index:
             # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
             # know how long this should take, it is num_elements dependent.
-            wait_for_index(collection_name,
+            wait_for_index(collection_name, expected_rows, client)
     else:
         minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
         bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
@@ -2002,6 +2008,12 @@ class Milvus(VDB):
         """
         kwargs = locals().copy()
         kwargs.pop("self", None)
+        bucket_name = kwargs.get("bucket_name", None)
+        if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
+            raise ValueError(
+                "You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
+                f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
+            )
         super().__init__(**kwargs)
 
     def create_index(self, **kwargs):
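Note: per the new guard in `Milvus.__init__`, a non-default bucket must come from the environment rather than the constructor; a hedged sketch (the collection name and URI are illustrative):

    import os
    os.environ["MINIO_BUCKET"] = "my-ingest-bucket"  # read via ClientConfigSchema

    from nv_ingest_client.util.vdb.milvus import Milvus

    vdb = Milvus(collection_name="nv_ingest_collection", milvus_uri="http://localhost:19530")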
@@ -2057,3 +2069,24 @@ class Milvus(VDB):
                 self.write_to_index(records, collection_name=coll_name, **sub_write_params)
         else:
             raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
+
+    def run_async(self, records):
+        collection_name, create_params = self.get_connection_params()
+        _, write_params = self.get_write_params()
+        if isinstance(collection_name, str):
+            logger.info(f"creating index - {collection_name}")
+            self.create_index(collection_name=collection_name, **create_params)
+            records = records.result()
+            logger.info(f"writing to index, for collection - {collection_name}")
+            self.write_to_index(records, **write_params)
+        elif isinstance(collection_name, dict):
+            split_params_list = _dict_to_params(collection_name, write_params)
+            for sub_params in split_params_list:
+                coll_name, sub_write_params = sub_params
+                sub_write_params.pop("collection_name", None)
+                self.create_index(collection_name=coll_name, **create_params)
+                self.write_to_index(records, collection_name=coll_name, **sub_write_params)
+        else:
+            raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
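Note: `run_async` expects a Future of results rather than the results themselves; in the single-collection case it creates the index first and only then blocks on the Future before writing. A minimal sketch (the constructor arguments and the `ingest_results` value are illustrative):

    from concurrent.futures import Future
    from nv_ingest_client.util.vdb.milvus import Milvus

    vdb = Milvus(collection_name="nv_ingest_collection")  # illustrative configuration
    results_future: Future = Future()
    results_future.set_result(ingest_results)  # ingest_results assumed to come from a prior ingest run
    vdb.run_async(results_future)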
{nv_ingest_client-2025.11.9.dev20251109.dist-info → nv_ingest_client-2025.11.27.dev20251127.dist-info}/RECORD
CHANGED
@@ -6,13 +6,13 @@ nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T
 nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
 nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
-nv_ingest_client/client/client.py,sha256=
+nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
 nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
-nv_ingest_client/client/interface.py,sha256=
+nv_ingest_client/client/interface.py,sha256=XQ2hHNBsL-Nnsk_w48UMxFqxfkO0CdQ2AOQZEdXU3OA,59990
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
-nv_ingest_client/primitives/jobs/job_spec.py,sha256=
+nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
 nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
 nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
 nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
@@ -20,9 +20,10 @@ nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcY
 nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
 nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
 nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
-nv_ingest_client/primitives/tasks/extract.py,sha256=
+nv_ingest_client/primitives/tasks/extract.py,sha256=ec2aKPU9OMOOw-oalQKAPaNRqgkREQ0ByLkFVqutD6E,9339
 nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
 nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
+nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
 nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
 nv_ingest_client/primitives/tasks/store.py,sha256=nIOnCH8vw4FLCLVBJYnsS5Unc0QmuO_jEtUp7-E9FU4,4199
 nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
@@ -39,17 +40,17 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
 nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
 nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
 nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
-nv_ingest_client/util/util.py,sha256=
+nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
 nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
 nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nv_ingest_client/util/file_processing/extract.py,sha256=
+nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
 nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
-nv_ingest_client/util/vdb/adt_vdb.py,sha256=
-nv_ingest_client/util/vdb/milvus.py,sha256=
+nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
+nv_ingest_client/util/vdb/milvus.py,sha256=jCQyWb6xoQ6utGNccASmN09eJbwF2HlgrGGIkpoUfI8,80792
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
-nv_ingest_client-2025.11.
+nv_ingest_client-2025.11.27.dev20251127.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.11.27.dev20251127.dist-info/METADATA,sha256=jul59WHL8-9IYR27iL9ilxkw7IQRnqb7EMqBfJh7IGk,30627
+nv_ingest_client-2025.11.27.dev20251127.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.11.27.dev20251127.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.11.27.dev20251127.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.11.27.dev20251127.dist-info/RECORD,,

File without changes
File without changes
File without changes
File without changes