nv-ingest-client 2025.11.27.dev20251127__py3-none-any.whl → 2025.12.17.dev20251217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/interface.py +171 -53
- nv_ingest_client/nv_ingest_cli.py +2 -2
- nv_ingest_client/primitives/tasks/caption.py +12 -1
- nv_ingest_client/primitives/tasks/extract.py +50 -2
- nv_ingest_client/primitives/tasks/store.py +18 -13
- nv_ingest_client/util/vdb/lancedb.py +276 -0
- nv_ingest_client/util/vdb/milvus.py +18 -4
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA +2 -1
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD +13 -12
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/interface.py

@@ -53,6 +53,7 @@ from nv_ingest_client.primitives.tasks import SplitTask
 from nv_ingest_client.primitives.tasks import StoreTask
 from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
 from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
@@ -421,6 +422,92 @@ class Ingestor:
 
         return self
 
+    def _resolve_source_name(self, job_id: str, results_data: Optional[Union[List, Dict]] = None) -> str:
+        """
+        Resolves the source name for a given job ID using available metadata or fallback options.
+
+        Parameters
+        ----------
+        job_id : str
+            The job identifier.
+        results_data : Any, optional
+            The data associated with the job result, which might contain metadata.
+
+        Returns
+        -------
+        str
+            The resolved source name.
+        """
+        source_name = "unknown_source"
+        job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+        if job_spec:
+            source_name = job_spec.source_name
+        else:
+            try:
+                if results_data:
+                    first_item = results_data[0] if isinstance(results_data, list) and results_data else results_data
+                    if isinstance(first_item, dict):
+                        source_name = first_item.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
+                if not source_name:
+                    source_name = f"{job_id}"
+            except (IndexError, KeyError, TypeError):
+                source_name = f"{job_id}"
+
+        return source_name
+
+    def _write_results_to_disk(self, doc_data: Any, source_name: str, job_id: str) -> Optional[LazyLoadedList]:
+        """
+        Writes the results for a single job to a JSONL file and returns a LazyLoadedList.
+
+        Parameters
+        ----------
+        doc_data : Any
+            The result data to save.
+        source_name : str
+            The name of the source document.
+        job_id : str
+            The job identifier.
+
+        Returns
+        -------
+        Optional[LazyLoadedList]
+            A proxy object to the saved file, or None if the save failed.
+        """
+        if not self._output_config:
+            logger.warning("Attempted to write results to disk without output configuration.")
+            return None
+
+        try:
+            output_dir = self._output_config["output_directory"]
+            clean_source_basename = get_valid_filename(os.path.basename(source_name))
+            file_name, file_ext = os.path.splitext(clean_source_basename)
+            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
+            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
+
+            data_to_save = doc_data if isinstance(doc_data, list) else [doc_data]
+
+            num_items_saved = save_document_results_to_jsonl(
+                data_to_save,
+                jsonl_filepath,
+                source_name,
+                ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
+            )
+
+            if num_items_saved > 0:
+                return LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
+        except Exception as e_save:
+            logger.error(
+                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
+                exc_info=True,
+            )
+        return None
+
     def ingest(
         self,
         show_progress: bool = False,
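With an output directory configured, each job's records are persisted as `<source-name>.<ext>.results.jsonl` (plus `.gz` when compression is "gzip") and handed back as a `LazyLoadedList` proxy. A minimal sketch of reading one of these files back using only the standard library; the path is a hypothetical example of the naming scheme above:

```python
import gzip
import json


def read_results_jsonl(path: str) -> list:
    """Load one JSON record per line; gzip is inferred from the file suffix."""
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Hypothetical output file produced for a source named "report.pdf".
records = read_results_jsonl("output/report.pdf.results.jsonl.gz")
print(len(records), "records")
```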
@@ -488,52 +575,19 @@ class Ingestor:
 
         def _perform_save_task(doc_data, job_id, source_name):
             # This function runs in the io_executor
-
-
-
-
-            file_suffix = f".{file_ext.strip('.')}.results.jsonl"
-            if self._output_config["compression"] == "gzip":
-                file_suffix += ".gz"
-            jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
-
-            num_items_saved = save_document_results_to_jsonl(
-                doc_data,
-                jsonl_filepath,
-                source_name,
-                ensure_parent_dir_exists=False,
-                compression=self._output_config["compression"],
-            )
-
-            if num_items_saved > 0:
-                results = LazyLoadedList(
-                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
-                )
-                if results_lock:
-                    with results_lock:
-            else:  # Should not happen if io_executor is used
+            results = self._write_results_to_disk(doc_data, source_name, job_id)
+            if results:
+                if results_lock:
+                    with results_lock:
                         final_results_payload_list.append(results)
-
-
-                f"Disk save I/O task error for job {job_id} (source: {source_name}): {e_save}",
-                exc_info=True,
-            )
+                else:  # Should not happen if io_executor is used
+                    final_results_payload_list.append(results)
 
         def _disk_save_callback(
             results_data: Dict[str, Any],
             job_id: str,
         ):
-            source_name =
-            job_spec = self._client._job_index_to_job_spec.get(job_id)
-            if job_spec:
-                source_name = job_spec.source_name
-            else:
-                try:
-                    if results_data:
-                        source_name = results_data[0]["metadata"]["source_metadata"]["source_id"]
-                except (IndexError, KeyError, TypeError):
-                    source_name = f"{job_id}"
+            source_name = self._resolve_source_name(job_id, results_data)
 
             if not results_data:
                 logger.warning(f"No data in response for job {job_id} (source: {source_name}). Skipping save.")
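`_resolve_source_name` now centralizes the fallback: when the job spec is no longer indexed, the name is pulled from the first record's nested metadata, and failing that the job id itself is used. A small sketch of that lookup against an illustrative (hypothetical) record:

```python
record = {
    "metadata": {
        "source_metadata": {"source_id": "reports/q3-summary.pdf"},
        "content_metadata": {"page_number": 1},
    }
}

# Same defensive chain the helper uses: any missing level resolves to "".
source_name = record.get("metadata", {}).get("source_metadata", {}).get("source_id", "")
print(source_name or "fallback-to-job-id")
```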
@@ -734,12 +788,49 @@ class Ingestor:
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently_async, **kwargs)
 
+        stream_to_callback_only = False
+        completion_callback = None
+        async_results_map = {}
+
+        io_executor = None
+        io_futures = []
+
+        if self._output_config:
+            stream_to_callback_only = True
+            output_dir = self._output_config["output_directory"]
+
+            os.makedirs(output_dir, exist_ok=True)
+
+            io_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="IngestAsyncIO")
+
+            def _io_task(data: Dict[str, Any], job_id: str):
+                try:
+                    source_name = self._resolve_source_name(job_id, data)
+                    result = self._write_results_to_disk(data, source_name, job_id)
+                    if result:
+                        # Store the LazyLoadedList in our map using job_id as key
+                        async_results_map[job_id] = result
+                except Exception as e:
+                    logger.error(f"Error in async I/O task for job {job_id}: {e}", exc_info=True)
+
+            def _composite_callback(data: Dict[str, Any], job_id: str):
+                """Callback executed by worker threads to save data to disk."""
+                try:
+                    future = io_executor.submit(_io_task, data, job_id)
+                    io_futures.append(future)
+                except Exception as e:
+                    logger.error(f"Error in async callback for job {job_id}: {e}", exc_info=True)
+
+            completion_callback = _composite_callback
+
         final_future: Future = Future()
 
         processor_future = self._client.process_jobs_concurrently_async(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             return_traces=return_traces,
+            completion_callback=completion_callback,
+            stream_to_callback_only=stream_to_callback_only,
             **proc_kwargs,
         )
 
@@ -759,6 +850,20 @@ class Ingestor:
 
             results, failures, traces_list = proc_future.result()
 
+            if io_executor:
+                for f in as_completed(io_futures):
+                    if f.exception():
+                        logger.error(f"Async I/O task failed: {f.exception()}")
+                io_executor.shutdown(wait=True)
+
+            final_results_list = []
+            if self._output_config:
+                for item in results:
+                    if isinstance(item, str) and item in async_results_map:
+                        final_results_list.append(async_results_map[item])
+            else:
+                final_results_list = results
+
             failed_job_ids = set()
             for job_id_with_source, error_msg in failures:
                 job_id = job_id_with_source.split(":", 1)[0]
@@ -775,18 +880,22 @@ class Ingestor:
                 if self._job_states[job_id].state != JobStateEnum.COMPLETED:
                     self._job_states[job_id].state = JobStateEnum.COMPLETED
 
-            if self._vdb_bulk_upload and
+            if self._vdb_bulk_upload and final_results_list:
                 with ThreadPoolExecutor(max_workers=1, thread_name_prefix="VDB_Uploader") as vdb_executor:
                     results_future = Future()
-                    results_future.set_result(
+                    results_future.set_result(final_results_list)
                     vdb_future = vdb_executor.submit(self._vdb_bulk_upload.run_async, results_future)
                     vdb_future.result()
 
+                if self._purge_results_after_vdb_upload and self._output_config:
+                    logger.info("Purging saved results from disk after successful VDB upload.")
+                    self._purge_saved_results(final_results_list)
+
             parent_trace_ids = (
                 self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
             )
 
-            returns = [
+            returns = [final_results_list]
             if return_failures:
                 returns.append(failures)
             if return_traces:
@@ -794,7 +903,7 @@ class Ingestor:
             if include_parent_trace_ids:
                 returns.append(parent_trace_ids)
 
-            final_result = tuple(returns) if len(returns) > 1 else
+            final_result = tuple(returns) if len(returns) > 1 else final_results_list
 
             if not final_future.done():
                 final_future.set_result(final_result)
@@ -812,6 +921,9 @@ class Ingestor:
             ):
                 job_state.state = final_state
 
+            if io_executor:
+                io_executor.shutdown(wait=False)
+
         processor_future.add_done_callback(_processor_done_callback)
         return final_future
 
@@ -963,11 +1075,18 @@ class Ingestor:
             **kwargs,
         )
 
+        api_document_type = EXTENSION_TO_DOCUMENT_TYPE.get(document_type.lower(), document_type)
+
         # Extract method from task_options for API schema
         method = task_options.pop("extract_method", None)
         if method is None:
             # Let ExtractTask constructor handle default method selection
-
+            if api_document_type == "docx":
+                method = "python_docx"
+            elif api_document_type == "pptx":
+                method = "python_pptx"
+            else:
+                method = "pdfium"  # Default fallback
 
         # Build params dict for API schema
         params = {k: v for k, v in task_options.items() if k != "document_type"}
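The default method now keys off the resolved document type rather than a single global fallback. A standalone sketch of that selection; the function and mapping names here are illustrative, not part of the package:

```python
EXTENSION_DEFAULTS = {"docx": "python_docx", "pptx": "python_pptx"}


def default_extract_method(api_document_type: str) -> str:
    """Per-type default shown above; PDFs and anything else fall back to pdfium."""
    return EXTENSION_DEFAULTS.get(api_document_type, "pdfium")


assert default_extract_method("pptx") == "python_pptx"
assert default_extract_method("pdf") == "pdfium"
```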
@@ -1088,13 +1207,9 @@ class Ingestor:
         Ingestor
             Returns self for chaining.
         """
-
-        if
-
-
-        # Provide default method if not specified (matching client StoreTask behavior)
-        if "method" not in kwargs:
-            kwargs["method"] = "minio"
+        deprecated_method = kwargs.pop("store_method", None)
+        if deprecated_method is not None:
+            logger.warning("`store_method` is deprecated and no longer used. Configure storage_uri instead.")
 
         task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
 
@@ -1102,7 +1217,9 @@ class Ingestor:
         store_params = {
             "structured": task_options.structured,
             "images": task_options.images,
-            "
+            "storage_uri": task_options.storage_uri,
+            "storage_options": task_options.storage_options,
+            "public_base_url": task_options.public_base_url,
             "params": task_options.params,
         }
         store_task = StoreTask(**store_params)
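On the fluent interface, storage is therefore configured with a URI plus backend options instead of a named `store_method` (passing `store_method` now only logs a deprecation warning). A hedged sketch of a chained call; the bucket, endpoint, and the pre-configured `ingestor` variable are placeholders:

```python
# `ingestor` is an already-configured Ingestor instance (assumed); store() returns self for chaining.
ingestor.store(
    structured=True,
    images=True,
    storage_uri="s3://my-bucket/nv-ingest-artifacts",       # placeholder bucket
    storage_options={"endpoint_url": "http://minio:9000"},  # placeholder backend options
    public_base_url="http://minio:9000/my-bucket",          # placeholder public URL
)
```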
@@ -1347,6 +1464,7 @@ class Ingestor:
             "api_key": task_options.api_key,
             "endpoint_url": task_options.endpoint_url,
             "prompt": task_options.prompt,
+            "system_prompt": task_options.system_prompt,
             "model_name": task_options.model_name,
         }
         caption_task = CaptionTask(**caption_params)
nv_ingest_client/nv_ingest_cli.py

@@ -76,7 +76,7 @@ logger = logging.getLogger(__name__)
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
 @click.option(
     "--api_version",
-    default="
+    default="v2",
     type=click.Choice(["v1", "v2"], case_sensitive=False),
     help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
 )
@@ -120,7 +120,7 @@ Each task must be specified with its type and corresponding options in the '[tas
 Example:
     --task 'split:{"split_by":"page", "split_length":10}'
     --task 'extract:{"document_type":"pdf", "extract_text":true}'
-    --task 'extract:{"document_type":"pdf", "extract_method":"
+    --task 'extract:{"document_type":"pdf", "extract_method":"nemotron_parse"}'
     --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
nv_ingest_client/primitives/tasks/caption.py

@@ -22,18 +22,24 @@ class CaptionTask(Task):
         api_key: str = None,
         endpoint_url: str = None,
         prompt: str = None,
+        system_prompt: str = None,
         model_name: str = None,
     ) -> None:
         super().__init__()
 
         # Use the API schema for validation
         validated_data = IngestTaskCaptionSchema(
-            api_key=api_key,
+            api_key=api_key,
+            endpoint_url=endpoint_url,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            model_name=model_name,
         )
 
         self._api_key = validated_data.api_key
         self._endpoint_url = validated_data.endpoint_url
         self._prompt = validated_data.prompt
+        self._system_prompt = validated_data.system_prompt
         self._model_name = validated_data.model_name
 
     def __str__(self) -> str:
@@ -49,6 +55,8 @@ class CaptionTask(Task):
             info += f"  endpoint_url: {self._endpoint_url}\n"
         if self._prompt:
             info += f"  prompt: {self._prompt}\n"
+        if self._system_prompt:
+            info += f"  system_prompt: {self._system_prompt}\n"
         if self._model_name:
             info += f"  model_name: {self._model_name}\n"
 
@@ -69,6 +77,9 @@ class CaptionTask(Task):
         if self._prompt:
             task_properties["prompt"] = self._prompt
 
+        if self._system_prompt:
+            task_properties["system_prompt"] = self._system_prompt
+
         if self._model_name:
             task_properties["model_name"] = self._model_name
 
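A system prompt can now ride alongside the user prompt on the caption task. A minimal construction example; the endpoint URL and model id are illustrative values, not defaults shipped with the package:

```python
from nv_ingest_client.primitives.tasks.caption import CaptionTask

caption_task = CaptionTask(
    endpoint_url="https://example.invalid/v1/chat/completions",  # illustrative endpoint
    prompt="Describe the key elements of this image in one sentence.",
    system_prompt="You are a terse, factual image captioner.",
    model_name="example/vision-language-model",                  # illustrative model id
)
print(caption_task)  # __str__ now reports system_prompt when it is set
```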
nv_ingest_client/primitives/tasks/extract.py

@@ -8,6 +8,8 @@
 
 import logging
 import os
+import warnings
+from typing import get_args
 from typing import Any
 from typing import Dict
 from typing import Literal
@@ -52,15 +54,27 @@ _DEFAULT_EXTRACTOR_MAP = {
 
 _Type_Extract_Method_PDF = Literal[
     "adobe",
-    "
+    "nemotron_parse",
     "haystack",
     "llama_parse",
     "pdfium",
     "tika",
     "unstructured_io",
+    "unstructured_local",
+    "pdfium_hybrid",
     "ocr",
 ]
 
+_Type_Extract_Method_DOCX = Literal[
+    "python_docx",
+    "render_as_pdf",
+]
+
+_Type_Extract_Method_PPTX = Literal[
+    "python_pptx",
+    "render_as_pdf",
+]
+
 _Type_Extract_Images_Method = Literal["group", "yolox"]
 
 _Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
@@ -74,7 +88,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method:
+        extract_method: Optional[str] = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -109,6 +123,12 @@ class ExtractTask(Task):
             )
             extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
 
+        if extract_method == "nemoretriever_parse":
+            logger.warning("'nemoretriever_parse' is deprecated. Please use 'nemotron_parse' instead.")
+            extract_method = "nemotron_parse"
+
+        self._validate_extract_method(document_type, extract_method)
+
         # Set default extract_charts if None
         if extract_charts is None:
             extract_charts = extract_tables
@@ -240,3 +260,31 @@
     @property
     def document_type(self):
         return self._document_type.value
+
+    def _validate_extract_method(self, document_type: str, extract_method: str):
+        doc_type = document_type.lower()
+
+        valid_docx = set(get_args(_Type_Extract_Method_DOCX))
+        valid_pptx = set(get_args(_Type_Extract_Method_PPTX))
+        valid_pdf = set(get_args(_Type_Extract_Method_PDF))
+
+        if doc_type == "docx" and extract_method not in valid_docx:
+            raise ValueError(f"'{extract_method}' is invalid for DOCX. Options: {valid_docx}")
+
+        elif doc_type == "pptx" and extract_method not in valid_pptx:
+            raise ValueError(f"'{extract_method}' is invalid for PPTX. Options: {valid_pptx}")
+
+        elif doc_type == "pdf" and extract_method not in valid_pdf:
+            raise ValueError(f"'{extract_method}' is invalid for PDF. Options: {valid_pdf}")
+
+        elif doc_type not in ["docx", "pptx", "pdf"]:
+            is_docx_method = extract_method in valid_docx
+            is_pptx_method = extract_method in valid_pptx
+            is_pdf_method = extract_method in valid_pdf
+
+            if (is_docx_method or is_pptx_method) and not is_pdf_method:
+                warnings.warn(
+                    f"extract_method '{extract_method}' is valid for Office documents but NOT for PDFs. "
+                    "If your batch includes PDFs, extraction may fail for those files. "
+                    "Consider leaving extract_method=None for mixed batches."
+                )
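The constructor now normalizes the deprecated `nemoretriever_parse` value and validates the method against the per-type sets above. A hedged sketch of the resulting behavior; only arguments shown in this diff are used, and other constructor options are assumed to keep their defaults:

```python
from nv_ingest_client.primitives.tasks.extract import ExtractTask

# Valid: python_docx is in the DOCX method set.
docx_task = ExtractTask(document_type="docx", extract_method="python_docx", extract_text=True)

# Invalid: a PDF-only method on a DOCX document now raises ValueError.
try:
    ExtractTask(document_type="docx", extract_method="pdfium")
except ValueError as err:
    print(err)
```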
nv_ingest_client/primitives/tasks/store.py

@@ -7,8 +7,7 @@
 # pylint: disable=too-many-arguments
 
 import logging
-from typing import Dict
-from typing import Literal
+from typing import Dict, Literal, Optional
 
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
@@ -17,23 +16,19 @@ from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_STORE_METHOD = "minio"
-
 
 class StoreTask(Task):
     """
    Object for image storage task.
     """
 
-    _Type_Content_Type = Literal["image",]
-
-    _Type_Store_Method = Literal["minio",]
-
     def __init__(
         self,
         structured: bool = True,
         images: bool = False,
-
+        storage_uri: Optional[str] = None,
+        storage_options: Optional[dict] = None,
+        public_base_url: Optional[str] = None,
        params: dict = None,
        **extra_params,
     ) -> None:
@@ -51,12 +46,19 @@ class StoreTask(Task):
 
         # Use the API schema for validation
         validated_data = IngestTaskStoreSchema(
-            structured=structured,
+            structured=structured,
+            images=images,
+            storage_uri=storage_uri,
+            storage_options=storage_options or {},
+            public_base_url=public_base_url,
+            params=merged_params,
         )
 
         self._structured = validated_data.structured
         self._images = validated_data.images
-        self.
+        self._storage_uri = validated_data.storage_uri
+        self._storage_options = validated_data.storage_options
+        self._public_base_url = validated_data.public_base_url
         self._params = validated_data.params
         self._extra_params = extra_params
 
@@ -68,7 +70,8 @@ class StoreTask(Task):
         info += "Store Task:\n"
         info += f"  store structured types: {self._structured}\n"
         info += f"  store image types: {self._images}\n"
-        info += f"
+        info += f"  storage uri: {self._storage_uri}\n"
+        info += f"  public base url: {self._public_base_url}\n"
         for key, value in self._extra_params.items():
             info += f"  {key}: {value}\n"
         for key, value in self._params.items():
@@ -81,9 +84,11 @@ class StoreTask(Task):
         """
 
         task_properties = {
-            "method": self._store_method,
             "structured": self._structured,
             "images": self._images,
+            "storage_uri": self._storage_uri,
+            "storage_options": self._storage_options,
+            "public_base_url": self._public_base_url,
             "params": self._params,
             **self._extra_params,
         }
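`StoreTask` now validates `storage_uri`, `storage_options`, and `public_base_url` through `IngestTaskStoreSchema` and serializes them into the task properties in place of the old `method` field. A minimal construction sketch; the URI, options, and URL values are placeholders:

```python
from nv_ingest_client.primitives.tasks.store import StoreTask

store_task = StoreTask(
    structured=True,
    images=True,
    storage_uri="file:///tmp/nv-ingest-store",          # placeholder local URI
    storage_options={"anon": False},                    # placeholder backend options
    public_base_url="http://localhost:8080/artifacts",  # placeholder public URL
)
print(store_task)  # __str__ now reports the storage uri and public base url
```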
nv_ingest_client/util/vdb/lancedb.py (new file)

@@ -0,0 +1,276 @@
+import logging
+
+
+from nv_ingest_client.util.vdb.adt_vdb import VDB
+from datetime import timedelta
+from functools import partial
+from urllib.parse import urlparse
+from nv_ingest_client.util.transport import infer_microservice
+import lancedb
+import pyarrow as pa
+
+logger = logging.getLogger(__name__)
+
+
+def create_lancedb_results(results):
+    """Transform NV-Ingest pipeline results into LanceDB ingestible rows.
+
+    The NV-Ingest pipeline provides nested lists of record dictionaries. This
+    helper extracts the inner `metadata` dict for each record, filters out
+    entries without an embedding, and returns a list of dictionaries with the
+    exact fields expected by the LanceDB table schema used in
+    `LanceDB.create_index`.
+
+    Parameters
+    ----------
+    results : list
+        Nested list-of-lists containing record dicts in the NV-Ingest format.
+
+    Returns
+    -------
+    list
+        List of dictionaries with keys: `vector` (embedding list), `text`
+        (string content), `metadata` (page number) and `source` (source id).
+
+    Notes
+    -----
+    - The function expects each inner record to have a `metadata` mapping
+      containing `embedding`, `content`, `content_metadata.page_number`, and
+      `source_metadata.source_id`.
+    - Records with `embedding is None` are skipped.
+    """
+    old_results = [res["metadata"] for result in results for res in result]
+    results = []
+    for result in old_results:
+        if result["embedding"] is None:
+            continue
+        results.append(
+            {
+                "vector": result["embedding"],
+                "text": result["content"],
+                "metadata": result["content_metadata"]["page_number"],
+                "source": result["source_metadata"]["source_id"],
+            }
+        )
+    return results
+
+
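An illustrative (hypothetical) input/output pair for this transform; real embeddings are 2048-dimensional, truncated here to three values for brevity:

```python
from nv_ingest_client.util.vdb.lancedb import create_lancedb_results

nested_records = [
    [
        {
            "metadata": {
                "embedding": [0.1, 0.2, 0.3],  # placeholder vector
                "content": "First chunk of text.",
                "content_metadata": {"page_number": 1},
                "source_metadata": {"source_id": "doc-1.pdf"},
            }
        },
        # Records without an embedding are dropped before any other field is read.
        {"metadata": {"embedding": None}},
    ]
]

rows = create_lancedb_results(nested_records)
# rows == [{"vector": [0.1, 0.2, 0.3], "text": "First chunk of text.",
#           "metadata": 1, "source": "doc-1.pdf"}]
```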
+class LanceDB(VDB):
+    """LanceDB operator implementing the VDB interface.
+
+    This class adapts NV-Ingest records to LanceDB, providing index creation,
+    ingestion, and retrieval hooks. The implementation is intentionally small
+    and focuses on the example configuration used in NV-Ingest evaluation
+    scripts.
+    """
+
+    def __init__(
+        self,
+        uri=None,
+        overwrite=True,
+        table_name="nv-ingest",
+        index_type="IVF_HNSW_SQ",
+        metric="l2",
+        num_partitions=16,
+        num_sub_vectors=256,
+        **kwargs
+    ):
+        """Initialize the LanceDB VDB operator.
+
+        Parameters
+        ----------
+        uri : str, optional
+            LanceDB connection URI (default is "lancedb" for local file-based
+            storage).
+        overwrite : bool, optional
+            If True, existing tables will be overwritten during index creation.
+            If False, new data will be appended to existing tables.
+        table_name : str, optional
+            Name of the LanceDB table to create/use (default is "nv-ingest").
+        index_type : str, optional
+            Type of vector index to create (default is "IVF_HNSW_SQ").
+        metric : str, optional
+            Distance metric for the vector index (default is "l2").
+        num_partitions : int, optional
+            Number of partitions for the vector index (default is 16).
+        num_sub_vectors : int, optional
+            Number of sub-vectors for the vector index (default is 256).
+        **kwargs : dict
+            Forwarded configuration options. This implementation does not
+            actively consume specific keys, but passing parameters such as
+            `uri`, `index_name`, or security options is supported by the
+            interface pattern and may be used by future enhancements.
+        """
+        self.uri = uri or "lancedb"
+        self.overwrite = overwrite
+        self.table_name = table_name
+        self.index_type = index_type
+        self.metric = metric
+        self.num_partitions = num_partitions
+        self.num_sub_vectors = num_sub_vectors
+        super().__init__(**kwargs)
+
+    def create_index(self, records=None, table_name="nv-ingest", **kwargs):
+        """Create a LanceDB table and populate it with transformed records.
+
+        This method connects to LanceDB, transforms NV-Ingest records using
+        `create_lancedb_results`, builds a PyArrow schema that matches the
+        expected table layout, and creates/overwrites a table named `bo`.
+
+        Parameters
+        ----------
+        records : list, optional
+            NV-Ingest records in nested list format (the same structure passed
+            to `run`). If ``None``, an empty table will be created.
+
+        table_name : str, optional
+            Name of the LanceDB table to create (default is "nv-ingest").
+
+        Returns
+        -------
+        table
+            The LanceDB table object returned by `db.create_table`.
+        """
+        db = lancedb.connect(uri=self.uri)
+        results = create_lancedb_results(records)
+        schema = pa.schema(
+            [
+                pa.field("vector", pa.list_(pa.float32(), 2048)),
+                pa.field("text", pa.string()),
+                pa.field("metadata", pa.string()),
+                pa.field("source", pa.string()),
+            ]
+        )
+        table = db.create_table(
+            table_name, data=results, schema=schema, mode="overwrite" if self.overwrite else "append"
+        )
+        return table
+
+    def write_to_index(
+        self,
+        records,
+        table=None,
+        index_type="IVF_HNSW_SQ",
+        metric="l2",
+        num_partitions=16,
+        num_sub_vectors=256,
+        **kwargs
+    ):
+        """Create an index on the LanceDB table and wait for it to become ready.
+
+        This function calls `table.create_index` with an IVF+HNSW+SQ index
+        configuration used in NV-Ingest benchmarks. After requesting index
+        construction it lists available indices and waits for each one to
+        reach a ready state using `table.wait_for_index`.
+
+        Parameters
+        ----------
+        records : list
+            The original records being indexed (not used directly in this
+            implementation but kept in the signature for consistency).
+        table : object
+            LanceDB table object returned by `create_index`.
+        """
+        table.create_index(
+            index_type=index_type,
+            metric=metric,
+            num_partitions=num_partitions,
+            num_sub_vectors=num_sub_vectors,
+            # accelerator="cuda",
+            vector_column_name="vector",
+        )
+        for index_stub in table.list_indices():
+            table.wait_for_index([index_stub.name], timeout=timedelta(seconds=600))
+
+    def retrieval(
+        self,
+        queries,
+        table=None,
+        embedding_endpoint="http://localhost:8012/v1",
+        nvidia_api_key=None,
+        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
+        result_fields=["text", "metadata", "source"],
+        top_k=10,
+        **kwargs
+    ):
+        """Run similarity search for a list of text queries.
+
+        This method converts textual queries to embeddings by calling the
+        transport helper `infer_microservice` (configured to use an NVIDIA
+        embedding model in the example) and performs a vector search against
+        the LanceDB `table`.
+
+        Parameters
+        ----------
+        queries : list[str]
+            Text queries to be embedded and searched.
+        table : object
+            LanceDB table object with a built vector index.
+        embedding_endpoint : str, optional
+            URL of the embedding microservice (default is
+            "http://localhost:8012/v1").
+        nvidia_api_key : str, optional
+            NVIDIA API key for authentication with the embedding service. If
+            ``None``, no authentication is used.
+        model_name : str, optional
+            Name of the embedding model to use (default is
+            "nvidia/llama-3.2-nv-embedqa-1b-v2").
+        result_fields : list, optional
+            List of field names to retrieve from each hit document (default is
+            `["text", "metadata", "source"]`).
+        top_k : int, optional
+            Number of top results to return per query (default is 10).
+
+        Returns
+        -------
+        list[list[dict]]
+            For each input query, a list of hit documents (each document is a
+            dict with fields such as `text`, `metadata`, and `source`). The
+            example limits each query to 20 results.
+        """
+        embed_model = partial(
+            infer_microservice,
+            model_name=model_name,
+            embedding_endpoint=embedding_endpoint,
+            nvidia_api_key=nvidia_api_key,
+            input_type="query",
+            output_names=["embeddings"],
+            grpc=not ("http" in urlparse(embedding_endpoint).scheme),
+        )
+        results = []
+        query_embeddings = embed_model(queries)
+        for query_embed in query_embeddings:
+            results.append(
+                table.search([query_embed], vector_column_name="vector").select(result_fields).limit(top_k).to_list()
+            )
+        return results
+
+    def run(self, records):
+        """Orchestrate index creation and data ingestion.
+
+        The `run` method is the public entry point used by NV-Ingest pipeline
+        tasks. A minimal implementation first ensures the table exists by
+        calling `create_index` and then kicks off index construction with
+        `write_to_index`.
+
+        Parameters
+        ----------
+        records : list
+            NV-Ingest records to index.
+
+        Returns
+        -------
+        list
+            The original `records` list is returned unchanged to make the
+            operator composable in pipelines.
+        """
+        table = self.create_index(records=records, table_name=self.table_name)
+        self.write_to_index(
+            records,
+            table=table,
+            index_type=self.index_type,
+            metric=self.metric,
+            num_partitions=self.num_partitions,
+            num_sub_vectors=self.num_sub_vectors,
+        )
+        return records
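A hedged end-to-end sketch of the new operator: ingest pipeline records into a local LanceDB table, then query it. The `records` variable, local path, and embedding endpoint are assumptions for illustration:

```python
import lancedb

from nv_ingest_client.util.vdb.lancedb import LanceDB

vdb = LanceDB(uri="./lancedb", table_name="nv-ingest", overwrite=True)

# `records` is the nested list-of-lists returned by an NV-Ingest run (assumed available).
vdb.run(records)

# Re-open the populated table and run retrieval; retrieval() embeds the query text
# through the configured embedding microservice before searching.
table = lancedb.connect("./lancedb").open_table("nv-ingest")
hits = vdb.retrieval(
    ["What does the report say about quarterly revenue?"],
    table=table,
    embedding_endpoint="http://localhost:8012/v1",  # assumed local embedding service
    top_k=5,
)
print(hits[0][0]["text"])
```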
nv_ingest_client/util/vdb/milvus.py

@@ -287,6 +287,10 @@ def create_nvingest_index_params(
     gpu_index: bool = True,
     gpu_search: bool = False,
     local_index: bool = True,
+    intermediate_graph_degree: int = 128,
+    graph_degree: int = 100,
+    m: int = 64,
+    ef_construction: int = 512,
 ) -> IndexParams:
     """
     Creates index params necessary to create an index for a collection. At a minimum,
@@ -326,8 +330,8 @@ def create_nvingest_index_params(
             index_type="GPU_CAGRA",
             metric_type="L2",
             params={
-                "intermediate_graph_degree":
-                "graph_degree":
+                "intermediate_graph_degree": intermediate_graph_degree,
+                "graph_degree": graph_degree,
                 "build_algo": "NN_DESCENT",
                 "cache_dataset_on_device": "true",
                 "adapt_for_cpu": "false" if gpu_search else "true",
@@ -339,7 +343,7 @@ def create_nvingest_index_params(
             index_name=DENSE_INDEX_NAME,
             index_type="HNSW",
             metric_type="L2",
-            params={"M":
+            params={"M": m, "efConstruction": ef_construction},
         )
     if sparse and local_index:
         index_params.add_index(
@@ -407,6 +411,10 @@ def create_nvingest_collection(
     recreate_meta: bool = False,
     username: str = None,
     password: str = None,
+    intermediate_graph_degree: int = 128,
+    graph_degree: int = 100,
+    m: int = 64,
+    ef_construction: int = 512,
 ) -> CollectionSchema:
     """
     Creates a milvus collection with an nv-ingest compatible schema under
@@ -457,6 +465,10 @@ def create_nvingest_collection(
         gpu_index=gpu_index,
         gpu_search=gpu_search,
         local_index=local_index,
+        intermediate_graph_degree=intermediate_graph_degree,
+        graph_degree=graph_degree,
+        m=m,
+        ef_construction=ef_construction,
     )
     create_collection(client, collection_name, schema, index_params, recreate=recreate)
     d_idx, s_idx = _get_index_types(index_params, sparse=sparse)
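The CAGRA and HNSW build parameters that were previously hard-coded are now arguments, and `create_nvingest_collection` forwards them to `create_nvingest_index_params`. A hedged sketch of calling the latter directly; parameters not shown in this diff are assumed to keep their defaults:

```python
from nv_ingest_client.util.vdb.milvus import create_nvingest_index_params

# GPU path: tune the CAGRA graph build.
gpu_index_params = create_nvingest_index_params(
    gpu_index=True,
    gpu_search=False,
    intermediate_graph_degree=128,
    graph_degree=100,
)

# CPU/local path: tune the HNSW index instead.
cpu_index_params = create_nvingest_index_params(
    gpu_index=False,
    local_index=True,
    m=64,
    ef_construction=512,
)
```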
@@ -949,6 +961,7 @@ def write_to_nvingest_collection(
     stream: bool = False,
     username: str = None,
     password: str = None,
+    no_wait_index: bool = False,
     **kwargs,
 ):
     """
@@ -1054,7 +1067,7 @@ def write_to_nvingest_collection(
         client,
         collection_name,
     )
-    if not local_index:
+    if not local_index and not no_wait_index:
         # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
         # know how long this should take, it is num_elements dependent.
         wait_for_index(collection_name, expected_rows, client)
@@ -1971,6 +1984,7 @@ class Milvus(VDB):
         threshold: int = 1000,
         username: str = None,
         password: str = None,
+        no_wait_index: bool = False,
         **kwargs,
     ):
         """
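`no_wait_index=True` skips the blocking `wait_for_index` step after writing, on both `write_to_nvingest_collection` and the `Milvus` operator. A hedged construction sketch; the collection name and `milvus_uri` parameter are assumptions for illustration — only the keyword added in this diff is confirmed by it:

```python
from nv_ingest_client.util.vdb.milvus import Milvus

vdb = Milvus(
    collection_name="nv_ingest_collection",  # assumed parameter / placeholder name
    milvus_uri="http://localhost:19530",     # assumed parameter / placeholder endpoint
    no_wait_index=True,                      # return as soon as rows are written
)
```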
{nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.11.27.dev20251127
+Version: 2025.12.17.dev20251217
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
 Requires-Dist: requests>=2.28.2
 Requires-Dist: setuptools>=78.1.1
 Requires-Dist: tqdm>=4.67.1
+Requires-Dist: lancedb>=0.25.3
 Provides-Extra: milvus
 Requires-Dist: pymilvus==2.5.10; extra == "milvus"
 Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
{nv_ingest_client-2025.11.27.dev20251127.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD

@@ -1,5 +1,5 @@
 nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest_client/nv_ingest_cli.py,sha256=
+nv_ingest_client/nv_ingest_cli.py,sha256=qeZJZq_ltnNFiytQNwMY3VAL7nBUXW2HnwMzBGaKQJ0,14452
 nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
@@ -8,7 +8,7 @@ nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI
 nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
 nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
 nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
-nv_ingest_client/client/interface.py,sha256=
+nv_ingest_client/client/interface.py,sha256=1gmFQ7bVQDiEweChN_Divv1Y87a4cNkEgH2Shp4tIMw,64915
 nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
 nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
 nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
@@ -16,16 +16,16 @@ nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQb
 nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
 nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
 nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
-nv_ingest_client/primitives/tasks/caption.py,sha256=
+nv_ingest_client/primitives/tasks/caption.py,sha256=w-xPKN77zruUel0md4OA-x2ciELSLY-8Px1ds76gak0,2498
 nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
 nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
 nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
-nv_ingest_client/primitives/tasks/extract.py,sha256=
+nv_ingest_client/primitives/tasks/extract.py,sha256=jTCOSQG1MG0RoQg4DxPgmYgeHQR7O24hmysygkWYyIY,11270
 nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
 nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
 nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
 nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
-nv_ingest_client/primitives/tasks/store.py,sha256=
+nv_ingest_client/primitives/tasks/store.py,sha256=UeIspL_RDPBbUV3gv8SK3tIoYNun8r4cSSMxXvBSaks,4575
 nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
 nv_ingest_client/primitives/tasks/task_base.py,sha256=Mrx6kgePJHolYd3Im6mVISXcVgdulLst2MYG5gPov9I,1687
 nv_ingest_client/primitives/tasks/task_factory.py,sha256=uvGQXjgWmeF015jPWmBhiclzfrUf3_yD2PPeirQBczM,3218
@@ -46,11 +46,12 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
 nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
 nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
-nv_ingest_client/util/vdb/
+nv_ingest_client/util/vdb/lancedb.py,sha256=mLykdOFkLC5-SpRvHAvt0do9rhyQDqy_H48D6hEtegw,10037
+nv_ingest_client/util/vdb/milvus.py,sha256=NLlsYU5LdESh0r_Psvn0vzGiNN-70iouOGr3RgZaMVg,81316
 nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
-nv_ingest_client-2025.
+nv_ingest_client-2025.12.17.dev20251217.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_client-2025.12.17.dev20251217.dist-info/METADATA,sha256=EbEZoUk3-GvCBAB2z0hqZjgMOGasw75hZCWTDk7yxpk,30658
+nv_ingest_client-2025.12.17.dev20251217.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_client-2025.12.17.dev20251217.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+nv_ingest_client-2025.12.17.dev20251217.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+nv_ingest_client-2025.12.17.dev20251217.dist-info/RECORD,,