nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,26 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import copy
|
|
1
3
|
import datetime
|
|
4
|
+
import json
|
|
2
5
|
import logging
|
|
6
|
+
import os
|
|
3
7
|
import time
|
|
8
|
+
from functools import partial
|
|
9
|
+
from pathlib import Path
|
|
4
10
|
from typing import Dict
|
|
5
11
|
from typing import List
|
|
6
12
|
from typing import Tuple
|
|
7
13
|
from typing import Union
|
|
8
14
|
from urllib.parse import urlparse
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from functools import partial
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
|
-
import numpy as np
|
|
15
|
-
import ast
|
|
16
|
-
import copy
|
|
17
15
|
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
18
|
import requests
|
|
19
|
+
from minio import Minio
|
|
19
20
|
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
|
|
21
|
+
from nv_ingest_client.util.transport import infer_microservice
|
|
20
22
|
from nv_ingest_client.util.util import ClientConfigSchema
|
|
23
|
+
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
21
24
|
from pymilvus import AnnSearchRequest
|
|
22
25
|
from pymilvus import BulkInsertState
|
|
23
26
|
from pymilvus import Collection
|
|
@@ -36,8 +39,6 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
|
|
|
36
39
|
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
|
|
37
40
|
from pymilvus.orm.types import CONSISTENCY_BOUNDED
|
|
38
41
|
from scipy.sparse import csr_array
|
|
39
|
-
from nv_ingest_client.util.transport import infer_microservice
|
|
40
|
-
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
41
42
|
|
|
42
43
|
|
|
43
44
|
logger = logging.getLogger(__name__)
|
|
@@ -81,11 +82,10 @@ def create_nvingest_meta_schema():
|
|
|
81
82
|
|
|
82
83
|
def create_meta_collection(
|
|
83
84
|
schema: CollectionSchema,
|
|
84
|
-
milvus_uri: str = "http://localhost:19530",
|
|
85
85
|
collection_name: str = "meta",
|
|
86
86
|
recreate=False,
|
|
87
|
+
client: MilvusClient = None,
|
|
87
88
|
):
|
|
88
|
-
client = MilvusClient(milvus_uri)
|
|
89
89
|
if client.has_collection(collection_name) and not recreate:
|
|
90
90
|
# already exists, dont erase and recreate
|
|
91
91
|
return
|
|
@@ -103,7 +103,6 @@ def create_meta_collection(
|
|
|
103
103
|
def write_meta_collection(
|
|
104
104
|
collection_name: str,
|
|
105
105
|
fields: List[str],
|
|
106
|
-
milvus_uri: str = "http://localhost:19530",
|
|
107
106
|
creation_timestamp: str = None,
|
|
108
107
|
dense_index: str = None,
|
|
109
108
|
dense_dim: int = None,
|
|
@@ -111,6 +110,7 @@ def write_meta_collection(
|
|
|
111
110
|
embedding_model: str = None,
|
|
112
111
|
sparse_model: str = None,
|
|
113
112
|
meta_collection_name: str = "meta",
|
|
113
|
+
client: MilvusClient = None,
|
|
114
114
|
):
|
|
115
115
|
client_config = ClientConfigSchema()
|
|
116
116
|
data = {
|
|
@@ -129,14 +129,12 @@ def write_meta_collection(
|
|
|
129
129
|
},
|
|
130
130
|
"user_fields": [field.name for field in fields],
|
|
131
131
|
}
|
|
132
|
-
client = MilvusClient(milvus_uri)
|
|
133
132
|
client.insert(collection_name=meta_collection_name, data=data)
|
|
134
133
|
|
|
135
134
|
|
|
136
135
|
def log_new_meta_collection(
|
|
137
136
|
collection_name: str,
|
|
138
137
|
fields: List[str],
|
|
139
|
-
milvus_uri: str = "http://localhost:19530",
|
|
140
138
|
creation_timestamp: str = None,
|
|
141
139
|
dense_index: str = None,
|
|
142
140
|
dense_dim: int = None,
|
|
@@ -145,13 +143,13 @@ def log_new_meta_collection(
|
|
|
145
143
|
sparse_model: str = None,
|
|
146
144
|
meta_collection_name: str = "meta",
|
|
147
145
|
recreate: bool = False,
|
|
146
|
+
client: MilvusClient = None,
|
|
148
147
|
):
|
|
149
148
|
schema = create_nvingest_meta_schema()
|
|
150
|
-
create_meta_collection(schema,
|
|
149
|
+
create_meta_collection(schema, client=client, recreate=recreate)
|
|
151
150
|
write_meta_collection(
|
|
152
151
|
collection_name,
|
|
153
152
|
fields=fields,
|
|
154
|
-
milvus_uri=milvus_uri,
|
|
155
153
|
creation_timestamp=creation_timestamp,
|
|
156
154
|
dense_index=dense_index,
|
|
157
155
|
dense_dim=dense_dim,
|
|
@@ -159,6 +157,7 @@ def log_new_meta_collection(
|
|
|
159
157
|
embedding_model=embedding_model,
|
|
160
158
|
sparse_model=sparse_model,
|
|
161
159
|
meta_collection_name=meta_collection_name,
|
|
160
|
+
client=client,
|
|
162
161
|
)
|
|
163
162
|
|
|
164
163
|
|
|
@@ -168,12 +167,16 @@ def grab_meta_collection_info(
|
|
|
168
167
|
timestamp: str = None,
|
|
169
168
|
embedding_model: str = None,
|
|
170
169
|
embedding_dim: int = None,
|
|
171
|
-
|
|
170
|
+
client: MilvusClient = None,
|
|
171
|
+
milvus_uri: str = None,
|
|
172
|
+
username: str = None,
|
|
173
|
+
password: str = None,
|
|
172
174
|
):
|
|
173
175
|
timestamp = timestamp or ""
|
|
174
176
|
embedding_model = embedding_model or ""
|
|
175
177
|
embedding_dim = embedding_dim or ""
|
|
176
|
-
|
|
178
|
+
if milvus_uri:
|
|
179
|
+
client = MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
177
180
|
results = client.query_iterator(
|
|
178
181
|
collection_name=meta_collection_name,
|
|
179
182
|
output_fields=[
|
|
@@ -401,6 +404,8 @@ def create_nvingest_collection(
|
|
|
401
404
|
gpu_search: bool = False,
|
|
402
405
|
dense_dim: int = 2048,
|
|
403
406
|
recreate_meta: bool = False,
|
|
407
|
+
username: str = None,
|
|
408
|
+
password: str = None,
|
|
404
409
|
) -> CollectionSchema:
|
|
405
410
|
"""
|
|
406
411
|
Creates a milvus collection with an nv-ingest compatible schema under
|
|
@@ -410,9 +415,7 @@ def create_nvingest_collection(
|
|
|
410
415
|
----------
|
|
411
416
|
collection_name : str
|
|
412
417
|
Name of the collection to be created.
|
|
413
|
-
|
|
414
|
-
Milvus address with http(s) preffix and port. Can also be a file path, to activate
|
|
415
|
-
milvus-lite.
|
|
418
|
+
|
|
416
419
|
sparse : bool, optional
|
|
417
420
|
When set to true, this adds a Sparse index to the IndexParams, usually activated for
|
|
418
421
|
hybrid search.
|
|
@@ -423,6 +426,11 @@ def create_nvingest_collection(
|
|
|
423
426
|
If true, creates a GPU_CAGRA index for dense embeddings.
|
|
424
427
|
dense_dim : int, optional
|
|
425
428
|
Sets the dimension size for the dense embedding in the milvus schema.
|
|
429
|
+
username : str, optional
|
|
430
|
+
Milvus username.
|
|
431
|
+
password : str, optional
|
|
432
|
+
Milvus password.
|
|
433
|
+
|
|
426
434
|
|
|
427
435
|
Returns
|
|
428
436
|
-------
|
|
@@ -432,7 +440,7 @@ def create_nvingest_collection(
|
|
|
432
440
|
"""
|
|
433
441
|
local_index = False
|
|
434
442
|
if urlparse(milvus_uri).scheme:
|
|
435
|
-
connections.connect(uri=milvus_uri)
|
|
443
|
+
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
436
444
|
server_version = utility.get_server_version()
|
|
437
445
|
if "lite" in server_version:
|
|
438
446
|
gpu_index = False
|
|
@@ -441,7 +449,7 @@ def create_nvingest_collection(
|
|
|
441
449
|
if milvus_uri.endswith(".db"):
|
|
442
450
|
local_index = True
|
|
443
451
|
|
|
444
|
-
client = MilvusClient(milvus_uri)
|
|
452
|
+
client = MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
445
453
|
schema = create_nvingest_schema(dense_dim=dense_dim, sparse=sparse, local_index=local_index)
|
|
446
454
|
index_params = create_nvingest_index_params(
|
|
447
455
|
sparse=sparse,
|
|
@@ -454,11 +462,11 @@ def create_nvingest_collection(
|
|
|
454
462
|
log_new_meta_collection(
|
|
455
463
|
collection_name,
|
|
456
464
|
fields=schema.fields,
|
|
457
|
-
milvus_uri=milvus_uri,
|
|
458
465
|
dense_index=str(d_idx),
|
|
459
466
|
dense_dim=dense_dim,
|
|
460
467
|
sparse_index=str(s_idx),
|
|
461
468
|
recreate=recreate_meta,
|
|
469
|
+
client=client,
|
|
462
470
|
)
|
|
463
471
|
return schema
|
|
464
472
|
|
|
@@ -729,7 +737,7 @@ def write_records_minio(records, writer: RemoteBulkWriter) -> RemoteBulkWriter:
|
|
|
729
737
|
for element in records:
|
|
730
738
|
writer.append_row(element)
|
|
731
739
|
writer.commit()
|
|
732
|
-
|
|
740
|
+
logger.debug(f"Wrote data to: {writer.batch_files}")
|
|
733
741
|
return writer
|
|
734
742
|
|
|
735
743
|
|
|
@@ -737,6 +745,12 @@ def bulk_insert_milvus(
|
|
|
737
745
|
collection_name: str,
|
|
738
746
|
writer: RemoteBulkWriter,
|
|
739
747
|
milvus_uri: str = "http://localhost:19530",
|
|
748
|
+
minio_endpoint: str = "localhost:9000",
|
|
749
|
+
access_key: str = "minioadmin",
|
|
750
|
+
secret_key: str = "minioadmin",
|
|
751
|
+
bucket_name: str = None,
|
|
752
|
+
username: str = None,
|
|
753
|
+
password: str = None,
|
|
740
754
|
):
|
|
741
755
|
"""
|
|
742
756
|
This function initialize the bulk ingest of all minio uploaded records, and checks for
|
|
@@ -753,28 +767,42 @@ def bulk_insert_milvus(
|
|
|
753
767
|
milvus_uri : str,
|
|
754
768
|
Milvus address with http(s) preffix and port. Can also be a file path, to activate
|
|
755
769
|
milvus-lite.
|
|
770
|
+
username : str, optional
|
|
771
|
+
Milvus username.
|
|
772
|
+
password : str, optional
|
|
773
|
+
Milvus password.
|
|
756
774
|
"""
|
|
757
|
-
|
|
758
|
-
connections.connect(uri=milvus_uri)
|
|
775
|
+
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
759
776
|
t_bulk_start = time.time()
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
t_bulk_end = time.time()
|
|
772
|
-
print("Start time:", task.create_time_str)
|
|
773
|
-
print("Imported row count:", task.row_count)
|
|
774
|
-
print(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
775
|
-
if task.state == BulkInsertState.ImportFailed:
|
|
776
|
-
print("Failed reason:", task.failed_reason)
|
|
777
|
+
task_ids = []
|
|
778
|
+
|
|
779
|
+
for files in writer.batch_files:
|
|
780
|
+
task_id = utility.do_bulk_insert(
|
|
781
|
+
collection_name=collection_name,
|
|
782
|
+
files=files,
|
|
783
|
+
consistency_level=CONSISTENCY,
|
|
784
|
+
)
|
|
785
|
+
task_ids.append(task_id)
|
|
786
|
+
|
|
787
|
+
while len(task_ids) > 0:
|
|
777
788
|
time.sleep(1)
|
|
789
|
+
tasks = copy.copy(task_ids)
|
|
790
|
+
for task_id in tasks:
|
|
791
|
+
task = utility.get_bulk_insert_state(task_id=task_id)
|
|
792
|
+
state = task.state_name
|
|
793
|
+
logger.info(f"Checking task: {task_id} - imported rows: {task.row_count}")
|
|
794
|
+
if state == "Completed":
|
|
795
|
+
logger.info(f"Task: {task_id}")
|
|
796
|
+
logger.info(f"Start time: {task.create_time_str}")
|
|
797
|
+
logger.info(f"Imported row count: {task.row_count}")
|
|
798
|
+
task_ids.remove(task_id)
|
|
799
|
+
if task.state == BulkInsertState.ImportFailed:
|
|
800
|
+
logger.error(f"Task: {task_id}")
|
|
801
|
+
logger.error(f"Failed reason: {task.failed_reason}")
|
|
802
|
+
task_ids.remove(task_id)
|
|
803
|
+
|
|
804
|
+
t_bulk_end = time.time()
|
|
805
|
+
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
778
806
|
|
|
779
807
|
|
|
780
808
|
def create_bm25_model(
|
|
@@ -839,7 +867,7 @@ def create_bm25_model(
|
|
|
839
867
|
return bm25_ef
|
|
840
868
|
|
|
841
869
|
|
|
842
|
-
def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
|
|
870
|
+
def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
|
|
843
871
|
"""
|
|
844
872
|
This function takes the input records and creates a corpus,
|
|
845
873
|
factoring in filters (i.e. texts, charts, tables) and fits
|
|
@@ -857,12 +885,48 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
|
|
|
857
885
|
Milvus Collection to search against
|
|
858
886
|
"""
|
|
859
887
|
count = 0
|
|
860
|
-
for
|
|
861
|
-
client.insert(collection_name=collection_name, data=[
|
|
862
|
-
count +=
|
|
888
|
+
for idx in range(0, len(records), batch_size):
|
|
889
|
+
client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
|
|
890
|
+
count += len(records[idx : idx + batch_size])
|
|
863
891
|
logger.info(f"streamed {count} records")
|
|
864
892
|
|
|
865
893
|
|
|
894
|
+
def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
|
|
895
|
+
"""
|
|
896
|
+
This function waits for the index to be built. It checks
|
|
897
|
+
the indexed_rows of the index and waits for it to be equal
|
|
898
|
+
to the number of records. This only works for streaming inserts,
|
|
899
|
+
bulk inserts are not supported by this function
|
|
900
|
+
(refer to MilvusClient.refresh_load for bulk inserts).
|
|
901
|
+
"""
|
|
902
|
+
client.flush(collection_name)
|
|
903
|
+
index_names = utility.list_indexes(collection_name)
|
|
904
|
+
indexed_rows = 0
|
|
905
|
+
for index_name in index_names:
|
|
906
|
+
indexed_rows = 0
|
|
907
|
+
while indexed_rows < num_elements:
|
|
908
|
+
pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
|
|
909
|
+
for i in range(20):
|
|
910
|
+
new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
|
|
911
|
+
time.sleep(1)
|
|
912
|
+
logger.info(
|
|
913
|
+
f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
|
|
914
|
+
)
|
|
915
|
+
if new_indexed_rows == num_elements:
|
|
916
|
+
indexed_rows = new_indexed_rows
|
|
917
|
+
break
|
|
918
|
+
# check if indexed_rows is staying the same, too many times means something is wrong
|
|
919
|
+
if new_indexed_rows == indexed_rows:
|
|
920
|
+
pos_movement -= 1
|
|
921
|
+
else:
|
|
922
|
+
pos_movement = 10
|
|
923
|
+
# if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
|
|
924
|
+
if pos_movement == 0:
|
|
925
|
+
raise ValueError("Rows are not getting indexed as expected")
|
|
926
|
+
indexed_rows = new_indexed_rows
|
|
927
|
+
return indexed_rows
|
|
928
|
+
|
|
929
|
+
|
|
866
930
|
def write_to_nvingest_collection(
|
|
867
931
|
records,
|
|
868
932
|
collection_name: str,
|
|
@@ -878,12 +942,14 @@ def write_to_nvingest_collection(
|
|
|
878
942
|
compute_bm25_stats: bool = True,
|
|
879
943
|
access_key: str = "minioadmin",
|
|
880
944
|
secret_key: str = "minioadmin",
|
|
881
|
-
bucket_name: str =
|
|
945
|
+
bucket_name: str = None,
|
|
882
946
|
threshold: int = 1000,
|
|
883
947
|
meta_dataframe=None,
|
|
884
948
|
meta_source_field=None,
|
|
885
949
|
meta_fields=None,
|
|
886
950
|
stream: bool = False,
|
|
951
|
+
username: str = None,
|
|
952
|
+
password: str = None,
|
|
887
953
|
**kwargs,
|
|
888
954
|
):
|
|
889
955
|
"""
|
|
@@ -924,9 +990,13 @@ def write_to_nvingest_collection(
|
|
|
924
990
|
Minio bucket name.
|
|
925
991
|
stream : bool, optional
|
|
926
992
|
When true, the records will be inserted into milvus using the stream insert method.
|
|
993
|
+
username : str, optional
|
|
994
|
+
Milvus username.
|
|
995
|
+
password : str, optional
|
|
996
|
+
Milvus password.
|
|
927
997
|
"""
|
|
928
998
|
local_index = False
|
|
929
|
-
connections.connect(uri=milvus_uri)
|
|
999
|
+
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
930
1000
|
if urlparse(milvus_uri).scheme:
|
|
931
1001
|
server_version = utility.get_server_version()
|
|
932
1002
|
if "lite" in server_version:
|
|
@@ -949,7 +1019,7 @@ def write_to_nvingest_collection(
|
|
|
949
1019
|
elif local_index and sparse:
|
|
950
1020
|
bm25_ef = BM25EmbeddingFunction(build_default_analyzer(language="en"))
|
|
951
1021
|
bm25_ef.load(bm25_save_path)
|
|
952
|
-
client = MilvusClient(milvus_uri)
|
|
1022
|
+
client = MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
953
1023
|
schema = Collection(collection_name).schema
|
|
954
1024
|
if isinstance(meta_dataframe, str):
|
|
955
1025
|
meta_dataframe = pandas_file_reader(meta_dataframe)
|
|
@@ -978,7 +1048,16 @@ def write_to_nvingest_collection(
|
|
|
978
1048
|
client,
|
|
979
1049
|
collection_name,
|
|
980
1050
|
)
|
|
1051
|
+
if not local_index:
|
|
1052
|
+
# Make sure all rows are indexed, decided not to wrap in a timeout because we dont
|
|
1053
|
+
# know how long this should take, it is num_elements dependent.
|
|
1054
|
+
wait_for_index(collection_name, num_elements, client)
|
|
981
1055
|
else:
|
|
1056
|
+
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1057
|
+
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
1058
|
+
if not minio_client.bucket_exists(bucket_name):
|
|
1059
|
+
minio_client.make_bucket(bucket_name)
|
|
1060
|
+
|
|
982
1061
|
# Connections parameters to access the remote bucket
|
|
983
1062
|
conn = RemoteBulkWriter.S3ConnectParam(
|
|
984
1063
|
endpoint=minio_endpoint, # the default MinIO service started along with Milvus
|
|
@@ -997,9 +1076,20 @@ def write_to_nvingest_collection(
|
|
|
997
1076
|
cleaned_records,
|
|
998
1077
|
text_writer,
|
|
999
1078
|
)
|
|
1000
|
-
bulk_insert_milvus(
|
|
1079
|
+
bulk_insert_milvus(
|
|
1080
|
+
collection_name,
|
|
1081
|
+
writer,
|
|
1082
|
+
milvus_uri,
|
|
1083
|
+
minio_endpoint,
|
|
1084
|
+
access_key,
|
|
1085
|
+
secret_key,
|
|
1086
|
+
bucket_name,
|
|
1087
|
+
username=username,
|
|
1088
|
+
password=password,
|
|
1089
|
+
)
|
|
1001
1090
|
# fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
|
|
1002
1091
|
client.refresh_load(collection_name)
|
|
1092
|
+
logger.info(f"Refresh load response: {client.get_load_state(collection_name)}")
|
|
1003
1093
|
|
|
1004
1094
|
|
|
1005
1095
|
def dense_retrieval(
|
|
@@ -1028,8 +1118,8 @@ def dense_retrieval(
|
|
|
1028
1118
|
Milvus Collection to search against
|
|
1029
1119
|
client : MilvusClient
|
|
1030
1120
|
Client connected to mivlus instance.
|
|
1031
|
-
dense_model :
|
|
1032
|
-
|
|
1121
|
+
dense_model : Partial Function
|
|
1122
|
+
Partial function to generate dense embeddings with queries.
|
|
1033
1123
|
top_k : int
|
|
1034
1124
|
Number of search results to return per query.
|
|
1035
1125
|
dense_field : str
|
|
@@ -1043,7 +1133,8 @@ def dense_retrieval(
|
|
|
1043
1133
|
"""
|
|
1044
1134
|
dense_embeddings = []
|
|
1045
1135
|
for query in queries:
|
|
1046
|
-
dense_embeddings.append(dense_model.get_query_embedding(query))
|
|
1136
|
+
# dense_embeddings.append(dense_model.get_query_embedding(query))
|
|
1137
|
+
dense_embeddings += dense_model([query])
|
|
1047
1138
|
|
|
1048
1139
|
search_params = {}
|
|
1049
1140
|
if not gpu_search and not local_index:
|
|
@@ -1112,7 +1203,7 @@ def hybrid_retrieval(
|
|
|
1112
1203
|
dense_embeddings = []
|
|
1113
1204
|
sparse_embeddings = []
|
|
1114
1205
|
for query in queries:
|
|
1115
|
-
dense_embeddings
|
|
1206
|
+
dense_embeddings += dense_model([query])
|
|
1116
1207
|
if sparse_model:
|
|
1117
1208
|
sparse_embeddings.append(_format_sparse_embedding(sparse_model.encode_queries([query])))
|
|
1118
1209
|
else:
|
|
@@ -1181,6 +1272,9 @@ def nvingest_retrieval(
|
|
|
1181
1272
|
nv_ranker_max_batch_size: int = 64,
|
|
1182
1273
|
_filter: str = "",
|
|
1183
1274
|
ef_param: int = 200,
|
|
1275
|
+
client: MilvusClient = None,
|
|
1276
|
+
username: str = None,
|
|
1277
|
+
password: str = None,
|
|
1184
1278
|
**kwargs,
|
|
1185
1279
|
):
|
|
1186
1280
|
"""
|
|
@@ -1227,6 +1321,12 @@ def nvingest_retrieval(
|
|
|
1227
1321
|
Max size for the number of candidates to rerank.
|
|
1228
1322
|
nv_ranker_top_k : int,
|
|
1229
1323
|
The number of candidates to return after reranking.
|
|
1324
|
+
client : MilvusClient, optional
|
|
1325
|
+
Milvus client instance.
|
|
1326
|
+
username : str, optional
|
|
1327
|
+
Milvus username.
|
|
1328
|
+
password : str, optional
|
|
1329
|
+
Milvus password.
|
|
1230
1330
|
Returns
|
|
1231
1331
|
-------
|
|
1232
1332
|
List
|
|
@@ -1239,16 +1339,22 @@ def nvingest_retrieval(
|
|
|
1239
1339
|
kwargs.pop("vdb_op", None)
|
|
1240
1340
|
queries = kwargs.pop("queries", [])
|
|
1241
1341
|
return vdb_op.retrieval(queries, **kwargs)
|
|
1242
|
-
from llama_index.embeddings.nvidia import NVIDIAEmbedding
|
|
1243
1342
|
|
|
1244
1343
|
client_config = ClientConfigSchema()
|
|
1245
1344
|
nvidia_api_key = client_config.nvidia_api_key
|
|
1246
|
-
# required for NVIDIAEmbedding call if the endpoint is Nvidia build api.
|
|
1247
1345
|
embedding_endpoint = embedding_endpoint if embedding_endpoint else client_config.embedding_nim_endpoint
|
|
1248
1346
|
model_name = model_name if model_name else client_config.embedding_nim_model_name
|
|
1249
1347
|
local_index = False
|
|
1250
|
-
embed_model =
|
|
1251
|
-
|
|
1348
|
+
embed_model = partial(
|
|
1349
|
+
infer_microservice,
|
|
1350
|
+
model_name=model_name,
|
|
1351
|
+
embedding_endpoint=embedding_endpoint,
|
|
1352
|
+
nvidia_api_key=nvidia_api_key,
|
|
1353
|
+
input_type="query",
|
|
1354
|
+
output_names=["embeddings"],
|
|
1355
|
+
grpc=not ("http" in urlparse(embedding_endpoint).scheme),
|
|
1356
|
+
)
|
|
1357
|
+
client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
1252
1358
|
final_top_k = top_k
|
|
1253
1359
|
if nv_ranker:
|
|
1254
1360
|
top_k = nv_ranker_top_k
|
|
@@ -1304,7 +1410,14 @@ def nvingest_retrieval(
|
|
|
1304
1410
|
return results
|
|
1305
1411
|
|
|
1306
1412
|
|
|
1307
|
-
def remove_records(
|
|
1413
|
+
def remove_records(
|
|
1414
|
+
source_name: str,
|
|
1415
|
+
collection_name: str,
|
|
1416
|
+
milvus_uri: str = "http://localhost:19530",
|
|
1417
|
+
username: str = None,
|
|
1418
|
+
password: str = None,
|
|
1419
|
+
client: MilvusClient = None,
|
|
1420
|
+
):
|
|
1308
1421
|
"""
|
|
1309
1422
|
This function allows a user to remove chunks associated with an ingested file.
|
|
1310
1423
|
Supply the full path of the file you would like to remove and this function will
|
|
@@ -1319,6 +1432,12 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
|
|
|
1319
1432
|
milvus_uri : str,
|
|
1320
1433
|
Milvus address with http(s) preffix and port. Can also be a file path, to activate
|
|
1321
1434
|
milvus-lite.
|
|
1435
|
+
client : MilvusClient, optional
|
|
1436
|
+
Milvus client instance.
|
|
1437
|
+
username : str, optional
|
|
1438
|
+
Milvus username.
|
|
1439
|
+
password : str, optional
|
|
1440
|
+
Milvus password.
|
|
1322
1441
|
|
|
1323
1442
|
Returns
|
|
1324
1443
|
-------
|
|
@@ -1326,7 +1445,7 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
|
|
|
1326
1445
|
Dictionary with one key, `delete_cnt`. The value represents the number of entities
|
|
1327
1446
|
removed.
|
|
1328
1447
|
"""
|
|
1329
|
-
client = MilvusClient(milvus_uri)
|
|
1448
|
+
client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
1330
1449
|
result_ids = client.delete(
|
|
1331
1450
|
collection_name=collection_name,
|
|
1332
1451
|
filter=f'(source["source_name"] == "{source_name}")',
|
|
@@ -1433,6 +1552,9 @@ def pull_all_milvus(
|
|
|
1433
1552
|
write_dir: str = None,
|
|
1434
1553
|
batch_size: int = 1000,
|
|
1435
1554
|
include_embeddings: bool = False,
|
|
1555
|
+
username: str = None,
|
|
1556
|
+
password: str = None,
|
|
1557
|
+
client: MilvusClient = None,
|
|
1436
1558
|
):
|
|
1437
1559
|
"""
|
|
1438
1560
|
This function takes the input collection name and pulls all the records
|
|
@@ -1451,12 +1573,18 @@ def pull_all_milvus(
|
|
|
1451
1573
|
The number of records to pull in each batch. Defaults to 1000.
|
|
1452
1574
|
include_embeddings : bool, optional
|
|
1453
1575
|
Whether to include the embeddings in the output. Defaults to False.
|
|
1576
|
+
username : str, optional
|
|
1577
|
+
Milvus username.
|
|
1578
|
+
password : str, optional
|
|
1579
|
+
Milvus password.
|
|
1580
|
+
client : MilvusClient, optional
|
|
1581
|
+
Milvus client instance.
|
|
1454
1582
|
Returns
|
|
1455
1583
|
-------
|
|
1456
1584
|
List
|
|
1457
1585
|
List of records/files with records from the collection.
|
|
1458
1586
|
"""
|
|
1459
|
-
client = MilvusClient(milvus_uri)
|
|
1587
|
+
client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
1460
1588
|
output_fields = ["source", "content_metadata", "text"]
|
|
1461
1589
|
if include_embeddings:
|
|
1462
1590
|
output_fields.append("vector")
|
|
@@ -1525,12 +1653,15 @@ def embed_index_collection(
|
|
|
1525
1653
|
compute_bm25_stats: bool = True,
|
|
1526
1654
|
access_key: str = "minioadmin",
|
|
1527
1655
|
secret_key: str = "minioadmin",
|
|
1528
|
-
bucket_name: str =
|
|
1656
|
+
bucket_name: str = None,
|
|
1529
1657
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1530
1658
|
meta_source_field: str = None,
|
|
1531
1659
|
meta_fields: list[str] = None,
|
|
1532
|
-
|
|
1660
|
+
input_type: str = "passage",
|
|
1533
1661
|
truncate: str = "END",
|
|
1662
|
+
client: MilvusClient = None,
|
|
1663
|
+
username: str = None,
|
|
1664
|
+
password: str = None,
|
|
1534
1665
|
**kwargs,
|
|
1535
1666
|
):
|
|
1536
1667
|
"""
|
|
@@ -1562,17 +1693,22 @@ def embed_index_collection(
|
|
|
1562
1693
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1563
1694
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1564
1695
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1565
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1696
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1566
1697
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1567
1698
|
containing metadata. Defaults to None.
|
|
1568
1699
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
1569
1700
|
Defaults to None.
|
|
1570
1701
|
meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
|
|
1702
|
+
client : MilvusClient, optional
|
|
1703
|
+
Milvus client instance.
|
|
1704
|
+
username : str, optional
|
|
1705
|
+
Milvus username.
|
|
1706
|
+
password : str, optional
|
|
1707
|
+
Milvus password.
|
|
1571
1708
|
**kwargs: Additional keyword arguments for customization.
|
|
1572
1709
|
"""
|
|
1573
1710
|
client_config = ClientConfigSchema()
|
|
1574
1711
|
nvidia_api_key = nvidia_api_key if nvidia_api_key else client_config.nvidia_api_key
|
|
1575
|
-
# required for NVIDIAEmbedding call if the endpoint is Nvidia build api.
|
|
1576
1712
|
embedding_endpoint = embedding_endpoint if embedding_endpoint else client_config.embedding_nim_endpoint
|
|
1577
1713
|
model_name = model_name if model_name else client_config.embedding_nim_model_name
|
|
1578
1714
|
# if not scheme we assume we are using grpc
|
|
@@ -1601,6 +1737,8 @@ def embed_index_collection(
|
|
|
1601
1737
|
meta_dataframe=meta_dataframe,
|
|
1602
1738
|
meta_source_field=meta_source_field,
|
|
1603
1739
|
meta_fields=meta_fields,
|
|
1740
|
+
username=username,
|
|
1741
|
+
password=password,
|
|
1604
1742
|
**kwargs,
|
|
1605
1743
|
)
|
|
1606
1744
|
# running in parts
|
|
@@ -1614,7 +1752,7 @@ def embed_index_collection(
|
|
|
1614
1752
|
model_name,
|
|
1615
1753
|
embedding_endpoint,
|
|
1616
1754
|
nvidia_api_key,
|
|
1617
|
-
|
|
1755
|
+
input_type,
|
|
1618
1756
|
truncate,
|
|
1619
1757
|
batch_size,
|
|
1620
1758
|
grpc,
|
|
@@ -1632,7 +1770,7 @@ def embed_index_collection(
|
|
|
1632
1770
|
model_name,
|
|
1633
1771
|
embedding_endpoint,
|
|
1634
1772
|
nvidia_api_key,
|
|
1635
|
-
|
|
1773
|
+
input_type,
|
|
1636
1774
|
truncate,
|
|
1637
1775
|
batch_size,
|
|
1638
1776
|
grpc,
|
|
@@ -1670,7 +1808,7 @@ def reindex_collection(
|
|
|
1670
1808
|
compute_bm25_stats: bool = True,
|
|
1671
1809
|
access_key: str = "minioadmin",
|
|
1672
1810
|
secret_key: str = "minioadmin",
|
|
1673
|
-
bucket_name: str =
|
|
1811
|
+
bucket_name: str = None,
|
|
1674
1812
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1675
1813
|
meta_source_field: str = None,
|
|
1676
1814
|
meta_fields: list[str] = None,
|
|
@@ -1711,7 +1849,7 @@ def reindex_collection(
|
|
|
1711
1849
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1712
1850
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1713
1851
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1714
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1852
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1715
1853
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1716
1854
|
containing metadata. Defaults to None.
|
|
1717
1855
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -1819,11 +1957,14 @@ class Milvus(VDB):
|
|
|
1819
1957
|
compute_bm25_stats: bool = True,
|
|
1820
1958
|
access_key: str = "minioadmin",
|
|
1821
1959
|
secret_key: str = "minioadmin",
|
|
1822
|
-
bucket_name: str =
|
|
1960
|
+
bucket_name: str = None,
|
|
1823
1961
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1824
1962
|
meta_source_field: str = None,
|
|
1825
1963
|
meta_fields: list[str] = None,
|
|
1826
1964
|
stream: bool = False,
|
|
1965
|
+
threshold: int = 1000,
|
|
1966
|
+
username: str = None,
|
|
1967
|
+
password: str = None,
|
|
1827
1968
|
**kwargs,
|
|
1828
1969
|
):
|
|
1829
1970
|
"""
|
|
@@ -1847,15 +1988,17 @@ class Milvus(VDB):
|
|
|
1847
1988
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1848
1989
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1849
1990
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1850
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1991
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1851
1992
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1852
1993
|
containing metadata. Defaults to None.
|
|
1853
1994
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
1854
1995
|
Defaults to None.
|
|
1855
1996
|
meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
|
|
1856
|
-
**kwargs: Additional keyword arguments for customization.
|
|
1857
1997
|
stream (bool, optional): When true, the records will be inserted into milvus using the stream
|
|
1858
1998
|
insert method.
|
|
1999
|
+
username (str, optional): The username for Milvus authentication. Defaults to None.
|
|
2000
|
+
password (str, optional): The password for Milvus authentication. Defaults to None.
|
|
2001
|
+
**kwargs: Additional keyword arguments for customization.
|
|
1859
2002
|
"""
|
|
1860
2003
|
kwargs = locals().copy()
|
|
1861
2004
|
kwargs.pop("self", None)
|
|
@@ -1885,6 +2028,8 @@ class Milvus(VDB):
|
|
|
1885
2028
|
"gpu_index": self.__dict__.get("gpu_index", True),
|
|
1886
2029
|
"gpu_search": self.__dict__.get("gpu_search", True),
|
|
1887
2030
|
"dense_dim": self.__dict__.get("dense_dim", 2048),
|
|
2031
|
+
"username": self.__dict__.get("username", None),
|
|
2032
|
+
"password": self.__dict__.get("password", None),
|
|
1888
2033
|
}
|
|
1889
2034
|
return (self.collection_name, conn_dict)
|
|
1890
2035
|
|