nv-ingest-client 2025.8.10.dev20250810__py3-none-any.whl → 2025.8.12.dev20250812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/util/transport.py +2 -2
- nv_ingest_client/util/vdb/milvus.py +69 -29
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/RECORD +8 -8
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.8.10.dev20250810.dist-info → nv_ingest_client-2025.8.12.dev20250812.dist-info}/top_level.txt +0 -0
|
@@ -26,7 +26,7 @@ def infer_microservice(
|
|
|
26
26
|
client = NimClient(
|
|
27
27
|
model_interface=EmbeddingModelInterface(),
|
|
28
28
|
protocol="grpc",
|
|
29
|
-
endpoints=(embedding_endpoint,
|
|
29
|
+
endpoints=(embedding_endpoint, None),
|
|
30
30
|
auth_token=nvidia_api_key,
|
|
31
31
|
)
|
|
32
32
|
return client.infer(
|
|
@@ -43,7 +43,7 @@ def infer_microservice(
|
|
|
43
43
|
client = NimClient(
|
|
44
44
|
model_interface=EmbeddingModelInterface(),
|
|
45
45
|
protocol="http",
|
|
46
|
-
endpoints=(
|
|
46
|
+
endpoints=(None, embedding_endpoint),
|
|
47
47
|
auth_token=nvidia_api_key,
|
|
48
48
|
)
|
|
49
49
|
return client.infer(data, model_name, input_type=input_type, truncate=truncate, batch_size=batch_size)
|
|
@@ -1,23 +1,28 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import copy
|
|
1
3
|
import datetime
|
|
4
|
+
import json
|
|
2
5
|
import logging
|
|
6
|
+
import os
|
|
3
7
|
import time
|
|
8
|
+
from functools import partial
|
|
9
|
+
from pathlib import Path
|
|
4
10
|
from typing import Dict
|
|
5
11
|
from typing import List
|
|
6
12
|
from typing import Tuple
|
|
7
13
|
from typing import Union
|
|
8
14
|
from urllib.parse import urlparse
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from functools import partial
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
|
-
import numpy as np
|
|
15
|
-
import ast
|
|
16
|
-
import copy
|
|
17
15
|
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
18
|
import requests
|
|
19
|
+
from minio import Minio
|
|
20
|
+
from minio.commonconfig import CopySource
|
|
21
|
+
from minio.deleteobjects import DeleteObject
|
|
19
22
|
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
|
|
23
|
+
from nv_ingest_client.util.transport import infer_microservice
|
|
20
24
|
from nv_ingest_client.util.util import ClientConfigSchema
|
|
25
|
+
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
21
26
|
from pymilvus import AnnSearchRequest
|
|
22
27
|
from pymilvus import BulkInsertState
|
|
23
28
|
from pymilvus import Collection
|
|
@@ -36,13 +41,11 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
|
|
|
36
41
|
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
|
|
37
42
|
from pymilvus.orm.types import CONSISTENCY_BOUNDED
|
|
38
43
|
from scipy.sparse import csr_array
|
|
39
|
-
from nv_ingest_client.util.transport import infer_microservice
|
|
40
|
-
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
41
|
-
|
|
42
44
|
|
|
43
45
|
logger = logging.getLogger(__name__)
|
|
44
46
|
|
|
45
47
|
CONSISTENCY = CONSISTENCY_BOUNDED
|
|
48
|
+
MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
|
|
46
49
|
|
|
47
50
|
pandas_reader_map = {
|
|
48
51
|
".json": pd.read_json,
|
|
@@ -737,6 +740,10 @@ def bulk_insert_milvus(
|
|
|
737
740
|
collection_name: str,
|
|
738
741
|
writer: RemoteBulkWriter,
|
|
739
742
|
milvus_uri: str = "http://localhost:19530",
|
|
743
|
+
minio_endpoint: str = "localhost:9000",
|
|
744
|
+
access_key: str = "minioadmin",
|
|
745
|
+
secret_key: str = "minioadmin",
|
|
746
|
+
bucket_name: str = "nv-ingest",
|
|
740
747
|
):
|
|
741
748
|
"""
|
|
742
749
|
This function initialize the bulk ingest of all minio uploaded records, and checks for
|
|
@@ -754,28 +761,49 @@ def bulk_insert_milvus(
|
|
|
754
761
|
Milvus address with http(s) preffix and port. Can also be a file path, to activate
|
|
755
762
|
milvus-lite.
|
|
756
763
|
"""
|
|
764
|
+
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
757
765
|
|
|
758
766
|
connections.connect(uri=milvus_uri)
|
|
759
767
|
t_bulk_start = time.time()
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
files
|
|
764
|
-
|
|
765
|
-
|
|
768
|
+
task_ids = []
|
|
769
|
+
uploaded_files = []
|
|
770
|
+
for files in writer.batch_files:
|
|
771
|
+
for f in files:
|
|
772
|
+
# Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
|
|
773
|
+
# so we first copy objects from the source bucket into 'a-bucket' before inserting.
|
|
774
|
+
try:
|
|
775
|
+
minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
|
|
776
|
+
uploaded_files.append(f)
|
|
777
|
+
except Exception as e:
|
|
778
|
+
logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
|
|
779
|
+
|
|
780
|
+
task_id = utility.do_bulk_insert(
|
|
781
|
+
collection_name=collection_name,
|
|
782
|
+
files=files,
|
|
783
|
+
consistency_level=CONSISTENCY,
|
|
784
|
+
)
|
|
785
|
+
task_ids.append(task_id)
|
|
766
786
|
# list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
|
|
767
|
-
|
|
768
|
-
while state != "Completed":
|
|
769
|
-
task = utility.get_bulk_insert_state(task_id=task_id)
|
|
770
|
-
state = task.state_name
|
|
771
|
-
if state == "Completed":
|
|
772
|
-
t_bulk_end = time.time()
|
|
773
|
-
logger.info("Start time:", task.create_time_str)
|
|
774
|
-
logger.info("Imported row count:", task.row_count)
|
|
775
|
-
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
776
|
-
if task.state == BulkInsertState.ImportFailed:
|
|
777
|
-
logger.error("Failed reason:", task.failed_reason)
|
|
787
|
+
while len(task_ids) > 0:
|
|
778
788
|
time.sleep(1)
|
|
789
|
+
for task_id in task_ids:
|
|
790
|
+
task = utility.get_bulk_insert_state(task_id=task_id)
|
|
791
|
+
state = task.state_name
|
|
792
|
+
if state == "Completed":
|
|
793
|
+
logger.info(f"Task: {task_id}")
|
|
794
|
+
logger.info(f"Start time: {task.create_time_str}")
|
|
795
|
+
logger.info(f"Imported row count: {task.row_count}")
|
|
796
|
+
task_ids.remove(task_id)
|
|
797
|
+
if task.state == BulkInsertState.ImportFailed:
|
|
798
|
+
logger.error(f"Task: {task_id}")
|
|
799
|
+
logger.error(f"Failed reason: {task.failed_reason}")
|
|
800
|
+
task_ids.remove(task_id)
|
|
801
|
+
|
|
802
|
+
# Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
|
|
803
|
+
minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
|
|
804
|
+
|
|
805
|
+
t_bulk_end = time.time()
|
|
806
|
+
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
779
807
|
|
|
780
808
|
|
|
781
809
|
def create_bm25_model(
|
|
@@ -980,6 +1008,10 @@ def write_to_nvingest_collection(
|
|
|
980
1008
|
collection_name,
|
|
981
1009
|
)
|
|
982
1010
|
else:
|
|
1011
|
+
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1012
|
+
if not minio_client.bucket_exists(bucket_name):
|
|
1013
|
+
minio_client.make_bucket(bucket_name)
|
|
1014
|
+
|
|
983
1015
|
# Connections parameters to access the remote bucket
|
|
984
1016
|
conn = RemoteBulkWriter.S3ConnectParam(
|
|
985
1017
|
endpoint=minio_endpoint, # the default MinIO service started along with Milvus
|
|
@@ -998,7 +1030,15 @@ def write_to_nvingest_collection(
|
|
|
998
1030
|
cleaned_records,
|
|
999
1031
|
text_writer,
|
|
1000
1032
|
)
|
|
1001
|
-
bulk_insert_milvus(
|
|
1033
|
+
bulk_insert_milvus(
|
|
1034
|
+
collection_name,
|
|
1035
|
+
writer,
|
|
1036
|
+
milvus_uri,
|
|
1037
|
+
minio_endpoint,
|
|
1038
|
+
access_key,
|
|
1039
|
+
secret_key,
|
|
1040
|
+
bucket_name,
|
|
1041
|
+
)
|
|
1002
1042
|
# fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
|
|
1003
1043
|
client.refresh_load(collection_name)
|
|
1004
1044
|
|
|
@@ -37,18 +37,18 @@ nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPM
|
|
|
37
37
|
nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
|
|
38
38
|
nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
|
|
39
39
|
nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
|
|
40
|
-
nv_ingest_client/util/transport.py,sha256=
|
|
40
|
+
nv_ingest_client/util/transport.py,sha256=G1wrwaJLXf8S2yTuq6ZwG1NNMf4cHfp1igLxGQ87apc,1646
|
|
41
41
|
nv_ingest_client/util/util.py,sha256=6KQkE5vXmBUoxETpjLdtPYm1pNCYCnqwC-Df56ETGQ4,14748
|
|
42
42
|
nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
|
|
43
43
|
nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
44
|
nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
|
|
45
45
|
nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
|
|
46
46
|
nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
|
|
47
|
-
nv_ingest_client/util/vdb/milvus.py,sha256=
|
|
47
|
+
nv_ingest_client/util/vdb/milvus.py,sha256=qIX-evj5wIxODi7I6w2Hu9jOupEmLQLeom-eLn4Xon8,75212
|
|
48
48
|
nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
|
|
49
|
-
nv_ingest_client-2025.8.
|
|
50
|
-
nv_ingest_client-2025.8.
|
|
51
|
-
nv_ingest_client-2025.8.
|
|
52
|
-
nv_ingest_client-2025.8.
|
|
53
|
-
nv_ingest_client-2025.8.
|
|
54
|
-
nv_ingest_client-2025.8.
|
|
49
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
50
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/METADATA,sha256=GYWKUN1-UQpWKxTCmSewl9frvvEvm3DZNtqg5wajKXA,30737
|
|
51
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
52
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
|
|
53
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
|
|
54
|
+
nv_ingest_client-2025.8.12.dev20250812.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|