nv-ingest-client 2025.8.11.dev20250811__py3-none-any.whl → 2025.8.13.dev20250813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/util/vdb/milvus.py +46 -12
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/RECORD +7 -7
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.8.11.dev20250811.dist-info → nv_ingest_client-2025.8.13.dev20250813.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,28 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import copy
|
|
1
3
|
import datetime
|
|
4
|
+
import json
|
|
2
5
|
import logging
|
|
6
|
+
import os
|
|
3
7
|
import time
|
|
8
|
+
from functools import partial
|
|
9
|
+
from pathlib import Path
|
|
4
10
|
from typing import Dict
|
|
5
11
|
from typing import List
|
|
6
12
|
from typing import Tuple
|
|
7
13
|
from typing import Union
|
|
8
14
|
from urllib.parse import urlparse
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from functools import partial
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
|
-
import numpy as np
|
|
15
|
-
import ast
|
|
16
|
-
import copy
|
|
17
15
|
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
18
|
import requests
|
|
19
|
+
from minio import Minio
|
|
20
|
+
from minio.commonconfig import CopySource
|
|
21
|
+
from minio.deleteobjects import DeleteObject
|
|
19
22
|
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
|
|
23
|
+
from nv_ingest_client.util.transport import infer_microservice
|
|
20
24
|
from nv_ingest_client.util.util import ClientConfigSchema
|
|
25
|
+
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
21
26
|
from pymilvus import AnnSearchRequest
|
|
22
27
|
from pymilvus import BulkInsertState
|
|
23
28
|
from pymilvus import Collection
|
|
@@ -36,13 +41,11 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
|
|
|
36
41
|
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
|
|
37
42
|
from pymilvus.orm.types import CONSISTENCY_BOUNDED
|
|
38
43
|
from scipy.sparse import csr_array
|
|
39
|
-
from nv_ingest_client.util.transport import infer_microservice
|
|
40
|
-
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
41
|
-
|
|
42
44
|
|
|
43
45
|
logger = logging.getLogger(__name__)
|
|
44
46
|
|
|
45
47
|
CONSISTENCY = CONSISTENCY_BOUNDED
|
|
48
|
+
MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
|
|
46
49
|
|
|
47
50
|
pandas_reader_map = {
|
|
48
51
|
".json": pd.read_json,
|
|
@@ -737,6 +740,10 @@ def bulk_insert_milvus(
|
|
|
737
740
|
collection_name: str,
|
|
738
741
|
writer: RemoteBulkWriter,
|
|
739
742
|
milvus_uri: str = "http://localhost:19530",
|
|
743
|
+
minio_endpoint: str = "localhost:9000",
|
|
744
|
+
access_key: str = "minioadmin",
|
|
745
|
+
secret_key: str = "minioadmin",
|
|
746
|
+
bucket_name: str = "nv-ingest",
|
|
740
747
|
):
|
|
741
748
|
"""
|
|
742
749
|
This function initializes the bulk ingest of all MinIO-uploaded records, and checks for
|
|
@@ -754,11 +761,22 @@ def bulk_insert_milvus(
|
|
|
754
761
|
Milvus address with http(s) prefix and port. Can also be a file path, to activate
|
|
755
762
|
milvus-lite.
|
|
756
763
|
"""
|
|
764
|
+
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
757
765
|
|
|
758
766
|
connections.connect(uri=milvus_uri)
|
|
759
767
|
t_bulk_start = time.time()
|
|
760
768
|
task_ids = []
|
|
769
|
+
uploaded_files = []
|
|
761
770
|
for files in writer.batch_files:
|
|
771
|
+
for f in files:
|
|
772
|
+
# Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
|
|
773
|
+
# so we first copy objects from the source bucket into 'a-bucket' before inserting.
|
|
774
|
+
try:
|
|
775
|
+
minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
|
|
776
|
+
uploaded_files.append(f)
|
|
777
|
+
except Exception as e:
|
|
778
|
+
logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
|
|
779
|
+
|
|
762
780
|
task_id = utility.do_bulk_insert(
|
|
763
781
|
collection_name=collection_name,
|
|
764
782
|
files=files,
|
|
@@ -780,6 +798,10 @@ def bulk_insert_milvus(
|
|
|
780
798
|
logger.error(f"Task: {task_id}")
|
|
781
799
|
logger.error(f"Failed reason: {task.failed_reason}")
|
|
782
800
|
task_ids.remove(task_id)
|
|
801
|
+
|
|
802
|
+
# Cleanup: remove the temporary copies that were made in the default bucket for the bulk insert.
|
|
803
|
+
minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
|
|
804
|
+
|
|
783
805
|
t_bulk_end = time.time()
|
|
784
806
|
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
785
807
|
|
|
@@ -986,6 +1008,10 @@ def write_to_nvingest_collection(
|
|
|
986
1008
|
collection_name,
|
|
987
1009
|
)
|
|
988
1010
|
else:
|
|
1011
|
+
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1012
|
+
if not minio_client.bucket_exists(bucket_name):
|
|
1013
|
+
minio_client.make_bucket(bucket_name)
|
|
1014
|
+
|
|
989
1015
|
# Connections parameters to access the remote bucket
|
|
990
1016
|
conn = RemoteBulkWriter.S3ConnectParam(
|
|
991
1017
|
endpoint=minio_endpoint, # the default MinIO service started along with Milvus
|
|
@@ -1004,7 +1030,15 @@ def write_to_nvingest_collection(
|
|
|
1004
1030
|
cleaned_records,
|
|
1005
1031
|
text_writer,
|
|
1006
1032
|
)
|
|
1007
|
-
bulk_insert_milvus(
|
|
1033
|
+
bulk_insert_milvus(
|
|
1034
|
+
collection_name,
|
|
1035
|
+
writer,
|
|
1036
|
+
milvus_uri,
|
|
1037
|
+
minio_endpoint,
|
|
1038
|
+
access_key,
|
|
1039
|
+
secret_key,
|
|
1040
|
+
bucket_name,
|
|
1041
|
+
)
|
|
1008
1042
|
# fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
|
|
1009
1043
|
client.refresh_load(collection_name)
|
|
1010
1044
|
|
|
@@ -44,11 +44,11 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
|
|
|
44
44
|
nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
|
|
45
45
|
nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
|
|
46
46
|
nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
|
|
47
|
-
nv_ingest_client/util/vdb/milvus.py,sha256=
|
|
47
|
+
nv_ingest_client/util/vdb/milvus.py,sha256=qIX-evj5wIxODi7I6w2Hu9jOupEmLQLeom-eLn4Xon8,75212
|
|
48
48
|
nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
|
|
49
|
-
nv_ingest_client-2025.8.
|
|
50
|
-
nv_ingest_client-2025.8.
|
|
51
|
-
nv_ingest_client-2025.8.
|
|
52
|
-
nv_ingest_client-2025.8.
|
|
53
|
-
nv_ingest_client-2025.8.
|
|
54
|
-
nv_ingest_client-2025.8.
|
|
49
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
50
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/METADATA,sha256=XAxE0M-2XLRkOhng51l7ImFGj8An7ridxkpMKTMhCw0,30737
|
|
51
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
52
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
|
|
53
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
|
|
54
|
+
nv_ingest_client-2025.8.13.dev20250813.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|