nv-ingest-client 2025.8.10.dev20250810__py3-none-any.whl → 2025.8.12.dev20250812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

@@ -26,7 +26,7 @@ def infer_microservice(
26
26
  client = NimClient(
27
27
  model_interface=EmbeddingModelInterface(),
28
28
  protocol="grpc",
29
- endpoints=(embedding_endpoint, embedding_endpoint),
29
+ endpoints=(embedding_endpoint, None),
30
30
  auth_token=nvidia_api_key,
31
31
  )
32
32
  return client.infer(
@@ -43,7 +43,7 @@ def infer_microservice(
43
43
  client = NimClient(
44
44
  model_interface=EmbeddingModelInterface(),
45
45
  protocol="http",
46
- endpoints=(embedding_endpoint, embedding_endpoint),
46
+ endpoints=(None, embedding_endpoint),
47
47
  auth_token=nvidia_api_key,
48
48
  )
49
49
  return client.infer(data, model_name, input_type=input_type, truncate=truncate, batch_size=batch_size)
@@ -1,23 +1,28 @@
1
+ import ast
2
+ import copy
1
3
  import datetime
4
+ import json
2
5
  import logging
6
+ import os
3
7
  import time
8
+ from functools import partial
9
+ from pathlib import Path
4
10
  from typing import Dict
5
11
  from typing import List
6
12
  from typing import Tuple
7
13
  from typing import Union
8
14
  from urllib.parse import urlparse
9
- from pathlib import Path
10
- import pandas as pd
11
- from functools import partial
12
- import json
13
- import os
14
- import numpy as np
15
- import ast
16
- import copy
17
15
 
16
+ import numpy as np
17
+ import pandas as pd
18
18
  import requests
19
+ from minio import Minio
20
+ from minio.commonconfig import CopySource
21
+ from minio.deleteobjects import DeleteObject
19
22
  from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
23
+ from nv_ingest_client.util.transport import infer_microservice
20
24
  from nv_ingest_client.util.util import ClientConfigSchema
25
+ from nv_ingest_client.util.vdb.adt_vdb import VDB
21
26
  from pymilvus import AnnSearchRequest
22
27
  from pymilvus import BulkInsertState
23
28
  from pymilvus import Collection
@@ -36,13 +41,11 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
36
41
  from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
37
42
  from pymilvus.orm.types import CONSISTENCY_BOUNDED
38
43
  from scipy.sparse import csr_array
39
- from nv_ingest_client.util.transport import infer_microservice
40
- from nv_ingest_client.util.vdb.adt_vdb import VDB
41
-
42
44
 
43
45
  logger = logging.getLogger(__name__)
44
46
 
45
47
  CONSISTENCY = CONSISTENCY_BOUNDED
48
+ MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
46
49
 
47
50
  pandas_reader_map = {
48
51
  ".json": pd.read_json,
@@ -737,6 +740,10 @@ def bulk_insert_milvus(
737
740
  collection_name: str,
738
741
  writer: RemoteBulkWriter,
739
742
  milvus_uri: str = "http://localhost:19530",
743
+ minio_endpoint: str = "localhost:9000",
744
+ access_key: str = "minioadmin",
745
+ secret_key: str = "minioadmin",
746
+ bucket_name: str = "nv-ingest",
740
747
  ):
741
748
  """
742
749
  This function initializes the bulk ingest of all minio uploaded records, and checks for
@@ -754,28 +761,49 @@ def bulk_insert_milvus(
754
761
  Milvus address with http(s) prefix and port. Can also be a file path, to activate
755
762
  milvus-lite.
756
763
  """
764
+ minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
757
765
 
758
766
  connections.connect(uri=milvus_uri)
759
767
  t_bulk_start = time.time()
760
- files_to_upload = [_file for file_set in writer.batch_files for _file in file_set]
761
- task_id = utility.do_bulk_insert(
762
- collection_name=collection_name,
763
- files=files_to_upload,
764
- consistency_level=CONSISTENCY,
765
- )
768
+ task_ids = []
769
+ uploaded_files = []
770
+ for files in writer.batch_files:
771
+ for f in files:
772
+ # Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
773
+ # so we first copy objects from the source bucket into 'a-bucket' before inserting.
774
+ try:
775
+ minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
776
+ uploaded_files.append(f)
777
+ except Exception as e:
778
+ logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
779
+
780
+ task_id = utility.do_bulk_insert(
781
+ collection_name=collection_name,
782
+ files=files,
783
+ consistency_level=CONSISTENCY,
784
+ )
785
+ task_ids.append(task_id)
766
786
  # list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
767
- state = "Pending"
768
- while state != "Completed":
769
- task = utility.get_bulk_insert_state(task_id=task_id)
770
- state = task.state_name
771
- if state == "Completed":
772
- t_bulk_end = time.time()
773
- logger.info("Start time:", task.create_time_str)
774
- logger.info("Imported row count:", task.row_count)
775
- logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
776
- if task.state == BulkInsertState.ImportFailed:
777
- logger.error("Failed reason:", task.failed_reason)
787
+ while len(task_ids) > 0:
778
788
  time.sleep(1)
789
+ for task_id in task_ids:
790
+ task = utility.get_bulk_insert_state(task_id=task_id)
791
+ state = task.state_name
792
+ if state == "Completed":
793
+ logger.info(f"Task: {task_id}")
794
+ logger.info(f"Start time: {task.create_time_str}")
795
+ logger.info(f"Imported row count: {task.row_count}")
796
+ task_ids.remove(task_id)
797
+ if task.state == BulkInsertState.ImportFailed:
798
+ logger.error(f"Task: {task_id}")
799
+ logger.error(f"Failed reason: {task.failed_reason}")
800
+ task_ids.remove(task_id)
801
+
802
+ # Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
803
+ minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
804
+
805
+ t_bulk_end = time.time()
806
+ logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
779
807
 
780
808
 
781
809
  def create_bm25_model(
@@ -980,6 +1008,10 @@ def write_to_nvingest_collection(
980
1008
  collection_name,
981
1009
  )
982
1010
  else:
1011
+ minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1012
+ if not minio_client.bucket_exists(bucket_name):
1013
+ minio_client.make_bucket(bucket_name)
1014
+
983
1015
  # Connections parameters to access the remote bucket
984
1016
  conn = RemoteBulkWriter.S3ConnectParam(
985
1017
  endpoint=minio_endpoint, # the default MinIO service started along with Milvus
@@ -998,7 +1030,15 @@ def write_to_nvingest_collection(
998
1030
  cleaned_records,
999
1031
  text_writer,
1000
1032
  )
1001
- bulk_insert_milvus(collection_name, writer, milvus_uri)
1033
+ bulk_insert_milvus(
1034
+ collection_name,
1035
+ writer,
1036
+ milvus_uri,
1037
+ minio_endpoint,
1038
+ access_key,
1039
+ secret_key,
1040
+ bucket_name,
1041
+ )
1002
1042
  # fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
1003
1043
  client.refresh_load(collection_name)
1004
1044
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.8.10.dev20250810
3
+ Version: 2025.8.12.dev20250812
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -37,18 +37,18 @@ nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPM
37
37
  nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
38
38
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
39
39
  nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
40
- nv_ingest_client/util/transport.py,sha256=Rzdj9GxYsJVbGuh95H2AoHTMsFj-oZC1TiN9pT5vRPA,1674
40
+ nv_ingest_client/util/transport.py,sha256=G1wrwaJLXf8S2yTuq6ZwG1NNMf4cHfp1igLxGQ87apc,1646
41
41
  nv_ingest_client/util/util.py,sha256=6KQkE5vXmBUoxETpjLdtPYm1pNCYCnqwC-Df56ETGQ4,14748
42
42
  nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
43
43
  nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
45
45
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
46
46
  nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
47
- nv_ingest_client/util/vdb/milvus.py,sha256=uVgCTjg-Njz9Lq_sURNlZEky4KtdapxPr6a5Ug6vCmo,73511
47
+ nv_ingest_client/util/vdb/milvus.py,sha256=qIX-evj5wIxODi7I6w2Hu9jOupEmLQLeom-eLn4Xon8,75212
48
48
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
49
- nv_ingest_client-2025.8.10.dev20250810.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
50
- nv_ingest_client-2025.8.10.dev20250810.dist-info/METADATA,sha256=jqMm_wuxEx1RnNUukpX93lZMHI3L0N5rnfSf6VYet40,30737
51
- nv_ingest_client-2025.8.10.dev20250810.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
52
- nv_ingest_client-2025.8.10.dev20250810.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
53
- nv_ingest_client-2025.8.10.dev20250810.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
54
- nv_ingest_client-2025.8.10.dev20250810.dist-info/RECORD,,
49
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
50
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/METADATA,sha256=GYWKUN1-UQpWKxTCmSewl9frvvEvm3DZNtqg5wajKXA,30737
51
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
52
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
53
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
54
+ nv_ingest_client-2025.8.12.dev20250812.dist-info/RECORD,,