nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,26 @@
1
+ import ast
2
+ import copy
1
3
  import datetime
4
+ import json
2
5
  import logging
6
+ import os
3
7
  import time
8
+ from functools import partial
9
+ from pathlib import Path
4
10
  from typing import Dict
5
11
  from typing import List
6
12
  from typing import Tuple
7
13
  from typing import Union
8
14
  from urllib.parse import urlparse
9
- from pathlib import Path
10
- import pandas as pd
11
- from functools import partial
12
- import json
13
- import os
14
- import numpy as np
15
- import ast
16
- import copy
17
15
 
16
+ import numpy as np
17
+ import pandas as pd
18
18
  import requests
19
+ from minio import Minio
19
20
  from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
21
+ from nv_ingest_client.util.transport import infer_microservice
20
22
  from nv_ingest_client.util.util import ClientConfigSchema
23
+ from nv_ingest_client.util.vdb.adt_vdb import VDB
21
24
  from pymilvus import AnnSearchRequest
22
25
  from pymilvus import BulkInsertState
23
26
  from pymilvus import Collection
@@ -36,8 +39,6 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
36
39
  from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
37
40
  from pymilvus.orm.types import CONSISTENCY_BOUNDED
38
41
  from scipy.sparse import csr_array
39
- from nv_ingest_client.util.transport import infer_microservice
40
- from nv_ingest_client.util.vdb.adt_vdb import VDB
41
42
 
42
43
 
43
44
  logger = logging.getLogger(__name__)
@@ -81,11 +82,10 @@ def create_nvingest_meta_schema():
81
82
 
82
83
  def create_meta_collection(
83
84
  schema: CollectionSchema,
84
- milvus_uri: str = "http://localhost:19530",
85
85
  collection_name: str = "meta",
86
86
  recreate=False,
87
+ client: MilvusClient = None,
87
88
  ):
88
- client = MilvusClient(milvus_uri)
89
89
  if client.has_collection(collection_name) and not recreate:
90
90
  # already exists, dont erase and recreate
91
91
  return
@@ -103,7 +103,6 @@ def create_meta_collection(
103
103
  def write_meta_collection(
104
104
  collection_name: str,
105
105
  fields: List[str],
106
- milvus_uri: str = "http://localhost:19530",
107
106
  creation_timestamp: str = None,
108
107
  dense_index: str = None,
109
108
  dense_dim: int = None,
@@ -111,6 +110,7 @@ def write_meta_collection(
111
110
  embedding_model: str = None,
112
111
  sparse_model: str = None,
113
112
  meta_collection_name: str = "meta",
113
+ client: MilvusClient = None,
114
114
  ):
115
115
  client_config = ClientConfigSchema()
116
116
  data = {
@@ -129,14 +129,12 @@ def write_meta_collection(
129
129
  },
130
130
  "user_fields": [field.name for field in fields],
131
131
  }
132
- client = MilvusClient(milvus_uri)
133
132
  client.insert(collection_name=meta_collection_name, data=data)
134
133
 
135
134
 
136
135
  def log_new_meta_collection(
137
136
  collection_name: str,
138
137
  fields: List[str],
139
- milvus_uri: str = "http://localhost:19530",
140
138
  creation_timestamp: str = None,
141
139
  dense_index: str = None,
142
140
  dense_dim: int = None,
@@ -145,13 +143,13 @@ def log_new_meta_collection(
145
143
  sparse_model: str = None,
146
144
  meta_collection_name: str = "meta",
147
145
  recreate: bool = False,
146
+ client: MilvusClient = None,
148
147
  ):
149
148
  schema = create_nvingest_meta_schema()
150
- create_meta_collection(schema, milvus_uri, recreate=recreate)
149
+ create_meta_collection(schema, client=client, recreate=recreate)
151
150
  write_meta_collection(
152
151
  collection_name,
153
152
  fields=fields,
154
- milvus_uri=milvus_uri,
155
153
  creation_timestamp=creation_timestamp,
156
154
  dense_index=dense_index,
157
155
  dense_dim=dense_dim,
@@ -159,6 +157,7 @@ def log_new_meta_collection(
159
157
  embedding_model=embedding_model,
160
158
  sparse_model=sparse_model,
161
159
  meta_collection_name=meta_collection_name,
160
+ client=client,
162
161
  )
163
162
 
164
163
 
@@ -168,12 +167,16 @@ def grab_meta_collection_info(
168
167
  timestamp: str = None,
169
168
  embedding_model: str = None,
170
169
  embedding_dim: int = None,
171
- milvus_uri: str = "http://localhost:19530",
170
+ client: MilvusClient = None,
171
+ milvus_uri: str = None,
172
+ username: str = None,
173
+ password: str = None,
172
174
  ):
173
175
  timestamp = timestamp or ""
174
176
  embedding_model = embedding_model or ""
175
177
  embedding_dim = embedding_dim or ""
176
- client = MilvusClient(milvus_uri)
178
+ if milvus_uri:
179
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
177
180
  results = client.query_iterator(
178
181
  collection_name=meta_collection_name,
179
182
  output_fields=[
@@ -401,6 +404,8 @@ def create_nvingest_collection(
401
404
  gpu_search: bool = False,
402
405
  dense_dim: int = 2048,
403
406
  recreate_meta: bool = False,
407
+ username: str = None,
408
+ password: str = None,
404
409
  ) -> CollectionSchema:
405
410
  """
406
411
  Creates a milvus collection with an nv-ingest compatible schema under
@@ -410,9 +415,7 @@ def create_nvingest_collection(
410
415
  ----------
411
416
  collection_name : str
412
417
  Name of the collection to be created.
413
- milvus_uri : str,
414
- Milvus address with http(s) preffix and port. Can also be a file path, to activate
415
- milvus-lite.
418
+
416
419
  sparse : bool, optional
417
420
  When set to true, this adds a Sparse index to the IndexParams, usually activated for
418
421
  hybrid search.
@@ -423,6 +426,11 @@ def create_nvingest_collection(
423
426
  If true, creates a GPU_CAGRA index for dense embeddings.
424
427
  dense_dim : int, optional
425
428
  Sets the dimension size for the dense embedding in the milvus schema.
429
+ username : str, optional
430
+ Milvus username.
431
+ password : str, optional
432
+ Milvus password.
433
+
426
434
 
427
435
  Returns
428
436
  -------
@@ -432,7 +440,7 @@ def create_nvingest_collection(
432
440
  """
433
441
  local_index = False
434
442
  if urlparse(milvus_uri).scheme:
435
- connections.connect(uri=milvus_uri)
443
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
436
444
  server_version = utility.get_server_version()
437
445
  if "lite" in server_version:
438
446
  gpu_index = False
@@ -441,7 +449,7 @@ def create_nvingest_collection(
441
449
  if milvus_uri.endswith(".db"):
442
450
  local_index = True
443
451
 
444
- client = MilvusClient(milvus_uri)
452
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
445
453
  schema = create_nvingest_schema(dense_dim=dense_dim, sparse=sparse, local_index=local_index)
446
454
  index_params = create_nvingest_index_params(
447
455
  sparse=sparse,
@@ -454,11 +462,11 @@ def create_nvingest_collection(
454
462
  log_new_meta_collection(
455
463
  collection_name,
456
464
  fields=schema.fields,
457
- milvus_uri=milvus_uri,
458
465
  dense_index=str(d_idx),
459
466
  dense_dim=dense_dim,
460
467
  sparse_index=str(s_idx),
461
468
  recreate=recreate_meta,
469
+ client=client,
462
470
  )
463
471
  return schema
464
472
 
@@ -729,7 +737,7 @@ def write_records_minio(records, writer: RemoteBulkWriter) -> RemoteBulkWriter:
729
737
  for element in records:
730
738
  writer.append_row(element)
731
739
  writer.commit()
732
- print(f"Wrote data to: {writer.batch_files}")
740
+ logger.debug(f"Wrote data to: {writer.batch_files}")
733
741
  return writer
734
742
 
735
743
 
@@ -737,6 +745,12 @@ def bulk_insert_milvus(
737
745
  collection_name: str,
738
746
  writer: RemoteBulkWriter,
739
747
  milvus_uri: str = "http://localhost:19530",
748
+ minio_endpoint: str = "localhost:9000",
749
+ access_key: str = "minioadmin",
750
+ secret_key: str = "minioadmin",
751
+ bucket_name: str = None,
752
+ username: str = None,
753
+ password: str = None,
740
754
  ):
741
755
  """
742
756
  This function initialize the bulk ingest of all minio uploaded records, and checks for
@@ -753,28 +767,42 @@ def bulk_insert_milvus(
753
767
  milvus_uri : str,
754
768
  Milvus address with http(s) preffix and port. Can also be a file path, to activate
755
769
  milvus-lite.
770
+ username : str, optional
771
+ Milvus username.
772
+ password : str, optional
773
+ Milvus password.
756
774
  """
757
-
758
- connections.connect(uri=milvus_uri)
775
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
759
776
  t_bulk_start = time.time()
760
- task_id = utility.do_bulk_insert(
761
- collection_name=collection_name,
762
- files=writer.batch_files[0],
763
- consistency_level=CONSISTENCY,
764
- )
765
- # list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
766
- state = "Pending"
767
- while state != "Completed":
768
- task = utility.get_bulk_insert_state(task_id=task_id)
769
- state = task.state_name
770
- if state == "Completed":
771
- t_bulk_end = time.time()
772
- print("Start time:", task.create_time_str)
773
- print("Imported row count:", task.row_count)
774
- print(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
775
- if task.state == BulkInsertState.ImportFailed:
776
- print("Failed reason:", task.failed_reason)
777
+ task_ids = []
778
+
779
+ for files in writer.batch_files:
780
+ task_id = utility.do_bulk_insert(
781
+ collection_name=collection_name,
782
+ files=files,
783
+ consistency_level=CONSISTENCY,
784
+ )
785
+ task_ids.append(task_id)
786
+
787
+ while len(task_ids) > 0:
777
788
  time.sleep(1)
789
+ tasks = copy.copy(task_ids)
790
+ for task_id in tasks:
791
+ task = utility.get_bulk_insert_state(task_id=task_id)
792
+ state = task.state_name
793
+ logger.info(f"Checking task: {task_id} - imported rows: {task.row_count}")
794
+ if state == "Completed":
795
+ logger.info(f"Task: {task_id}")
796
+ logger.info(f"Start time: {task.create_time_str}")
797
+ logger.info(f"Imported row count: {task.row_count}")
798
+ task_ids.remove(task_id)
799
+ if task.state == BulkInsertState.ImportFailed:
800
+ logger.error(f"Task: {task_id}")
801
+ logger.error(f"Failed reason: {task.failed_reason}")
802
+ task_ids.remove(task_id)
803
+
804
+ t_bulk_end = time.time()
805
+ logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
778
806
 
779
807
 
780
808
  def create_bm25_model(
@@ -839,7 +867,7 @@ def create_bm25_model(
839
867
  return bm25_ef
840
868
 
841
869
 
842
- def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
870
+ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
843
871
  """
844
872
  This function takes the input records and creates a corpus,
845
873
  factoring in filters (i.e. texts, charts, tables) and fits
@@ -857,12 +885,48 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
857
885
  Milvus Collection to search against
858
886
  """
859
887
  count = 0
860
- for element in records:
861
- client.insert(collection_name=collection_name, data=[element])
862
- count += 1
888
+ for idx in range(0, len(records), batch_size):
889
+ client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
890
+ count += len(records[idx : idx + batch_size])
863
891
  logger.info(f"streamed {count} records")
864
892
 
865
893
 
894
+ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
895
+ """
896
+ This function waits for the index to be built. It checks
897
+ the indexed_rows of the index and waits for it to be equal
898
+ to the number of records. This only works for streaming inserts,
899
+ bulk inserts are not supported by this function
900
+ (refer to MilvusClient.refresh_load for bulk inserts).
901
+ """
902
+ client.flush(collection_name)
903
+ index_names = utility.list_indexes(collection_name)
904
+ indexed_rows = 0
905
+ for index_name in index_names:
906
+ indexed_rows = 0
907
+ while indexed_rows < num_elements:
908
+ pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
909
+ for i in range(20):
910
+ new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
911
+ time.sleep(1)
912
+ logger.info(
913
+ f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
914
+ )
915
+ if new_indexed_rows == num_elements:
916
+ indexed_rows = new_indexed_rows
917
+ break
918
+ # check if indexed_rows is staying the same, too many times means something is wrong
919
+ if new_indexed_rows == indexed_rows:
920
+ pos_movement -= 1
921
+ else:
922
+ pos_movement = 10
923
+ # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
924
+ if pos_movement == 0:
925
+ raise ValueError("Rows are not getting indexed as expected")
926
+ indexed_rows = new_indexed_rows
927
+ return indexed_rows
928
+
929
+
866
930
  def write_to_nvingest_collection(
867
931
  records,
868
932
  collection_name: str,
@@ -878,12 +942,14 @@ def write_to_nvingest_collection(
878
942
  compute_bm25_stats: bool = True,
879
943
  access_key: str = "minioadmin",
880
944
  secret_key: str = "minioadmin",
881
- bucket_name: str = "a-bucket",
945
+ bucket_name: str = None,
882
946
  threshold: int = 1000,
883
947
  meta_dataframe=None,
884
948
  meta_source_field=None,
885
949
  meta_fields=None,
886
950
  stream: bool = False,
951
+ username: str = None,
952
+ password: str = None,
887
953
  **kwargs,
888
954
  ):
889
955
  """
@@ -924,9 +990,13 @@ def write_to_nvingest_collection(
924
990
  Minio bucket name.
925
991
  stream : bool, optional
926
992
  When true, the records will be inserted into milvus using the stream insert method.
993
+ username : str, optional
994
+ Milvus username.
995
+ password : str, optional
996
+ Milvus password.
927
997
  """
928
998
  local_index = False
929
- connections.connect(uri=milvus_uri)
999
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
930
1000
  if urlparse(milvus_uri).scheme:
931
1001
  server_version = utility.get_server_version()
932
1002
  if "lite" in server_version:
@@ -949,7 +1019,7 @@ def write_to_nvingest_collection(
949
1019
  elif local_index and sparse:
950
1020
  bm25_ef = BM25EmbeddingFunction(build_default_analyzer(language="en"))
951
1021
  bm25_ef.load(bm25_save_path)
952
- client = MilvusClient(milvus_uri)
1022
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
953
1023
  schema = Collection(collection_name).schema
954
1024
  if isinstance(meta_dataframe, str):
955
1025
  meta_dataframe = pandas_file_reader(meta_dataframe)
@@ -978,7 +1048,16 @@ def write_to_nvingest_collection(
978
1048
  client,
979
1049
  collection_name,
980
1050
  )
1051
+ if not local_index:
1052
+ # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
1053
+ # know how long this should take, it is num_elements dependent.
1054
+ wait_for_index(collection_name, num_elements, client)
981
1055
  else:
1056
+ minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1057
+ bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
1058
+ if not minio_client.bucket_exists(bucket_name):
1059
+ minio_client.make_bucket(bucket_name)
1060
+
982
1061
  # Connections parameters to access the remote bucket
983
1062
  conn = RemoteBulkWriter.S3ConnectParam(
984
1063
  endpoint=minio_endpoint, # the default MinIO service started along with Milvus
@@ -997,9 +1076,20 @@ def write_to_nvingest_collection(
997
1076
  cleaned_records,
998
1077
  text_writer,
999
1078
  )
1000
- bulk_insert_milvus(collection_name, writer, milvus_uri)
1079
+ bulk_insert_milvus(
1080
+ collection_name,
1081
+ writer,
1082
+ milvus_uri,
1083
+ minio_endpoint,
1084
+ access_key,
1085
+ secret_key,
1086
+ bucket_name,
1087
+ username=username,
1088
+ password=password,
1089
+ )
1001
1090
  # fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
1002
1091
  client.refresh_load(collection_name)
1092
+ logger.info(f"Refresh load response: {client.get_load_state(collection_name)}")
1003
1093
 
1004
1094
 
1005
1095
  def dense_retrieval(
@@ -1028,8 +1118,8 @@ def dense_retrieval(
1028
1118
  Milvus Collection to search against
1029
1119
  client : MilvusClient
1030
1120
  Client connected to mivlus instance.
1031
- dense_model : NVIDIAEmbedding
1032
- Dense model to generate dense embeddings for queries.
1121
+ dense_model : Partial Function
1122
+ Partial function to generate dense embeddings with queries.
1033
1123
  top_k : int
1034
1124
  Number of search results to return per query.
1035
1125
  dense_field : str
@@ -1043,7 +1133,8 @@ def dense_retrieval(
1043
1133
  """
1044
1134
  dense_embeddings = []
1045
1135
  for query in queries:
1046
- dense_embeddings.append(dense_model.get_query_embedding(query))
1136
+ # dense_embeddings.append(dense_model.get_query_embedding(query))
1137
+ dense_embeddings += dense_model([query])
1047
1138
 
1048
1139
  search_params = {}
1049
1140
  if not gpu_search and not local_index:
@@ -1112,7 +1203,7 @@ def hybrid_retrieval(
1112
1203
  dense_embeddings = []
1113
1204
  sparse_embeddings = []
1114
1205
  for query in queries:
1115
- dense_embeddings.append(dense_model.get_query_embedding(query))
1206
+ dense_embeddings += dense_model([query])
1116
1207
  if sparse_model:
1117
1208
  sparse_embeddings.append(_format_sparse_embedding(sparse_model.encode_queries([query])))
1118
1209
  else:
@@ -1181,6 +1272,9 @@ def nvingest_retrieval(
1181
1272
  nv_ranker_max_batch_size: int = 64,
1182
1273
  _filter: str = "",
1183
1274
  ef_param: int = 200,
1275
+ client: MilvusClient = None,
1276
+ username: str = None,
1277
+ password: str = None,
1184
1278
  **kwargs,
1185
1279
  ):
1186
1280
  """
@@ -1227,6 +1321,12 @@ def nvingest_retrieval(
1227
1321
  Max size for the number of candidates to rerank.
1228
1322
  nv_ranker_top_k : int,
1229
1323
  The number of candidates to return after reranking.
1324
+ client : MilvusClient, optional
1325
+ Milvus client instance.
1326
+ username : str, optional
1327
+ Milvus username.
1328
+ password : str, optional
1329
+ Milvus password.
1230
1330
  Returns
1231
1331
  -------
1232
1332
  List
@@ -1239,16 +1339,22 @@ def nvingest_retrieval(
1239
1339
  kwargs.pop("vdb_op", None)
1240
1340
  queries = kwargs.pop("queries", [])
1241
1341
  return vdb_op.retrieval(queries, **kwargs)
1242
- from llama_index.embeddings.nvidia import NVIDIAEmbedding
1243
1342
 
1244
1343
  client_config = ClientConfigSchema()
1245
1344
  nvidia_api_key = client_config.nvidia_api_key
1246
- # required for NVIDIAEmbedding call if the endpoint is Nvidia build api.
1247
1345
  embedding_endpoint = embedding_endpoint if embedding_endpoint else client_config.embedding_nim_endpoint
1248
1346
  model_name = model_name if model_name else client_config.embedding_nim_model_name
1249
1347
  local_index = False
1250
- embed_model = NVIDIAEmbedding(base_url=embedding_endpoint, model=model_name, nvidia_api_key=nvidia_api_key)
1251
- client = MilvusClient(milvus_uri)
1348
+ embed_model = partial(
1349
+ infer_microservice,
1350
+ model_name=model_name,
1351
+ embedding_endpoint=embedding_endpoint,
1352
+ nvidia_api_key=nvidia_api_key,
1353
+ input_type="query",
1354
+ output_names=["embeddings"],
1355
+ grpc=not ("http" in urlparse(embedding_endpoint).scheme),
1356
+ )
1357
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1252
1358
  final_top_k = top_k
1253
1359
  if nv_ranker:
1254
1360
  top_k = nv_ranker_top_k
@@ -1304,7 +1410,14 @@ def nvingest_retrieval(
1304
1410
  return results
1305
1411
 
1306
1412
 
1307
- def remove_records(source_name: str, collection_name: str, milvus_uri: str = "http://localhost:19530"):
1413
+ def remove_records(
1414
+ source_name: str,
1415
+ collection_name: str,
1416
+ milvus_uri: str = "http://localhost:19530",
1417
+ username: str = None,
1418
+ password: str = None,
1419
+ client: MilvusClient = None,
1420
+ ):
1308
1421
  """
1309
1422
  This function allows a user to remove chunks associated with an ingested file.
1310
1423
  Supply the full path of the file you would like to remove and this function will
@@ -1319,6 +1432,12 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
1319
1432
  milvus_uri : str,
1320
1433
  Milvus address with http(s) preffix and port. Can also be a file path, to activate
1321
1434
  milvus-lite.
1435
+ client : MilvusClient, optional
1436
+ Milvus client instance.
1437
+ username : str, optional
1438
+ Milvus username.
1439
+ password : str, optional
1440
+ Milvus password.
1322
1441
 
1323
1442
  Returns
1324
1443
  -------
@@ -1326,7 +1445,7 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
1326
1445
  Dictionary with one key, `delete_cnt`. The value represents the number of entities
1327
1446
  removed.
1328
1447
  """
1329
- client = MilvusClient(milvus_uri)
1448
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1330
1449
  result_ids = client.delete(
1331
1450
  collection_name=collection_name,
1332
1451
  filter=f'(source["source_name"] == "{source_name}")',
@@ -1433,6 +1552,9 @@ def pull_all_milvus(
1433
1552
  write_dir: str = None,
1434
1553
  batch_size: int = 1000,
1435
1554
  include_embeddings: bool = False,
1555
+ username: str = None,
1556
+ password: str = None,
1557
+ client: MilvusClient = None,
1436
1558
  ):
1437
1559
  """
1438
1560
  This function takes the input collection name and pulls all the records
@@ -1451,12 +1573,18 @@ def pull_all_milvus(
1451
1573
  The number of records to pull in each batch. Defaults to 1000.
1452
1574
  include_embeddings : bool, optional
1453
1575
  Whether to include the embeddings in the output. Defaults to False.
1576
+ username : str, optional
1577
+ Milvus username.
1578
+ password : str, optional
1579
+ Milvus password.
1580
+ client : MilvusClient, optional
1581
+ Milvus client instance.
1454
1582
  Returns
1455
1583
  -------
1456
1584
  List
1457
1585
  List of records/files with records from the collection.
1458
1586
  """
1459
- client = MilvusClient(milvus_uri)
1587
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1460
1588
  output_fields = ["source", "content_metadata", "text"]
1461
1589
  if include_embeddings:
1462
1590
  output_fields.append("vector")
@@ -1525,12 +1653,15 @@ def embed_index_collection(
1525
1653
  compute_bm25_stats: bool = True,
1526
1654
  access_key: str = "minioadmin",
1527
1655
  secret_key: str = "minioadmin",
1528
- bucket_name: str = "a-bucket",
1656
+ bucket_name: str = None,
1529
1657
  meta_dataframe: Union[str, pd.DataFrame] = None,
1530
1658
  meta_source_field: str = None,
1531
1659
  meta_fields: list[str] = None,
1532
- intput_type: str = "passage",
1660
+ input_type: str = "passage",
1533
1661
  truncate: str = "END",
1662
+ client: MilvusClient = None,
1663
+ username: str = None,
1664
+ password: str = None,
1534
1665
  **kwargs,
1535
1666
  ):
1536
1667
  """
@@ -1562,17 +1693,22 @@ def embed_index_collection(
1562
1693
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1563
1694
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1564
1695
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1565
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "a-bucket".
1696
+ bucket_name (str, optional): The name of the MinIO bucket.
1566
1697
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1567
1698
  containing metadata. Defaults to None.
1568
1699
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
1569
1700
  Defaults to None.
1570
1701
  meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
1702
+ client : MilvusClient, optional
1703
+ Milvus client instance.
1704
+ username : str, optional
1705
+ Milvus username.
1706
+ password : str, optional
1707
+ Milvus password.
1571
1708
  **kwargs: Additional keyword arguments for customization.
1572
1709
  """
1573
1710
  client_config = ClientConfigSchema()
1574
1711
  nvidia_api_key = nvidia_api_key if nvidia_api_key else client_config.nvidia_api_key
1575
- # required for NVIDIAEmbedding call if the endpoint is Nvidia build api.
1576
1712
  embedding_endpoint = embedding_endpoint if embedding_endpoint else client_config.embedding_nim_endpoint
1577
1713
  model_name = model_name if model_name else client_config.embedding_nim_model_name
1578
1714
  # if not scheme we assume we are using grpc
@@ -1601,6 +1737,8 @@ def embed_index_collection(
1601
1737
  meta_dataframe=meta_dataframe,
1602
1738
  meta_source_field=meta_source_field,
1603
1739
  meta_fields=meta_fields,
1740
+ username=username,
1741
+ password=password,
1604
1742
  **kwargs,
1605
1743
  )
1606
1744
  # running in parts
@@ -1614,7 +1752,7 @@ def embed_index_collection(
1614
1752
  model_name,
1615
1753
  embedding_endpoint,
1616
1754
  nvidia_api_key,
1617
- intput_type,
1755
+ input_type,
1618
1756
  truncate,
1619
1757
  batch_size,
1620
1758
  grpc,
@@ -1632,7 +1770,7 @@ def embed_index_collection(
1632
1770
  model_name,
1633
1771
  embedding_endpoint,
1634
1772
  nvidia_api_key,
1635
- intput_type,
1773
+ input_type,
1636
1774
  truncate,
1637
1775
  batch_size,
1638
1776
  grpc,
@@ -1670,7 +1808,7 @@ def reindex_collection(
1670
1808
  compute_bm25_stats: bool = True,
1671
1809
  access_key: str = "minioadmin",
1672
1810
  secret_key: str = "minioadmin",
1673
- bucket_name: str = "a-bucket",
1811
+ bucket_name: str = None,
1674
1812
  meta_dataframe: Union[str, pd.DataFrame] = None,
1675
1813
  meta_source_field: str = None,
1676
1814
  meta_fields: list[str] = None,
@@ -1711,7 +1849,7 @@ def reindex_collection(
1711
1849
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1712
1850
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1713
1851
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1714
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "a-bucket".
1852
+ bucket_name (str, optional): The name of the MinIO bucket.
1715
1853
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1716
1854
  containing metadata. Defaults to None.
1717
1855
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1819,11 +1957,14 @@ class Milvus(VDB):
1819
1957
  compute_bm25_stats: bool = True,
1820
1958
  access_key: str = "minioadmin",
1821
1959
  secret_key: str = "minioadmin",
1822
- bucket_name: str = "a-bucket",
1960
+ bucket_name: str = None,
1823
1961
  meta_dataframe: Union[str, pd.DataFrame] = None,
1824
1962
  meta_source_field: str = None,
1825
1963
  meta_fields: list[str] = None,
1826
1964
  stream: bool = False,
1965
+ threshold: int = 1000,
1966
+ username: str = None,
1967
+ password: str = None,
1827
1968
  **kwargs,
1828
1969
  ):
1829
1970
  """
@@ -1847,15 +1988,17 @@ class Milvus(VDB):
1847
1988
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1848
1989
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1849
1990
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1850
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "a-bucket".
1991
+ bucket_name (str, optional): The name of the MinIO bucket.
1851
1992
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1852
1993
  containing metadata. Defaults to None.
1853
1994
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
1854
1995
  Defaults to None.
1855
1996
  meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
1856
- **kwargs: Additional keyword arguments for customization.
1857
1997
  stream (bool, optional): When true, the records will be inserted into milvus using the stream
1858
1998
  insert method.
1999
+ username (str, optional): The username for Milvus authentication. Defaults to None.
2000
+ password (str, optional): The password for Milvus authentication. Defaults to None.
2001
+ **kwargs: Additional keyword arguments for customization.
1859
2002
  """
1860
2003
  kwargs = locals().copy()
1861
2004
  kwargs.pop("self", None)
@@ -1885,6 +2028,8 @@ class Milvus(VDB):
1885
2028
  "gpu_index": self.__dict__.get("gpu_index", True),
1886
2029
  "gpu_search": self.__dict__.get("gpu_search", True),
1887
2030
  "dense_dim": self.__dict__.get("dense_dim", 2048),
2031
+ "username": self.__dict__.get("username", None),
2032
+ "password": self.__dict__.get("password", None),
1888
2033
  }
1889
2034
  return (self.collection_name, conn_dict)
1890
2035