nv-ingest-client 2025.8.18.dev20250818__py3-none-any.whl → 2025.8.20.dev20250820__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

@@ -94,8 +94,14 @@ def get_dataset_files(dataset_bytes: BytesIO, shuffle: bool = False) -> list:
94
94
  dataset_bytes.seek(0)
95
95
  dataset = json.load(dataset_bytes)
96
96
  sampled_files = dataset.get("sampled_files", [])
97
- if shuffle:
98
- random.shuffle(sampled_files)
97
+ if shuffle and len(sampled_files) > 1:
98
+ original = list(sampled_files)
99
+ # Create a shuffled copy without mutating the original list
100
+ shuffled = random.sample(sampled_files, k=len(sampled_files))
101
+ # Guard against seeded RNG or accidental identity by forcing a different order
102
+ if shuffled == original:
103
+ shuffled = shuffled[1:] + shuffled[:1]
104
+ return shuffled
99
105
  return sampled_files
100
106
  except json.JSONDecodeError as err:
101
107
  raise ValueError(f"{err}")
@@ -84,11 +84,10 @@ def create_nvingest_meta_schema():
84
84
 
85
85
  def create_meta_collection(
86
86
  schema: CollectionSchema,
87
- milvus_uri: str = "http://localhost:19530",
88
87
  collection_name: str = "meta",
89
88
  recreate=False,
89
+ client: MilvusClient = None,
90
90
  ):
91
- client = MilvusClient(milvus_uri)
92
91
  if client.has_collection(collection_name) and not recreate:
93
92
  # already exists, don't erase and recreate
94
93
  return
@@ -106,7 +105,6 @@ def create_meta_collection(
106
105
  def write_meta_collection(
107
106
  collection_name: str,
108
107
  fields: List[str],
109
- milvus_uri: str = "http://localhost:19530",
110
108
  creation_timestamp: str = None,
111
109
  dense_index: str = None,
112
110
  dense_dim: int = None,
@@ -114,6 +112,7 @@ def write_meta_collection(
114
112
  embedding_model: str = None,
115
113
  sparse_model: str = None,
116
114
  meta_collection_name: str = "meta",
115
+ client: MilvusClient = None,
117
116
  ):
118
117
  client_config = ClientConfigSchema()
119
118
  data = {
@@ -132,14 +131,12 @@ def write_meta_collection(
132
131
  },
133
132
  "user_fields": [field.name for field in fields],
134
133
  }
135
- client = MilvusClient(milvus_uri)
136
134
  client.insert(collection_name=meta_collection_name, data=data)
137
135
 
138
136
 
139
137
  def log_new_meta_collection(
140
138
  collection_name: str,
141
139
  fields: List[str],
142
- milvus_uri: str = "http://localhost:19530",
143
140
  creation_timestamp: str = None,
144
141
  dense_index: str = None,
145
142
  dense_dim: int = None,
@@ -148,13 +145,13 @@ def log_new_meta_collection(
148
145
  sparse_model: str = None,
149
146
  meta_collection_name: str = "meta",
150
147
  recreate: bool = False,
148
+ client: MilvusClient = None,
151
149
  ):
152
150
  schema = create_nvingest_meta_schema()
153
- create_meta_collection(schema, milvus_uri, recreate=recreate)
151
+ create_meta_collection(schema, client=client, recreate=recreate)
154
152
  write_meta_collection(
155
153
  collection_name,
156
154
  fields=fields,
157
- milvus_uri=milvus_uri,
158
155
  creation_timestamp=creation_timestamp,
159
156
  dense_index=dense_index,
160
157
  dense_dim=dense_dim,
@@ -162,6 +159,7 @@ def log_new_meta_collection(
162
159
  embedding_model=embedding_model,
163
160
  sparse_model=sparse_model,
164
161
  meta_collection_name=meta_collection_name,
162
+ client=client,
165
163
  )
166
164
 
167
165
 
@@ -171,12 +169,16 @@ def grab_meta_collection_info(
171
169
  timestamp: str = None,
172
170
  embedding_model: str = None,
173
171
  embedding_dim: int = None,
174
- milvus_uri: str = "http://localhost:19530",
172
+ client: MilvusClient = None,
173
+ milvus_uri: str = None,
174
+ username: str = None,
175
+ password: str = None,
175
176
  ):
176
177
  timestamp = timestamp or ""
177
178
  embedding_model = embedding_model or ""
178
179
  embedding_dim = embedding_dim or ""
179
- client = MilvusClient(milvus_uri)
180
+ if milvus_uri:
181
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
180
182
  results = client.query_iterator(
181
183
  collection_name=meta_collection_name,
182
184
  output_fields=[
@@ -404,6 +406,8 @@ def create_nvingest_collection(
404
406
  gpu_search: bool = False,
405
407
  dense_dim: int = 2048,
406
408
  recreate_meta: bool = False,
409
+ username: str = None,
410
+ password: str = None,
407
411
  ) -> CollectionSchema:
408
412
  """
409
413
  Creates a milvus collection with an nv-ingest compatible schema under
@@ -413,9 +417,7 @@ def create_nvingest_collection(
413
417
  ----------
414
418
  collection_name : str
415
419
  Name of the collection to be created.
416
- milvus_uri : str,
417
- Milvus address with http(s) preffix and port. Can also be a file path, to activate
418
- milvus-lite.
420
+
419
421
  sparse : bool, optional
420
422
  When set to true, this adds a Sparse index to the IndexParams, usually activated for
421
423
  hybrid search.
@@ -426,6 +428,11 @@ def create_nvingest_collection(
426
428
  If true, creates a GPU_CAGRA index for dense embeddings.
427
429
  dense_dim : int, optional
428
430
  Sets the dimension size for the dense embedding in the milvus schema.
431
+ username : str, optional
432
+ Milvus username.
433
+ password : str, optional
434
+ Milvus password.
435
+
429
436
 
430
437
  Returns
431
438
  -------
@@ -435,7 +442,7 @@ def create_nvingest_collection(
435
442
  """
436
443
  local_index = False
437
444
  if urlparse(milvus_uri).scheme:
438
- connections.connect(uri=milvus_uri)
445
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
439
446
  server_version = utility.get_server_version()
440
447
  if "lite" in server_version:
441
448
  gpu_index = False
@@ -444,7 +451,7 @@ def create_nvingest_collection(
444
451
  if milvus_uri.endswith(".db"):
445
452
  local_index = True
446
453
 
447
- client = MilvusClient(milvus_uri)
454
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
448
455
  schema = create_nvingest_schema(dense_dim=dense_dim, sparse=sparse, local_index=local_index)
449
456
  index_params = create_nvingest_index_params(
450
457
  sparse=sparse,
@@ -457,11 +464,11 @@ def create_nvingest_collection(
457
464
  log_new_meta_collection(
458
465
  collection_name,
459
466
  fields=schema.fields,
460
- milvus_uri=milvus_uri,
461
467
  dense_index=str(d_idx),
462
468
  dense_dim=dense_dim,
463
469
  sparse_index=str(s_idx),
464
470
  recreate=recreate_meta,
471
+ client=client,
465
472
  )
466
473
  return schema
467
474
 
@@ -744,6 +751,8 @@ def bulk_insert_milvus(
744
751
  access_key: str = "minioadmin",
745
752
  secret_key: str = "minioadmin",
746
753
  bucket_name: str = "nv-ingest",
754
+ username: str = None,
755
+ password: str = None,
747
756
  ):
748
757
  """
749
758
  This function initializes the bulk ingest of all minio uploaded records, and checks for
@@ -760,10 +769,14 @@ def bulk_insert_milvus(
760
769
  milvus_uri : str,
761
770
  Milvus address with http(s) prefix and port. Can also be a file path, to activate
762
771
  milvus-lite.
772
+ username : str, optional
773
+ Milvus username.
774
+ password : str, optional
775
+ Milvus password.
763
776
  """
764
777
  minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
765
778
 
766
- connections.connect(uri=milvus_uri)
779
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
767
780
  t_bulk_start = time.time()
768
781
  task_ids = []
769
782
  uploaded_files = []
@@ -913,6 +926,8 @@ def write_to_nvingest_collection(
913
926
  meta_source_field=None,
914
927
  meta_fields=None,
915
928
  stream: bool = False,
929
+ username: str = None,
930
+ password: str = None,
916
931
  **kwargs,
917
932
  ):
918
933
  """
@@ -953,9 +968,13 @@ def write_to_nvingest_collection(
953
968
  Minio bucket name.
954
969
  stream : bool, optional
955
970
  When true, the records will be inserted into milvus using the stream insert method.
971
+ username : str, optional
972
+ Milvus username.
973
+ password : str, optional
974
+ Milvus password.
956
975
  """
957
976
  local_index = False
958
- connections.connect(uri=milvus_uri)
977
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
959
978
  if urlparse(milvus_uri).scheme:
960
979
  server_version = utility.get_server_version()
961
980
  if "lite" in server_version:
@@ -978,7 +997,7 @@ def write_to_nvingest_collection(
978
997
  elif local_index and sparse:
979
998
  bm25_ef = BM25EmbeddingFunction(build_default_analyzer(language="en"))
980
999
  bm25_ef.load(bm25_save_path)
981
- client = MilvusClient(milvus_uri)
1000
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
982
1001
  schema = Collection(collection_name).schema
983
1002
  if isinstance(meta_dataframe, str):
984
1003
  meta_dataframe = pandas_file_reader(meta_dataframe)
@@ -1038,6 +1057,8 @@ def write_to_nvingest_collection(
1038
1057
  access_key,
1039
1058
  secret_key,
1040
1059
  bucket_name,
1060
+ username=username,
1061
+ password=password,
1041
1062
  )
1042
1063
  # fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
1043
1064
  client.refresh_load(collection_name)
@@ -1222,6 +1243,9 @@ def nvingest_retrieval(
1222
1243
  nv_ranker_max_batch_size: int = 64,
1223
1244
  _filter: str = "",
1224
1245
  ef_param: int = 200,
1246
+ client: MilvusClient = None,
1247
+ username: str = None,
1248
+ password: str = None,
1225
1249
  **kwargs,
1226
1250
  ):
1227
1251
  """
@@ -1268,6 +1292,12 @@ def nvingest_retrieval(
1268
1292
  Max size for the number of candidates to rerank.
1269
1293
  nv_ranker_top_k : int,
1270
1294
  The number of candidates to return after reranking.
1295
+ client : MilvusClient, optional
1296
+ Milvus client instance.
1297
+ username : str, optional
1298
+ Milvus username.
1299
+ password : str, optional
1300
+ Milvus password.
1271
1301
  Returns
1272
1302
  -------
1273
1303
  List
@@ -1289,7 +1319,7 @@ def nvingest_retrieval(
1289
1319
  model_name = model_name if model_name else client_config.embedding_nim_model_name
1290
1320
  local_index = False
1291
1321
  embed_model = NVIDIAEmbedding(base_url=embedding_endpoint, model=model_name, nvidia_api_key=nvidia_api_key)
1292
- client = MilvusClient(milvus_uri)
1322
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1293
1323
  final_top_k = top_k
1294
1324
  if nv_ranker:
1295
1325
  top_k = nv_ranker_top_k
@@ -1345,7 +1375,14 @@ def nvingest_retrieval(
1345
1375
  return results
1346
1376
 
1347
1377
 
1348
- def remove_records(source_name: str, collection_name: str, milvus_uri: str = "http://localhost:19530"):
1378
+ def remove_records(
1379
+ source_name: str,
1380
+ collection_name: str,
1381
+ milvus_uri: str = "http://localhost:19530",
1382
+ username: str = None,
1383
+ password: str = None,
1384
+ client: MilvusClient = None,
1385
+ ):
1349
1386
  """
1350
1387
  This function allows a user to remove chunks associated with an ingested file.
1351
1388
  Supply the full path of the file you would like to remove and this function will
@@ -1360,6 +1397,12 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
1360
1397
  milvus_uri : str,
1361
1398
  Milvus address with http(s) prefix and port. Can also be a file path, to activate
1362
1399
  milvus-lite.
1400
+ client : MilvusClient, optional
1401
+ Milvus client instance.
1402
+ username : str, optional
1403
+ Milvus username.
1404
+ password : str, optional
1405
+ Milvus password.
1363
1406
 
1364
1407
  Returns
1365
1408
  -------
@@ -1367,7 +1410,7 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
1367
1410
  Dictionary with one key, `delete_cnt`. The value represents the number of entities
1368
1411
  removed.
1369
1412
  """
1370
- client = MilvusClient(milvus_uri)
1413
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1371
1414
  result_ids = client.delete(
1372
1415
  collection_name=collection_name,
1373
1416
  filter=f'(source["source_name"] == "{source_name}")',
@@ -1474,6 +1517,9 @@ def pull_all_milvus(
1474
1517
  write_dir: str = None,
1475
1518
  batch_size: int = 1000,
1476
1519
  include_embeddings: bool = False,
1520
+ username: str = None,
1521
+ password: str = None,
1522
+ client: MilvusClient = None,
1477
1523
  ):
1478
1524
  """
1479
1525
  This function takes the input collection name and pulls all the records
@@ -1492,12 +1538,18 @@ def pull_all_milvus(
1492
1538
  The number of records to pull in each batch. Defaults to 1000.
1493
1539
  include_embeddings : bool, optional
1494
1540
  Whether to include the embeddings in the output. Defaults to False.
1541
+ username : str, optional
1542
+ Milvus username.
1543
+ password : str, optional
1544
+ Milvus password.
1545
+ client : MilvusClient, optional
1546
+ Milvus client instance.
1495
1547
  Returns
1496
1548
  -------
1497
1549
  List
1498
1550
  List of records/files with records from the collection.
1499
1551
  """
1500
- client = MilvusClient(milvus_uri)
1552
+ client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
1501
1553
  output_fields = ["source", "content_metadata", "text"]
1502
1554
  if include_embeddings:
1503
1555
  output_fields.append("vector")
@@ -1572,6 +1624,9 @@ def embed_index_collection(
1572
1624
  meta_fields: list[str] = None,
1573
1625
  intput_type: str = "passage",
1574
1626
  truncate: str = "END",
1627
+ client: MilvusClient = None,
1628
+ username: str = None,
1629
+ password: str = None,
1575
1630
  **kwargs,
1576
1631
  ):
1577
1632
  """
@@ -1609,6 +1664,12 @@ def embed_index_collection(
1609
1664
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
1610
1665
  Defaults to None.
1611
1666
  meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
1667
+ client : MilvusClient, optional
1668
+ Milvus client instance.
1669
+ username : str, optional
1670
+ Milvus username.
1671
+ password : str, optional
1672
+ Milvus password.
1612
1673
  **kwargs: Additional keyword arguments for customization.
1613
1674
  """
1614
1675
  client_config = ClientConfigSchema()
@@ -1642,6 +1703,8 @@ def embed_index_collection(
1642
1703
  meta_dataframe=meta_dataframe,
1643
1704
  meta_source_field=meta_source_field,
1644
1705
  meta_fields=meta_fields,
1706
+ username=username,
1707
+ password=password,
1645
1708
  **kwargs,
1646
1709
  )
1647
1710
  # running in parts
@@ -1866,6 +1929,8 @@ class Milvus(VDB):
1866
1929
  meta_fields: list[str] = None,
1867
1930
  stream: bool = False,
1868
1931
  threshold: int = 1000,
1932
+ username: str = None,
1933
+ password: str = None,
1869
1934
  **kwargs,
1870
1935
  ):
1871
1936
  """
@@ -1895,9 +1960,11 @@ class Milvus(VDB):
1895
1960
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
1896
1961
  Defaults to None.
1897
1962
  meta_fields (list[str], optional): A list of metadata fields to include. Defaults to None.
1898
- **kwargs: Additional keyword arguments for customization.
1899
1963
  stream (bool, optional): When true, the records will be inserted into milvus using the stream
1900
1964
  insert method.
1965
+ username (str, optional): The username for Milvus authentication. Defaults to None.
1966
+ password (str, optional): The password for Milvus authentication. Defaults to None.
1967
+ **kwargs: Additional keyword arguments for customization.
1901
1968
  """
1902
1969
  kwargs = locals().copy()
1903
1970
  kwargs.pop("self", None)
@@ -1927,6 +1994,8 @@ class Milvus(VDB):
1927
1994
  "gpu_index": self.__dict__.get("gpu_index", True),
1928
1995
  "gpu_search": self.__dict__.get("gpu_search", True),
1929
1996
  "dense_dim": self.__dict__.get("dense_dim", 2048),
1997
+ "username": self.__dict__.get("username", None),
1998
+ "password": self.__dict__.get("password", None),
1930
1999
  }
1931
2000
  return (self.collection_name, conn_dict)
1932
2001
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.8.18.dev20250818
3
+ Version: 2025.8.20.dev20250820
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -30,7 +30,7 @@ nv_ingest_client/primitives/tasks/task_factory.py,sha256=x8FXrhlgRYTxM0rLvsUvM8w
30
30
  nv_ingest_client/primitives/tasks/udf.py,sha256=5e_WJVgocnK-z0EGCEwPO_zG8WJEhuIsOUTjPmr8REY,12833
31
31
  nv_ingest_client/primitives/tasks/vdb_upload.py,sha256=mXOyQJfQfaoN96nntzevd0sKUs60-AHi8lc1jxG3DAw,1765
32
32
  nv_ingest_client/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
- nv_ingest_client/util/dataset.py,sha256=b6if_hM15iUJC4rvSHS0cmGBsSuZ3W-NoKDMTulx4b8,3316
33
+ nv_ingest_client/util/dataset.py,sha256=2yDPs47HNj8AOdOAfJL4XVji0BMRJq_NH8CG4s4xT-Q,3701
34
34
  nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPMoo,61
35
35
  nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
36
36
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
@@ -42,11 +42,11 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
42
42
  nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
43
43
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
44
44
  nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
45
- nv_ingest_client/util/vdb/milvus.py,sha256=SIUiW285lDFUXwJjes_58Y3c4pK51SHFqbn0QEqOmm4,75243
45
+ nv_ingest_client/util/vdb/milvus.py,sha256=5yjn9uZ0fB10RrJml0WdImsfvfcowDtwrPrl_oYnnF0,77436
46
46
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
47
- nv_ingest_client-2025.8.18.dev20250818.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
- nv_ingest_client-2025.8.18.dev20250818.dist-info/METADATA,sha256=7AoAmMB2B45WV5L1-nHJPTjMQDaO3fExvKWv-5xp6gg,30737
49
- nv_ingest_client-2025.8.18.dev20250818.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
- nv_ingest_client-2025.8.18.dev20250818.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
51
- nv_ingest_client-2025.8.18.dev20250818.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
52
- nv_ingest_client-2025.8.18.dev20250818.dist-info/RECORD,,
47
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/METADATA,sha256=54Czy3ATSEasGQ0SwUpgxKw1wErWaryTKkHw9-LYpcE,30737
49
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
51
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
52
+ nv_ingest_client-2025.8.20.dev20250820.dist-info/RECORD,,