nv-ingest-client 2025.8.19.dev20250819__py3-none-any.whl → 2025.8.21.dev20250821__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

@@ -32,6 +32,7 @@ from nv_ingest_client.client import NvIngestClient
32
32
  from nv_ingest_client.util.dataset import get_dataset_files
33
33
  from nv_ingest_client.util.dataset import get_dataset_statistics
34
34
  from nv_ingest_client.util.system import ensure_directory_with_permissions
35
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
35
36
 
36
37
  try:
37
38
  NV_INGEST_VERSION = version("nv_ingest")
@@ -237,7 +238,9 @@ def main(
237
238
 
238
239
  try:
239
240
  configure_logging(logger, log_level)
240
- logging.debug(f"nv-ingest-cli:params:\n{json.dumps(ctx.params, indent=2, default=repr)}")
241
+ # Sanitize CLI params before logging to avoid leaking secrets
242
+ _sanitized_params = sanitize_for_logging(dict(ctx.params))
243
+ logging.debug(f"nv-ingest-cli:params:\n{json.dumps(_sanitized_params, indent=2, default=repr)}")
241
244
 
242
245
  docs = list(doc)
243
246
  if dataset:
@@ -260,7 +263,16 @@ def main(
260
263
  logger.info(_msg)
261
264
 
262
265
  if not dry_run:
263
- logging.debug(f"Creating message client: {client_host} and port: {client_port} -> {client_kwargs}")
266
+ # Sanitize client kwargs (JSON string) before logging
267
+ try:
268
+ _client_kwargs_obj = json.loads(client_kwargs)
269
+ except Exception:
270
+ _client_kwargs_obj = {"raw": client_kwargs}
271
+ _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
272
+ logging.debug(
273
+ f"Creating message client: {client_host} and port: {client_port} -> "
274
+ f"{json.dumps(_sanitized_client_kwargs, indent=2, default=repr)}"
275
+ )
264
276
 
265
277
  if client_type == "rest":
266
278
  client_allocator = RestClient
@@ -9,8 +9,8 @@
9
9
  import logging
10
10
  from typing import Dict
11
11
 
12
- from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
13
12
 
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
14
14
  from .task_base import Task
15
15
 
16
16
  logger = logging.getLogger(__name__)
@@ -94,8 +94,14 @@ def get_dataset_files(dataset_bytes: BytesIO, shuffle: bool = False) -> list:
94
94
  dataset_bytes.seek(0)
95
95
  dataset = json.load(dataset_bytes)
96
96
  sampled_files = dataset.get("sampled_files", [])
97
- if shuffle:
98
- random.shuffle(sampled_files)
97
+ if shuffle and len(sampled_files) > 1:
98
+ original = list(sampled_files)
99
+ # Create a shuffled copy without mutating the original list
100
+ shuffled = random.sample(sampled_files, k=len(sampled_files))
101
+ # Guard against seeded RNG or accidental identity by forcing a different order
102
+ if shuffled == original:
103
+ shuffled = shuffled[1:] + shuffled[:1]
104
+ return shuffled
99
105
  return sampled_files
100
106
  except json.JSONDecodeError as err:
101
107
  raise ValueError(f"{err}")
@@ -170,10 +170,15 @@ def grab_meta_collection_info(
170
170
  embedding_model: str = None,
171
171
  embedding_dim: int = None,
172
172
  client: MilvusClient = None,
173
+ milvus_uri: str = None,
174
+ username: str = None,
175
+ password: str = None,
173
176
  ):
174
177
  timestamp = timestamp or ""
175
178
  embedding_model = embedding_model or ""
176
179
  embedding_dim = embedding_dim or ""
180
+ if milvus_uri:
181
+ client = MilvusClient(milvus_uri, token=f"{username}:{password}")
177
182
  results = client.query_iterator(
178
183
  collection_name=meta_collection_name,
179
184
  output_fields=[
@@ -771,7 +776,7 @@ def bulk_insert_milvus(
771
776
  """
772
777
  minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
773
778
 
774
- connections.connect(uri=milvus_uri, username=username, password=password)
779
+ connections.connect(uri=milvus_uri, token=f"{username}:{password}")
775
780
  t_bulk_start = time.time()
776
781
  task_ids = []
777
782
  uploaded_files = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.8.19.dev20250819
3
+ Version: 2025.8.21.dev20250821
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -1,5 +1,5 @@
1
1
  nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
2
- nv_ingest_client/nv_ingest_cli.py,sha256=uonsSDRSKXqUl5meY2u_FMgUjgDCOkDRPDzCpAIIM6I,12966
2
+ nv_ingest_client/nv_ingest_cli.py,sha256=GG7x_fe423NHQRDmpNcTtNI2P_g1xgg9SQ5JjbdBAIU,13592
3
3
  nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
4
4
  nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
5
5
  nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
@@ -15,7 +15,7 @@ nv_ingest_client/primitives/jobs/job_spec.py,sha256=NYT8K31b6p2v0zbIYugcARqJ8DTH
15
15
  nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
16
16
  nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
17
17
  nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
18
- nv_ingest_client/primitives/tasks/caption.py,sha256=J8sMIYujPb-ysWj1w3TXPSBLCnhHns_z4tZjzhDOQIs,2130
18
+ nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcYqtesS-HaZzeh4rI,2130
19
19
  nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
20
20
  nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
21
21
  nv_ingest_client/primitives/tasks/embed.py,sha256=I6Irmvm1Qj9oqzDGSgfykCtfz8pz9LNxiXO-t29nXv8,5916
@@ -30,7 +30,7 @@ nv_ingest_client/primitives/tasks/task_factory.py,sha256=x8FXrhlgRYTxM0rLvsUvM8w
30
30
  nv_ingest_client/primitives/tasks/udf.py,sha256=5e_WJVgocnK-z0EGCEwPO_zG8WJEhuIsOUTjPmr8REY,12833
31
31
  nv_ingest_client/primitives/tasks/vdb_upload.py,sha256=mXOyQJfQfaoN96nntzevd0sKUs60-AHi8lc1jxG3DAw,1765
32
32
  nv_ingest_client/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
- nv_ingest_client/util/dataset.py,sha256=b6if_hM15iUJC4rvSHS0cmGBsSuZ3W-NoKDMTulx4b8,3316
33
+ nv_ingest_client/util/dataset.py,sha256=2yDPs47HNj8AOdOAfJL4XVji0BMRJq_NH8CG4s4xT-Q,3701
34
34
  nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPMoo,61
35
35
  nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
36
36
  nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
@@ -42,11 +42,11 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
42
42
  nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
43
43
  nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
44
44
  nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
45
- nv_ingest_client/util/vdb/milvus.py,sha256=PC3qXjrdTab2xVS3FZkhj_28T5R9DNaHZ8a7D721Pik,77269
45
+ nv_ingest_client/util/vdb/milvus.py,sha256=5yjn9uZ0fB10RrJml0WdImsfvfcowDtwrPrl_oYnnF0,77436
46
46
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
47
- nv_ingest_client-2025.8.19.dev20250819.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
- nv_ingest_client-2025.8.19.dev20250819.dist-info/METADATA,sha256=QYd4COuKD4YRil1snCzCeQpUBEuSzpnb5wGVGW__VMk,30737
49
- nv_ingest_client-2025.8.19.dev20250819.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
- nv_ingest_client-2025.8.19.dev20250819.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
51
- nv_ingest_client-2025.8.19.dev20250819.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
52
- nv_ingest_client-2025.8.19.dev20250819.dist-info/RECORD,,
47
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/METADATA,sha256=NHS_9MLL0jBbpb1bvSD2b0jF2jrmicXTbJep_gd2v2U,30737
49
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
51
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
52
+ nv_ingest_client-2025.8.21.dev20250821.dist-info/RECORD,,