nv-ingest-client 2025.8.19.dev20250819__py3-none-any.whl → 2025.8.21.dev20250821__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/nv_ingest_cli.py +14 -2
- nv_ingest_client/primitives/tasks/caption.py +1 -1
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/vdb/milvus.py +6 -1
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/RECORD +10 -10
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/top_level.txt +0 -0
nv_ingest_client/nv_ingest_cli.py
CHANGED
|
@@ -32,6 +32,7 @@ from nv_ingest_client.client import NvIngestClient
|
|
|
32
32
|
from nv_ingest_client.util.dataset import get_dataset_files
|
|
33
33
|
from nv_ingest_client.util.dataset import get_dataset_statistics
|
|
34
34
|
from nv_ingest_client.util.system import ensure_directory_with_permissions
|
|
35
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
35
36
|
|
|
36
37
|
try:
|
|
37
38
|
NV_INGEST_VERSION = version("nv_ingest")
|
|
@@ -237,7 +238,9 @@ def main(
|
|
|
237
238
|
|
|
238
239
|
try:
|
|
239
240
|
configure_logging(logger, log_level)
|
|
240
|
-
|
|
241
|
+
# Sanitize CLI params before logging to avoid leaking secrets
|
|
242
|
+
_sanitized_params = sanitize_for_logging(dict(ctx.params))
|
|
243
|
+
logging.debug(f"nv-ingest-cli:params:\n{json.dumps(_sanitized_params, indent=2, default=repr)}")
|
|
241
244
|
|
|
242
245
|
docs = list(doc)
|
|
243
246
|
if dataset:
|
|
@@ -260,7 +263,16 @@ def main(
|
|
|
260
263
|
logger.info(_msg)
|
|
261
264
|
|
|
262
265
|
if not dry_run:
|
|
263
|
-
|
|
266
|
+
# Sanitize client kwargs (JSON string) before logging
|
|
267
|
+
try:
|
|
268
|
+
_client_kwargs_obj = json.loads(client_kwargs)
|
|
269
|
+
except Exception:
|
|
270
|
+
_client_kwargs_obj = {"raw": client_kwargs}
|
|
271
|
+
_sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
|
|
272
|
+
logging.debug(
|
|
273
|
+
f"Creating message client: {client_host} and port: {client_port} -> "
|
|
274
|
+
f"{json.dumps(_sanitized_client_kwargs, indent=2, default=repr)}"
|
|
275
|
+
)
|
|
264
276
|
|
|
265
277
|
if client_type == "rest":
|
|
266
278
|
client_allocator = RestClient
|
nv_ingest_client/primitives/tasks/caption.py
CHANGED
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
import logging
|
|
10
10
|
from typing import Dict
|
|
11
11
|
|
|
12
|
-
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
13
12
|
|
|
13
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
14
14
|
from .task_base import Task
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
nv_ingest_client/util/dataset.py
CHANGED
|
@@ -94,8 +94,14 @@ def get_dataset_files(dataset_bytes: BytesIO, shuffle: bool = False) -> list:
|
|
|
94
94
|
dataset_bytes.seek(0)
|
|
95
95
|
dataset = json.load(dataset_bytes)
|
|
96
96
|
sampled_files = dataset.get("sampled_files", [])
|
|
97
|
-
if shuffle:
|
|
98
|
-
|
|
97
|
+
if shuffle and len(sampled_files) > 1:
|
|
98
|
+
original = list(sampled_files)
|
|
99
|
+
# Create a shuffled copy without mutating the original list
|
|
100
|
+
shuffled = random.sample(sampled_files, k=len(sampled_files))
|
|
101
|
+
# Guard against seeded RNG or accidental identity by forcing a different order
|
|
102
|
+
if shuffled == original:
|
|
103
|
+
shuffled = shuffled[1:] + shuffled[:1]
|
|
104
|
+
return shuffled
|
|
99
105
|
return sampled_files
|
|
100
106
|
except json.JSONDecodeError as err:
|
|
101
107
|
raise ValueError(f"{err}")
|
nv_ingest_client/util/vdb/milvus.py
CHANGED
|
@@ -170,10 +170,15 @@ def grab_meta_collection_info(
|
|
|
170
170
|
embedding_model: str = None,
|
|
171
171
|
embedding_dim: int = None,
|
|
172
172
|
client: MilvusClient = None,
|
|
173
|
+
milvus_uri: str = None,
|
|
174
|
+
username: str = None,
|
|
175
|
+
password: str = None,
|
|
173
176
|
):
|
|
174
177
|
timestamp = timestamp or ""
|
|
175
178
|
embedding_model = embedding_model or ""
|
|
176
179
|
embedding_dim = embedding_dim or ""
|
|
180
|
+
if milvus_uri:
|
|
181
|
+
client = MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
177
182
|
results = client.query_iterator(
|
|
178
183
|
collection_name=meta_collection_name,
|
|
179
184
|
output_fields=[
|
|
@@ -771,7 +776,7 @@ def bulk_insert_milvus(
|
|
|
771
776
|
"""
|
|
772
777
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
773
778
|
|
|
774
|
-
connections.connect(uri=milvus_uri,
|
|
779
|
+
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
775
780
|
t_bulk_start = time.time()
|
|
776
781
|
task_ids = []
|
|
777
782
|
uploaded_files = []
|
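{nv_ingest_client-2025.8.19.dev20250819.dist-info → nv_ingest_client-2025.8.21.dev20250821.dist-info}/RECORD
CHANGED
|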
@@ -1,5 +1,5 @@
|
|
|
1
1
|
nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
2
|
-
nv_ingest_client/nv_ingest_cli.py,sha256=
|
|
2
|
+
nv_ingest_client/nv_ingest_cli.py,sha256=GG7x_fe423NHQRDmpNcTtNI2P_g1xgg9SQ5JjbdBAIU,13592
|
|
3
3
|
nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
4
4
|
nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
5
5
|
nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
|
|
@@ -15,7 +15,7 @@ nv_ingest_client/primitives/jobs/job_spec.py,sha256=NYT8K31b6p2v0zbIYugcARqJ8DTH
|
|
|
15
15
|
nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
|
|
16
16
|
nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
|
|
17
17
|
nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
|
|
18
|
-
nv_ingest_client/primitives/tasks/caption.py,sha256=
|
|
18
|
+
nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcYqtesS-HaZzeh4rI,2130
|
|
19
19
|
nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
|
|
20
20
|
nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
|
|
21
21
|
nv_ingest_client/primitives/tasks/embed.py,sha256=I6Irmvm1Qj9oqzDGSgfykCtfz8pz9LNxiXO-t29nXv8,5916
|
|
@@ -30,7 +30,7 @@ nv_ingest_client/primitives/tasks/task_factory.py,sha256=x8FXrhlgRYTxM0rLvsUvM8w
|
|
|
30
30
|
nv_ingest_client/primitives/tasks/udf.py,sha256=5e_WJVgocnK-z0EGCEwPO_zG8WJEhuIsOUTjPmr8REY,12833
|
|
31
31
|
nv_ingest_client/primitives/tasks/vdb_upload.py,sha256=mXOyQJfQfaoN96nntzevd0sKUs60-AHi8lc1jxG3DAw,1765
|
|
32
32
|
nv_ingest_client/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
-
nv_ingest_client/util/dataset.py,sha256=
|
|
33
|
+
nv_ingest_client/util/dataset.py,sha256=2yDPs47HNj8AOdOAfJL4XVji0BMRJq_NH8CG4s4xT-Q,3701
|
|
34
34
|
nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPMoo,61
|
|
35
35
|
nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
|
|
36
36
|
nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
|
|
@@ -42,11 +42,11 @@ nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
|
|
|
42
42
|
nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
|
|
43
43
|
nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
|
|
44
44
|
nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
|
|
45
|
-
nv_ingest_client/util/vdb/milvus.py,sha256=
|
|
45
|
+
nv_ingest_client/util/vdb/milvus.py,sha256=5yjn9uZ0fB10RrJml0WdImsfvfcowDtwrPrl_oYnnF0,77436
|
|
46
46
|
nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
|
|
47
|
-
nv_ingest_client-2025.8.
|
|
48
|
-
nv_ingest_client-2025.8.
|
|
49
|
-
nv_ingest_client-2025.8.
|
|
50
|
-
nv_ingest_client-2025.8.
|
|
51
|
-
nv_ingest_client-2025.8.
|
|
52
|
-
nv_ingest_client-2025.8.
|
|
47
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
48
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/METADATA,sha256=NHS_9MLL0jBbpb1bvSD2b0jF2jrmicXTbJep_gd2v2U,30737
|
|
49
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
50
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
|
|
51
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
|
|
52
|
+
nv_ingest_client-2025.8.21.dev20250821.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|