nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +511 -205
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +137 -24
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +28 -4
- nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- nv_ingest_client/primitives/tasks/embed.py +16 -0
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- nv_ingest_client/primitives/tasks/udf.py +24 -27
- nv_ingest_client/util/document_analysis.py +1 -1
- nv_ingest_client/util/util.py +26 -0
- nv_ingest_client/util/vdb/milvus.py +12 -9
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/RECORD +21 -20
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/util/util.py
CHANGED
|
@@ -350,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
|
|
|
350
350
|
return job_specs
|
|
351
351
|
|
|
352
352
|
|
|
353
|
+
def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
    """
    Record a PDF page-split size on every PDF job spec in a batch.

    Each job spec whose ``document_type`` equals ``"pdf"`` (compared
    case-insensitively) gets ``pdf_config["split_page_count"]`` written into
    its extended options. Job specs of any other document type are left
    untouched, so the function can be called on a mixed batch.

    Parameters
    ----------
    job_specs : List[JobSpec]
        Job specifications to inspect; matching entries are mutated in-place.
    pages_per_chunk : int
        Number of pages per PDF chunk. The value is stored unchanged; no
        validation or clamping happens here (the upstream docstring states
        that the server performs clamping).

    Returns
    -------
    None
        The job specs are updated in-place.
    """
    for spec in job_specs:
        # Guard clause: anything that is not a PDF is skipped entirely.
        if spec.document_type.lower() != "pdf":
            continue

        # Reuse an existing pdf_config mapping so that other keys already
        # stored in it survive; create an empty one only when it is absent.
        pdf_config = spec._extended_options.setdefault("pdf_config", {})
        pdf_config["split_page_count"] = pages_per_chunk
|
|
377
|
+
|
|
378
|
+
|
|
353
379
|
def filter_function_kwargs(func, **kwargs):
|
|
354
380
|
"""
|
|
355
381
|
Filters and returns keyword arguments that match the parameters of a given function.
|
|
@@ -776,13 +776,13 @@ def bulk_insert_milvus(
|
|
|
776
776
|
t_bulk_start = time.time()
|
|
777
777
|
task_ids = []
|
|
778
778
|
|
|
779
|
-
|
|
780
|
-
utility.do_bulk_insert(
|
|
779
|
+
for files in writer.batch_files:
|
|
780
|
+
task_id = utility.do_bulk_insert(
|
|
781
781
|
collection_name=collection_name,
|
|
782
|
-
files=
|
|
782
|
+
files=files,
|
|
783
783
|
consistency_level=CONSISTENCY,
|
|
784
784
|
)
|
|
785
|
-
|
|
785
|
+
task_ids.append(task_id)
|
|
786
786
|
|
|
787
787
|
while len(task_ids) > 0:
|
|
788
788
|
time.sleep(1)
|
|
@@ -917,7 +917,9 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
|
|
|
917
917
|
break
|
|
918
918
|
# check if indexed_rows is staying the same, too many times means something is wrong
|
|
919
919
|
if new_indexed_rows == indexed_rows:
|
|
920
|
-
pos_movement
|
|
920
|
+
pos_movement -= 1
|
|
921
|
+
else:
|
|
922
|
+
pos_movement = 10
|
|
921
923
|
# if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
|
|
922
924
|
if pos_movement == 0:
|
|
923
925
|
raise ValueError("Rows are not getting indexed as expected")
|
|
@@ -1046,9 +1048,10 @@ def write_to_nvingest_collection(
|
|
|
1046
1048
|
client,
|
|
1047
1049
|
collection_name,
|
|
1048
1050
|
)
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1051
|
+
if not local_index:
|
|
1052
|
+
# Make sure all rows are indexed, decided not to wrap in a timeout because we dont
|
|
1053
|
+
# know how long this should take, it is num_elements dependent.
|
|
1054
|
+
wait_for_index(collection_name, num_elements, client)
|
|
1052
1055
|
else:
|
|
1053
1056
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1054
1057
|
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
@@ -1349,7 +1352,7 @@ def nvingest_retrieval(
|
|
|
1349
1352
|
nvidia_api_key=nvidia_api_key,
|
|
1350
1353
|
input_type="query",
|
|
1351
1354
|
output_names=["embeddings"],
|
|
1352
|
-
grpc=not (urlparse(embedding_endpoint).scheme
|
|
1355
|
+
grpc=not ("http" in urlparse(embedding_endpoint).scheme),
|
|
1353
1356
|
)
|
|
1354
1357
|
client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
|
|
1355
1358
|
final_top_k = top_k
|
|
@@ -1,54 +1,55 @@
|
|
|
1
1
|
nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
2
|
-
nv_ingest_client/nv_ingest_cli.py,sha256=
|
|
2
|
+
nv_ingest_client/nv_ingest_cli.py,sha256=84fc0-6TUe-0BMasRIiRH4okfjno4AKCaKvUwJEZ45k,14457
|
|
3
3
|
nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
4
4
|
nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
5
5
|
nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
|
|
6
|
-
nv_ingest_client/cli/util/processing.py,sha256=
|
|
6
|
+
nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
|
|
7
7
|
nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
|
|
8
8
|
nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
|
|
9
|
-
nv_ingest_client/client/client.py,sha256=
|
|
10
|
-
nv_ingest_client/client/
|
|
11
|
-
nv_ingest_client/client/
|
|
9
|
+
nv_ingest_client/client/client.py,sha256=WH2KRuaqoRm0qe3XAomAJQUetCDXp84xqcsMdumICbk,77505
|
|
10
|
+
nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
|
|
11
|
+
nv_ingest_client/client/interface.py,sha256=vmRdooNkaMVBv6RSxcgMYHfmMs0E3ZBnyrp5mmmhCOI,51247
|
|
12
|
+
nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
|
|
12
13
|
nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
|
|
13
14
|
nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
|
|
14
|
-
nv_ingest_client/primitives/jobs/job_spec.py,sha256=
|
|
15
|
+
nv_ingest_client/primitives/jobs/job_spec.py,sha256=teAZbpvxn25jIEUP5YJsAX_E_z9iWhejS-uy5opshFM,15681
|
|
15
16
|
nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
|
|
16
17
|
nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
|
|
17
18
|
nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
|
|
18
19
|
nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcYqtesS-HaZzeh4rI,2130
|
|
19
20
|
nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
|
|
20
21
|
nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
|
|
21
|
-
nv_ingest_client/primitives/tasks/embed.py,sha256=
|
|
22
|
-
nv_ingest_client/primitives/tasks/extract.py,sha256=
|
|
23
|
-
nv_ingest_client/primitives/tasks/filter.py,sha256=
|
|
22
|
+
nv_ingest_client/primitives/tasks/embed.py,sha256=YFnymU1UWID2gSrz1anlaL_SRMmDr3dNTeZv2UDu9kQ,6739
|
|
23
|
+
nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
|
|
24
|
+
nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
|
|
24
25
|
nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
|
|
25
26
|
nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
|
|
26
27
|
nv_ingest_client/primitives/tasks/store.py,sha256=nIOnCH8vw4FLCLVBJYnsS5Unc0QmuO_jEtUp7-E9FU4,4199
|
|
27
28
|
nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
|
|
28
29
|
nv_ingest_client/primitives/tasks/task_base.py,sha256=Mrx6kgePJHolYd3Im6mVISXcVgdulLst2MYG5gPov9I,1687
|
|
29
|
-
nv_ingest_client/primitives/tasks/task_factory.py,sha256=
|
|
30
|
-
nv_ingest_client/primitives/tasks/udf.py,sha256=
|
|
30
|
+
nv_ingest_client/primitives/tasks/task_factory.py,sha256=uvGQXjgWmeF015jPWmBhiclzfrUf3_yD2PPeirQBczM,3218
|
|
31
|
+
nv_ingest_client/primitives/tasks/udf.py,sha256=GZgckhrWSTIQMYLkw4R4XFtx2YeUesAJI22LsNwvBjc,12773
|
|
31
32
|
nv_ingest_client/primitives/tasks/vdb_upload.py,sha256=mXOyQJfQfaoN96nntzevd0sKUs60-AHi8lc1jxG3DAw,1765
|
|
32
33
|
nv_ingest_client/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
34
|
nv_ingest_client/util/dataset.py,sha256=2yDPs47HNj8AOdOAfJL4XVji0BMRJq_NH8CG4s4xT-Q,3701
|
|
34
|
-
nv_ingest_client/util/document_analysis.py,sha256=
|
|
35
|
+
nv_ingest_client/util/document_analysis.py,sha256=T4olsfjwm4BZmT9xXT8M8RWKhdCPSASsDpzQmJDflts,10569
|
|
35
36
|
nv_ingest_client/util/image_disk_utils.py,sha256=M-lSRBvNlOMm20uiYygQ0Oh4GMKspih7G03rKNRzOSE,11507
|
|
36
37
|
nv_ingest_client/util/milvus.py,sha256=MwBix_UBg54i7xONBIwjcqeKSBkqunxBJBK2f0bPMoo,61
|
|
37
38
|
nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywkcLuNieozvPWvo0,3785
|
|
38
39
|
nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
|
|
39
40
|
nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
|
|
40
41
|
nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
|
|
41
|
-
nv_ingest_client/util/util.py,sha256=
|
|
42
|
+
nv_ingest_client/util/util.py,sha256=qwJ4MqF8w4-lws76z8iz1V0Hz_ebDYN8yAKyJPGuHuU,15828
|
|
42
43
|
nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
|
|
43
44
|
nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
45
|
nv_ingest_client/util/file_processing/extract.py,sha256=uXEATBYZXjxdymGTNQvvzDD2eHgpuq4PdU6HsMl0Lp0,4662
|
|
45
46
|
nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
|
|
46
47
|
nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
|
|
47
|
-
nv_ingest_client/util/vdb/milvus.py,sha256=
|
|
48
|
+
nv_ingest_client/util/vdb/milvus.py,sha256=6XWRh2SDJlgVZOKZVXG3cZTB4L-ZHIiiTenuIzkxp2Y,78704
|
|
48
49
|
nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
|
|
49
|
-
nv_ingest_client-2025.
|
|
50
|
-
nv_ingest_client-2025.
|
|
51
|
-
nv_ingest_client-2025.
|
|
52
|
-
nv_ingest_client-2025.
|
|
53
|
-
nv_ingest_client-2025.
|
|
54
|
-
nv_ingest_client-2025.
|
|
50
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
51
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/METADATA,sha256=5WbspmKFTwC952iUCOqw5Wt07eWhsY2XwgdKl2DwbzE,30626
|
|
52
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
53
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
|
|
54
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
|
|
55
|
+
nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|