nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/util/util.py
CHANGED
|
@@ -8,6 +8,8 @@ import logging
|
|
|
8
8
|
import os
|
|
9
9
|
import time
|
|
10
10
|
import typing
|
|
11
|
+
import math
|
|
12
|
+
import heapq
|
|
11
13
|
from typing import Dict
|
|
12
14
|
from typing import List
|
|
13
15
|
|
|
@@ -33,6 +35,7 @@ class ClientConfigSchema:
|
|
|
33
35
|
"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
|
|
34
36
|
)
|
|
35
37
|
self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
|
|
38
|
+
self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
@unified_exception_handler
|
|
@@ -242,6 +245,43 @@ def generate_matching_files(file_sources):
|
|
|
242
245
|
yield file_path
|
|
243
246
|
|
|
244
247
|
|
|
248
|
+
def balanced_groups_flat_order(
    file_paths,
    group_size=16,
    weight_fn=os.path.getsize,
):
    """
    Reorder files so that consecutive groups of ``group_size`` have similar total weight.

    Files are weighed with ``weight_fn``, sorted heaviest-first (ties broken by
    ascending path), and then placed one at a time into whichever group currently
    has the smallest total weight and still has room. The groups are finally
    ordered by total weight (heaviest group first) and concatenated.

    Parameters
    ----------
    file_paths : iterable
        Items to order. Each item is passed to ``weight_fn`` and is also used as
        the tie-breaker when two weights are equal, so items must be mutually
        comparable.
    group_size : int, optional
        Maximum number of items per group. Must be positive. Defaults to 16.
    weight_fn : callable, optional
        Maps an item to a numeric weight. Defaults to ``os.path.getsize``.

    Returns
    -------
    list
        All input items as a flat list, laid out group after group. Every group
        except possibly some of the later ones holds ``group_size`` items.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive.
    """
    # Reject sizes that would otherwise surface as ZeroDivisionError (0) or as an
    # IndexError from popping an empty heap (negative values).
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive integer, got {group_size!r}")

    # 1) Weigh every file; order by weight (descending), then by path (ascending).
    def sort_key(weight_path_tuple):
        weight, path = weight_path_tuple
        return (-weight, path)

    files = sorted(((weight_fn(p), p) for p in file_paths), key=sort_key)
    num_bins = math.ceil(len(files) / group_size)

    # 2) Bins plus a min-heap keyed on current load. Only bins that still have
    #    capacity are kept in the heap.
    bins = [[] for _ in range(num_bins)]
    loads = [0] * num_bins
    counts = [0] * num_bins
    heap = [(0, i) for i in range(num_bins)]
    heapq.heapify(heap)

    # 3) Place the heaviest remaining file into the currently lightest open bin.
    #    Total capacity (num_bins * group_size) covers every file, so the heap
    #    cannot run empty before the files do.
    for size, path in files:
        _, i = heapq.heappop(heap)
        bins[i].append(path)
        loads[i] += size
        counts[i] += 1
        if counts[i] < group_size:  # still has capacity
            heapq.heappush(heap, (loads[i], i))

    # 4) Order bins by cumulative weight (largest first; ties resolved by the
    #    higher bin index first), then flatten.
    sorted_bins = [bins[i] for _, i in sorted(zip(loads, range(num_bins)), reverse=True)]
    return [p for b in sorted_bins for p in b]
|
|
283
|
+
|
|
284
|
+
|
|
245
285
|
def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
|
|
246
286
|
"""
|
|
247
287
|
Create job specifications (JobSpecs) for a batch of files.
|
|
@@ -310,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
|
|
|
310
350
|
return job_specs
|
|
311
351
|
|
|
312
352
|
|
|
353
|
+
def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
    """
    Set the PDF split page count on every PDF job spec in a list.

    Each job spec whose ``document_type`` is "pdf" (compared case-insensitively)
    gets ``pages_per_chunk`` written to ``extended_options["pdf_config"]["split_page_count"]``.
    The ``pdf_config`` entry is created if it is missing; other keys already in it
    are left alone. Job specs of any other document type are skipped.

    Parameters
    ----------
    job_specs : List[JobSpec]
        Job specifications to inspect; matching ones are modified in-place.
    pages_per_chunk : int
        Number of pages per PDF chunk. The value is stored as-is (clamping is
        left to the server).

    Returns
    -------
    None
        The job specs are mutated in-place; nothing is returned.
    """
    for spec in job_specs:
        # Non-PDF documents carry no split configuration.
        if spec.document_type.lower() != "pdf":
            continue

        options = spec._extended_options
        if "pdf_config" not in options:
            options["pdf_config"] = {}
        options["pdf_config"]["split_page_count"] = pages_per_chunk
|
|
377
|
+
|
|
378
|
+
|
|
313
379
|
def filter_function_kwargs(func, **kwargs):
|
|
314
380
|
"""
|
|
315
381
|
Filters and returns keyword arguments that match the parameters of a given function.
|