nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38) hide show
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,8 @@ import logging
8
8
  import os
9
9
  import time
10
10
  import typing
11
+ import math
12
+ import heapq
11
13
  from typing import Dict
12
14
  from typing import List
13
15
 
@@ -33,6 +35,7 @@ class ClientConfigSchema:
33
35
  "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
34
36
  )
35
37
  self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
38
+ self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
36
39
 
37
40
 
38
41
  @unified_exception_handler
@@ -242,6 +245,43 @@ def generate_matching_files(file_sources):
242
245
  yield file_path
243
246
 
244
247
 
248
def balanced_groups_flat_order(
    file_paths,
    group_size=16,
    weight_fn=os.path.getsize,
):
    """
    Reorder files into weight-balanced bins and return them as one flat list.

    Files are sorted from heaviest to lightest and greedily placed, one at a
    time, into whichever not-yet-full bin currently has the smallest total
    weight. The bins are then ordered by total weight (largest first) and
    concatenated.

    Parameters
    ----------
    file_paths : iterable
        Items to distribute. Each item is passed to ``weight_fn`` and is also
        used as a tie-breaker when weights are equal, so items must be
        mutually comparable (e.g. all strings).
    group_size : int, optional
        Maximum number of items per bin. Must be positive. Default is 16.
    weight_fn : callable, optional
        Function returning the weight of one item. Defaults to
        ``os.path.getsize``, which raises ``OSError`` for paths that do not
        exist or cannot be accessed.

    Returns
    -------
    list
        All input items, grouped bin by bin. Note that a bin may hold fewer
        than ``group_size`` items, so slicing the result into fixed chunks of
        ``group_size`` does not necessarily reproduce the bins.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or an
    # IndexError from popping an empty heap (negative values).
    if group_size <= 0:
        raise ValueError(f"group_size must be positive, got {group_size!r}")

    # 1) Heaviest first; equal weights are ordered by path (ascending) so the
    #    result is deterministic.
    files = sorted(
        ((weight_fn(p), p) for p in file_paths),
        key=lambda weight_path: (-weight_path[0], weight_path[1]),
    )
    num_bins = math.ceil(len(files) / group_size)

    # 2) Bins plus a min-heap keyed on (current load, bin index). Only bins
    #    that still have spare capacity live in the heap.
    bins = [[] for _ in range(num_bins)]
    loads = [0] * num_bins
    heap = [(0, i) for i in range(num_bins)]
    heapq.heapify(heap)

    # 3) Place each file into the currently lightest bin that is not full.
    #    num_bins * group_size >= len(files), so the heap never runs dry.
    for size, path in files:
        _, i = heapq.heappop(heap)
        bins[i].append(path)
        loads[i] += size
        if len(bins[i]) < group_size:  # still has capacity
            heapq.heappush(heap, (loads[i], i))

    # 4) Order bins by cumulative weight (largest first; ties by higher bin
    #    index first), then flatten.
    sorted_bins = [bins[i] for _, i in sorted(zip(loads, range(num_bins)), reverse=True)]
    return [p for b in sorted_bins for p in b]
283
+
284
+
245
285
  def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
246
286
  """
247
287
  Create and job specifications (JobSpecs) for a batch of files.
@@ -310,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
310
350
  return job_specs
311
351
 
312
352
 
353
def apply_pdf_split_config_to_job_specs(job_specs: List["JobSpec"], pages_per_chunk: int) -> None:
    """
    Apply PDF split configuration to a list of JobSpec objects.

    Modifies job specs in-place by setting
    ``_extended_options["pdf_config"]["split_page_count"]`` for PDF files only.

    Parameters
    ----------
    job_specs : List[JobSpec]
        List of job specifications to potentially modify
    pages_per_chunk : int
        Number of pages per PDF chunk (will be stored as-is; server performs clamping)

    Notes
    -----
    - Only modifies job specs with document_type == "pdf" (case-insensitive)
    - Job specs whose document_type is unset (None or empty) are left untouched
    - An existing "pdf_config" dict is kept; only "split_page_count" is overwritten
    - Modifies job specs in-place and returns None
    - Safe to call on mixed document types (only PDFs are affected)
    """
    for job_spec in job_specs:
        # An unset document_type is treated as "not a PDF" rather than raising
        # AttributeError on None.lower().
        if (job_spec.document_type or "").lower() != "pdf":
            continue
        # Create the pdf_config dict on first use, then record the chunk size.
        pdf_config = job_spec._extended_options.setdefault("pdf_config", {})
        pdf_config["split_page_count"] = pages_per_chunk
377
+
378
+
313
379
  def filter_function_kwargs(func, **kwargs):
314
380
  """
315
381
  Filters and returns keyword arguments that match the parameters of a given function.