PyPI - nv-ingest-client - Versions diffs - 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl - Mend

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38) hide show

nv_ingest_client/cli/util/click.py +182 -30
nv_ingest_client/cli/util/processing.py +0 -393
nv_ingest_client/client/client.py +561 -207
nv_ingest_client/client/ingest_job_handler.py +412 -0
nv_ingest_client/client/interface.py +466 -59
nv_ingest_client/client/util/processing.py +11 -1
nv_ingest_client/nv_ingest_cli.py +58 -6
nv_ingest_client/primitives/jobs/job_spec.py +32 -10
nv_ingest_client/primitives/tasks/__init__.py +6 -4
nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
nv_ingest_client/primitives/tasks/caption.py +10 -16
nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
nv_ingest_client/primitives/tasks/dedup.py +12 -21
nv_ingest_client/primitives/tasks/embed.py +37 -76
nv_ingest_client/primitives/tasks/extract.py +68 -169
nv_ingest_client/primitives/tasks/filter.py +22 -28
nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
nv_ingest_client/primitives/tasks/split.py +17 -18
nv_ingest_client/primitives/tasks/store.py +29 -29
nv_ingest_client/primitives/tasks/task_base.py +1 -72
nv_ingest_client/primitives/tasks/task_factory.py +10 -11
nv_ingest_client/primitives/tasks/udf.py +349 -0
nv_ingest_client/util/dataset.py +8 -2
nv_ingest_client/util/document_analysis.py +314 -0
nv_ingest_client/util/image_disk_utils.py +300 -0
nv_ingest_client/util/transport.py +12 -6
nv_ingest_client/util/util.py +66 -0
nv_ingest_client/util/vdb/milvus.py +220 -75
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
nv_ingest_client/cli/util/tasks.py +0 -3
nv_ingest_client/primitives/exceptions.py +0 -0
nv_ingest_client/primitives/tasks/transform.py +0 -0
nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0

nv_ingest_client/util/util.py CHANGED Viewed

@@ -8,6 +8,8 @@ import logging
 import os
 import time
 import typing
+import math
+import heapq
 from typing import Dict
 from typing import List
@@ -33,6 +35,7 @@ class ClientConfigSchema:
             "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
         )
         self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+        self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
 @unified_exception_handler
@@ -242,6 +245,43 @@ def generate_matching_files(file_sources):
                 yield file_path
def balanced_groups_flat_order(
    file_paths,
    group_size=16,
    weight_fn=os.path.getsize,
):
    """
    Reorder files into weight-balanced groups and return them as one flat list.

    Every path is weighed with ``weight_fn``. Paths are then assigned, heaviest
    first, to whichever group that still has room currently carries the smallest
    cumulative weight. Finally the groups are emitted from heaviest to lightest
    cumulative weight and concatenated.

    Parameters
    ----------
    file_paths : Iterable[str]
        Paths to order. The iterable is consumed exactly once.
    group_size : int, optional
        Maximum number of paths per group. Must be greater than zero.
        Defaults to 16.
    weight_fn : Callable[[str], float], optional
        Function returning the weight of a path. Defaults to ``os.path.getsize``.

    Returns
    -------
    List[str]
        All input paths, grouped and flattened. Empty if ``file_paths`` is empty.

    Raises
    ------
    ValueError
        If ``group_size`` is zero or negative.
    """
    # A zero size used to surface as ZeroDivisionError and a negative size as an
    # IndexError from popping an empty heap; fail early with a clear message.
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive number, got {group_size!r}")

    # 1) Weigh every path; order by weight (descending), then path (ascending) for ties.
    weighted = sorted(
        ((weight_fn(path), path) for path in file_paths),
        key=lambda weight_path: (-weight_path[0], weight_path[1]),
    )
    if not weighted:
        return []

    num_bins = math.ceil(len(weighted) / group_size)

    # 2) One bin per group, plus a min-heap of (load, bin index) over bins with room left.
    bins = [[] for _ in range(num_bins)]
    loads = [0] * num_bins
    heap = [(0, index) for index in range(num_bins)]
    heapq.heapify(heap)

    # 3) Place the heaviest remaining path into the currently lightest open bin.
    #    Total capacity (num_bins * group_size) covers every path, so the heap
    #    cannot run empty before the loop ends.
    for weight, path in weighted:
        _, index = heapq.heappop(heap)
        bins[index].append(path)
        loads[index] += weight
        if len(bins[index]) < group_size:  # still has capacity
            heapq.heappush(heap, (loads[index], index))

    # 4) Emit bins by cumulative weight, largest first (ties: higher index first), flattened.
    bin_order = sorted(range(num_bins), key=lambda index: (loads[index], index), reverse=True)
    return [path for index in bin_order for path in bins[index]]
 def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     """
     Create and job specifications (JobSpecs) for a batch of files.
@@ -310,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     return job_specs
def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
    """
    Record a PDF split page count on every PDF job spec in a list.

    Each job spec whose ``document_type`` equals "pdf" (compared case-insensitively)
    gets ``pages_per_chunk`` written to
    ``_extended_options["pdf_config"]["split_page_count"]``. The ``pdf_config``
    dictionary is created when absent; any other keys already in it are kept.

    Parameters
    ----------
    job_specs : List[JobSpec]
        Job specifications to inspect. Matching entries are modified in place.
    pages_per_chunk : int
        Number of pages per PDF chunk. Stored exactly as given; this function does
        no validation or clamping (that is expected to happen server-side).

    Notes
    -----
    - Job specs of any other document type are left untouched, so mixed lists are fine.
    - Nothing is returned; all changes are in-place mutations of the given job specs.
    """
    for spec in job_specs:
        # Guard clause: only PDFs carry a split configuration.
        if spec.document_type.lower() != "pdf":
            continue
        pdf_config = spec._extended_options.setdefault("pdf_config", {})
        pdf_config["split_page_count"] = pages_per_chunk
 def filter_function_kwargs(func, **kwargs):
     """
     Filters and returns keyword arguments that match the parameters of a given function.

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

Potentially problematic release.

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl