nv-ingest-client 2025.10.14.dev20251014__tar.gz → 2025.10.16.dev20251016__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic.
- {nv_ingest_client-2025.10.14.dev20251014/src/nv_ingest_client.egg-info → nv_ingest_client-2025.10.16.dev20251016}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/client/client.py +159 -220
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/client/ingest_job_handler.py +6 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/client/interface.py +39 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/nv_ingest_cli.py +22 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/document_analysis.py +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/util.py +26 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/vdb/milvus.py +8 -5
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/LICENSE +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/README.md +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.14.dev20251014 → nv_ingest_client-2025.10.16.dev20251016}/src/version.py +0 -0
src/nv_ingest_client/client/client.py (+159 -220)

@@ -9,7 +9,6 @@ import json
 import logging
 import math
 import os
-import random
 import time
 import threading
 import copy

@@ -36,7 +35,11 @@ from nv_ingest_client.primitives.tasks import TaskType
 from nv_ingest_client.primitives.tasks import is_valid_task_type
 from nv_ingest_client.primitives.tasks import task_factory
 from nv_ingest_client.util.processing import handle_future_result, IngestJobFailure
-from nv_ingest_client.util.util import ...
+from nv_ingest_client.util.util import (
+    create_job_specs_for_batch,
+    check_ingest_result,
+    apply_pdf_split_config_to_job_specs,
+)

 logger = logging.getLogger(__name__)

@@ -61,15 +64,12 @@ class DataDecodeException(Exception):

 class _ConcurrentProcessor:
     """
-    Manages ...
-
-
-
-
-
-    they become available within the batch using `as_completed`. Retries due
-    to job readiness timeouts are handled by adding the job index to the next
-    processing batch.
+    Manages asynchronous submission and result fetching while keeping a steady
+    pool of up to `batch_size` in-flight jobs:
+    - Retries (202/TimeoutError) are re-queued immediately.
+    - New jobs are submitted as capacity frees up.
+    - Fetches are started for jobs added each cycle.
+    - We always attempt to keep the executor saturated up to `batch_size`.
     """

     def __init__(
@@ -146,8 +146,6 @@ class _ConcurrentProcessor:
         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
-        self.next_allowed_fetch_time: Dict[str, float] = {}
-        self._retry_backoff_cap: float = 5.0
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)

@@ -195,8 +193,6 @@ class _ConcurrentProcessor:
         # Cleanup retry count if it exists for this job
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
-        if job_index in self.next_allowed_fetch_time:
-            del self.next_allowed_fetch_time[job_index]

         # Attempt to mark state as FAILED locally in the client (best effort)
         try:

@@ -254,8 +250,6 @@ class _ConcurrentProcessor:
         # Cleanup retry count if it exists
         if job_index in self.retry_counts:
             del self.retry_counts[job_index]
-        if job_index in self.next_allowed_fetch_time:
-            del self.next_allowed_fetch_time[job_index]

         # Execute completion callback if provided
         if self.completion_callback:
@@ -301,7 +295,7 @@ class _ConcurrentProcessor:

     def _collect_retry_jobs_for_batch(self) -> List[str]:
         """
-        Collect ...
+        Collect retry jobs for this batch, mirroring handler behavior (no pacing filter).

         Returns
         -------

@@ -311,34 +305,17 @@ class _ConcurrentProcessor:
         if not self.retry_job_ids:
             return []

-
-        eligible: List[str] = ...
-
-        for job_id in self.retry_job_ids:
-            allowed_at = self.next_allowed_fetch_time.get(job_id, 0.0)
-            if allowed_at <= now:
-                eligible.append(job_id)
-            else:
-                remaining.append(job_id)
-
+        # Take all retries this cycle and clear the list (handler resets per-iteration)
+        eligible: List[str] = list(self.retry_job_ids)
+        self.retry_job_ids = []
         if eligible and self.verbose:
             logger.debug(f"Adding {len(eligible)} retry jobs to current batch.")
-
-        # Keep non-eligible retries for a later batch
-        self.retry_job_ids = remaining
         return eligible

     def _schedule_retry(self, job_index: str) -> None:
         """
-        Schedule ...
+        Schedule an immediate retry for a job (no pacing), mirroring handler behavior.
         """
-        now = time.time()
-        attempt = max(1, self.retry_counts.get(job_index, 1))
-        base = max(0.01, float(self.retry_delay) if self.retry_delay is not None else 1.0)
-        delay = min(base * (2 ** (attempt - 1)), self._retry_backoff_cap)
-        jitter = random.uniform(0.8, 1.2)
-        wait_s = delay * jitter
-        self.next_allowed_fetch_time[job_index] = now + wait_s
         if job_index not in self.retry_job_ids:
             self.retry_job_ids.append(job_index)

@@ -401,11 +378,6 @@ class _ConcurrentProcessor:
             _ = self.client.submit_job_async(current_batch_new_job_indices, self.job_queue_id)
             # Add successfully initiated jobs to the overall batch list
             current_batch_job_indices.extend(current_batch_new_job_indices)
-            # Stagger the first fetch attempt slightly to avoid immediate 202s
-            now = time.time()
-            for job_index in current_batch_new_job_indices:
-                allowed_at = self.next_allowed_fetch_time.get(job_index, 0.0)
-                self.next_allowed_fetch_time[job_index] = max(allowed_at, now + float(self.initial_fetch_delay))
             # Update count of total initiated jobs
             submitted_new_indices_count += len(current_batch_new_job_indices)
         return current_batch_job_indices, submitted_new_indices_count

@@ -436,35 +408,18 @@ class _ConcurrentProcessor:
         normalized_job_indices : List[str]
             The job indices normalized to those actually returned by the client if a discrepancy occurs.
         """
-        # Filter indices by next_allowed_fetch_time to respect pacing for new jobs
-        now = time.time()
-        eligible_indices: List[str] = []
-        deferred_indices: List[str] = []
-        for idx in current_batch_job_indices:
-            if self.next_allowed_fetch_time.get(idx, 0.0) <= now:
-                eligible_indices.append(idx)
-            else:
-                deferred_indices.append(idx)
-
-        # Defer ineligible jobs for later retry window
-        for idx in deferred_indices:
-            if idx not in self.retry_job_ids:
-                self.retry_job_ids.append(idx)
-
         if self.verbose:
-            logger.debug(
-
-
-
-
-        batch_futures_dict = (
-            self.client.fetch_job_result_async(eligible_indices, data_only=False) if eligible_indices else {}
+            logger.debug(f"Calling fetch_job_result_async for {len(current_batch_job_indices)} jobs.")
+        batch_futures_dict: Dict[Future, str] = (
+            self.client.fetch_job_result_async(current_batch_job_indices, data_only=False, timeout=None)
+            if current_batch_job_indices
+            else {}
         )

         # Check for discrepancies where client might not return all futures
-        if ...
+        if current_batch_job_indices and (len(batch_futures_dict) != len(current_batch_job_indices)):
             returned_indices = set(batch_futures_dict.values())
-            missing_indices = [idx for idx in ...
+            missing_indices = [idx for idx in current_batch_job_indices if idx not in returned_indices]
             logger.error(
                 f"fetch_job_result_async discrepancy: Expected {len(current_batch_job_indices)}, got "
                 f"{len(batch_futures_dict)}. Missing: {missing_indices}"
@@ -479,82 +434,10 @@ class _ConcurrentProcessor:
             # Continue processing only the futures we received
             normalized_job_indices = list(returned_indices)
         else:
-            normalized_job_indices = list( ...
+            normalized_job_indices = list(current_batch_job_indices)

         return batch_futures_dict, normalized_job_indices

-    def _process_batch_futures(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
-        """
-        Process the batch futures as they complete, handling success, 202-timeout retries,
-        and failures according to existing logic.
-        """
-        if not batch_futures_dict:
-            if self.verbose:
-                logger.debug("No futures returned/available for processing in this batch.")
-            return
-
-        try:
-            for future in as_completed(batch_futures_dict.keys(), timeout=batch_timeout):
-                job_index = batch_futures_dict[future]
-                try:
-                    # Expect list with one tuple: [(data, index, trace)]
-                    result_list = future.result()
-                    if not isinstance(result_list, list) or len(result_list) != 1:
-                        raise ValueError(f"Expected list length 1, got {len(result_list)}")
-
-                    result_tuple = result_list[0]
-                    if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
-                        raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
-
-                    full_response_dict, fetched_job_index, trace_id = result_tuple
-
-                    if fetched_job_index != job_index:
-                        logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
-
-                    self._handle_processing_success(job_index, full_response_dict, trace_id)
-
-                except TimeoutError:
-                    # Handle job not ready - check retry policy and schedule paced retry
-                    self.retry_counts[job_index] += 1
-                    if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
-                        if self.verbose:
-                            logger.info(
-                                f"Job {job_index} not ready, scheduling paced retry (Attempt "
-                                f"{self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
-                            )
-                        self._schedule_retry(job_index)
-                    else:
-                        error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
-                        logger.error(error_msg)
-                        self._handle_processing_failure(job_index, error_msg)
-
-                except (ValueError, RuntimeError) as e:
-                    logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
-                    self._handle_processing_failure(job_index, f"Error processing result: {e}")
-                except Exception as e:
-                    logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
-                    self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")
-
-        except TimeoutError:
-            self._handle_batch_timeout(batch_futures_dict, batch_timeout)
-
-    def _handle_batch_timeout(self, batch_futures_dict: Dict[Future, str], batch_timeout: float) -> None:
-        """
-        Handle a timeout while waiting for batch futures, mirroring the original behavior.
-        """
-        logger.error(
-            f"Batch processing timed out after {batch_timeout}s waiting for futures. "
-            "Some jobs in batch may be lost or incomplete."
-        )
-        remaining_indices_in_batch = []
-        for f, idx in batch_futures_dict.items():
-            if not f.done():
-                remaining_indices_in_batch.append(idx)
-                f.cancel()  # Attempt to cancel underlying task
-        logger.warning(f"Jobs potentially lost/cancelled due to batch timeout: {remaining_indices_in_batch}")
-        for idx in remaining_indices_in_batch:
-            self._handle_processing_failure(idx, f"Batch processing timed out after {batch_timeout}s")
-
     def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
         """
         Executes the main processing loop in batches.
@@ -583,78 +466,117 @@ class _ConcurrentProcessor:
         total_jobs = len(self.all_job_indices_list)
         submitted_new_indices_count = 0  # Tracks indices for which submission has been initiated at least once

-        logger. ...
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.debug(f"Starting batch processing for {total_jobs} jobs with batch size {self.batch_size}.")
+
+        # Keep up to batch_size jobs in-flight at all times
+        inflight_futures: Dict[Future, str] = {}
+
+        while (submitted_new_indices_count < total_jobs) or self.retry_job_ids or inflight_futures:
+            # 1) Top up from retries first
+            capacity = max(0, self.batch_size - len(inflight_futures))
+            to_fetch: List[str] = []
+            if capacity > 0 and self.retry_job_ids:
+                take = min(capacity, len(self.retry_job_ids))
+                retry_now = self.retry_job_ids[:take]
+                self.retry_job_ids = self.retry_job_ids[take:]
+                to_fetch.extend(retry_now)
+                capacity -= len(retry_now)
+
+            # 2) Then add new jobs up to capacity
+            if capacity > 0 and (submitted_new_indices_count < total_jobs):
+                new_count = min(capacity, total_jobs - submitted_new_indices_count)
+                new_job_indices = self.all_job_indices_list[
+                    submitted_new_indices_count : submitted_new_indices_count + new_count
+                ]
+
+                if not self.job_queue_id:
+                    error_msg = "Cannot submit new jobs: job_queue_id is not set."
+                    logger.error(error_msg)
+                    for job_index in new_job_indices:
+                        self._handle_processing_failure(job_index, error_msg, is_submission_failure=True)
+                    submitted_new_indices_count += len(new_job_indices)
+                    if self.fail_on_submit_error:
+                        raise ValueError(error_msg)
+                else:
+                    try:
+                        _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+                        submitted_new_indices_count += len(new_job_indices)
+                        to_fetch.extend(new_job_indices)
+                    except Exception as e:
+                        error_msg = f"Batch async submission initiation failed for {len(new_job_indices)} new jobs: {e}"
+                        logger.error(error_msg, exc_info=True)
+                        for job_index in new_job_indices:
+                            self._handle_processing_failure(
+                                job_index, f"Batch submission initiation error: {e}", is_submission_failure=True
+                            )
+                        submitted_new_indices_count += len(new_job_indices)
+                        if self.fail_on_submit_error:
+                            raise RuntimeError(error_msg) from e

-            # 3) ...
-
-
-
-
-
-
-
-
-                # (errors already logged and failures recorded inside helper)
-                if self.fail_on_submit_error:
-                    raise
-
-            # 4) If no jobs to fetch this cycle, decide whether to exit or continue
-            if not current_batch_job_indices:
-                if self.verbose:
-                    logger.debug("No jobs identified for fetching in this batch iteration.")
-                if not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
-                    logger.debug("Exiting loop: No jobs to fetch and no retries pending.")
-                    break
-                # If retries remain but are not yet eligible, sleep until earliest allowed
-                if self.retry_job_ids:
-                    now = time.time()
-                    future_times = [self.next_allowed_fetch_time.get(j, now) for j in self.retry_job_ids]
-                    # Consider only times in the future
-                    future_times = [t for t in future_times if t > now]
-                    if future_times:
-                        sleep_for = min(max(min(future_times) - now, 0.05), 1.0)
-                        if self.verbose:
-                            logger.debug(f"Pacing retries: sleeping {sleep_for:.2f}s waiting for next allowed fetch.")
-                        time.sleep(sleep_for)
-                continue
-
-            # 5) Initiate fetching for the current batch
-            try:
-                batch_futures_dict, _ = self._initiate_fetch_for_batch(current_batch_job_indices)
-            except Exception as fetch_init_err:
-                error_msg = (
-                    f"fetch_job_result_async failed for batch ({len(current_batch_job_indices)} jobs): {fetch_init_err}"
-                )
-                logger.error(error_msg, exc_info=True)
-                logger.warning(
-                    f"Marking all {len(current_batch_job_indices)} jobs in failed fetch initiation batch as failed."
-                )
-                for job_index in current_batch_job_indices:
-                    self._handle_processing_failure(
-                        job_index, f"Fetch initiation failed for batch: {fetch_init_err}", is_submission_failure=True
+            # 3) Launch fetches for the jobs we added to this cycle
+            if to_fetch:
+                try:
+                    new_futures = self.client.fetch_job_result_async(to_fetch, data_only=False, timeout=None)
+                    inflight_futures.update(new_futures)
+                except Exception as fetch_init_err:
+                    logger.error(
+                        f"fetch_job_result_async failed to start for {len(to_fetch)} jobs: {fetch_init_err}",
+                        exc_info=True,
                     )
-
-
-
-
-
-
-
-
-
-                # ...
+                    for job_index in to_fetch:
+                        self._handle_processing_failure(
+                            job_index, f"Fetch initiation error: {fetch_init_err}", is_submission_failure=True
+                        )
+                    if self.fail_on_submit_error:
+                        raise RuntimeError(
+                            f"Stopping due to fetch initiation failure: {fetch_init_err}"
+                        ) from fetch_init_err
+
+            # 4) If nothing left anywhere, exit
+            if not inflight_futures and not self.retry_job_ids and submitted_new_indices_count >= total_jobs:
+                logger.debug("Exiting loop: No in-flight jobs, no retries, and all jobs submitted.")
+                break
+
+            # 5) Wait for at least one in-flight future to complete, then process done ones
+            if inflight_futures:
+                done, _ = concurrent.futures.wait(
+                    set(inflight_futures.keys()), return_when=concurrent.futures.FIRST_COMPLETED
+                )
+                for future in done:
+                    job_index = inflight_futures.pop(future, None)
+                    if job_index is None:
+                        continue
+                    try:
+                        result_list = future.result()
+                        if not isinstance(result_list, list) or len(result_list) != 1:
+                            raise ValueError(f"Expected list length 1, got {len(result_list)}")
+                        result_tuple = result_list[0]
+                        if not isinstance(result_tuple, (tuple, list)) or len(result_tuple) != 3:
+                            raise ValueError(f"Expected tuple/list length 3, got {len(result_tuple)}")
+                        full_response_dict, fetched_job_index, trace_id = result_tuple
+                        if fetched_job_index != job_index:
+                            logger.warning(f"Mismatch: Future for {job_index} returned {fetched_job_index}")
+                        self._handle_processing_success(job_index, full_response_dict, trace_id)
+                    except TimeoutError:
+                        # Not ready -> immediate retry
+                        self.retry_counts[job_index] += 1
+                        if self.max_job_retries is None or self.retry_counts[job_index] <= self.max_job_retries:
+                            if self.verbose:
+                                logger.info(
+                                    f"Job {job_index} not ready, scheduling retry "
+                                    f"(Attempt {self.retry_counts[job_index]}/{self.max_job_retries or 'inf'})."
+                                )
+                            self._schedule_retry(job_index)
+                        else:
+                            error_msg = f"Exceeded max fetch retries ({self.max_job_retries}) for job {job_index}."
+                            logger.error(error_msg)
+                            self._handle_processing_failure(job_index, error_msg)
+                    except (ValueError, RuntimeError) as e:
+                        logger.error(f"Job {job_index} failed processing result: {e}", exc_info=self.verbose)
+                        self._handle_processing_failure(job_index, f"Error processing result: {e}")
+                    except Exception as e:
+                        logger.exception(f"Unhandled error processing future for job {job_index}: {e}")
+                        self._handle_processing_failure(job_index, f"Unhandled error processing future: {e}")

         # --- Final Logging ---
         self._log_final_status(total_jobs)
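Aside for readers of the hunk above: the rewritten run() loop is the standard "keep the pool saturated" pattern, which tops up to batch_size in-flight futures, waits with FIRST_COMPLETED, and re-queues anything not yet ready. A minimal, self-contained sketch of that pattern, independent of the nv-ingest client API (do_fetch, the job list, and the 30% not-ready rate are illustrative stand-ins):

import concurrent.futures as cf
import random
import time


def do_fetch(job_id: str) -> str:
    """Stand-in for a fetch call; raises TimeoutError while the job is not ready."""
    time.sleep(0.05)
    if random.random() < 0.3:
        raise TimeoutError(f"{job_id} not ready yet")
    return f"result for {job_id}"


def run_saturated(job_ids, batch_size=4):
    results = []
    pending = list(job_ids)
    inflight = {}  # Future -> job_id
    with cf.ThreadPoolExecutor(max_workers=batch_size) as pool:
        while pending or inflight:
            # Top up the in-flight pool to batch_size.
            while pending and len(inflight) < batch_size:
                job_id = pending.pop(0)
                inflight[pool.submit(do_fetch, job_id)] = job_id
            # Block until at least one future completes, then drain the done set.
            done, _ = cf.wait(inflight, return_when=cf.FIRST_COMPLETED)
            for fut in done:
                job_id = inflight.pop(fut)
                try:
                    results.append(fut.result())
                except TimeoutError:
                    pending.append(job_id)  # not ready: immediate re-queue, as in the new loop
    return results


if __name__ == "__main__":
    print(len(run_saturated([f"job-{i}" for i in range(10)])))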
@@ -688,11 +610,12 @@ class NvIngestClient:
         message_client_port : int, optional
             Port of the REST/message service. Defaults to 7670.
         message_client_kwargs : dict, optional
-            Extra keyword arguments passed to the client allocator.
+            Extra keyword arguments passed to the client allocator. For RestClient,
+            can include 'api_version' (e.g., 'v1' or 'v2'). Defaults to 'v1'.
         msg_counter_id : str, optional
             Identifier for message counting. Defaults to "nv-ingest-message-id".
         worker_pool_size : int, optional
-            Number of workers in the thread pool. Defaults to ...
+            Number of workers in the thread pool. Defaults to 8.

         Returns
         -------

@@ -714,7 +637,7 @@ class NvIngestClient:
             **self._message_client_kwargs,
         )

-        # Initialize the worker pool with the specified size
+        # Initialize the worker pool with the specified size (used for both submit and fetch)
         self._worker_pool = ThreadPoolExecutor(max_workers=worker_pool_size)

         # Telemetry state and controls
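Given the updated docstring, the REST API version is chosen when the client is constructed. A sketch, assuming the usual NvIngestClient constructor arguments and a service listening on localhost:7670:

from nv_ingest_client.client import NvIngestClient

# Sketch: 'api_version' is forwarded to the RestClient allocator via message_client_kwargs;
# 'v1' remains the default per the docstring above.
client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=7670,
    message_client_kwargs={"api_version": "v2"},
    worker_pool_size=8,
)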
@@ -1210,6 +1133,7 @@ class NvIngestClient:
         self,
         job_ids: Union[str, List[str]],
         data_only: bool = False,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
     ) -> List[Tuple[Any, str, Optional[str]]]:
         """
         Fetch job results via CLI semantics (synchronous list return).

@@ -1229,7 +1153,8 @@ class NvIngestClient:
         if isinstance(job_ids, str):
             job_ids = [job_ids]

-
+        eff_timeout: Tuple[int, Optional[float]] = timeout if timeout is not None else (100, None)
+        return [self._fetch_job_result(job_id, timeout=eff_timeout, data_only=data_only) for job_id in job_ids]

     def _validate_batch_size(self, batch_size: Optional[int]) -> int:
         """

@@ -1346,8 +1271,8 @@ class NvIngestClient:
         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)

-        # Prepare timeout tuple
-        effective_timeout: Tuple[int, ...
+        # Prepare timeout tuple to mirror handler behavior: finite connect, unbounded read (long-poll)
+        effective_timeout: Tuple[int, Optional[float]] = (int(timeout), None)

         # Delegate to the concurrent processor
         processor = _ConcurrentProcessor(
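The timeout pair follows the common (connect, read) convention, where a read timeout of None waits indefinitely (long-poll). A tiny worked example of the value the new code builds from a caller-supplied timeout:

from typing import Optional, Tuple


def make_effective_timeout(timeout: float) -> Tuple[int, Optional[float]]:
    # Finite connect timeout, unbounded read timeout, mirroring the hunk above.
    return (int(timeout), None)


assert make_effective_timeout(100) == (100, None)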
@@ -1402,7 +1327,12 @@ class NvIngestClient:
         job_state.trace_id = future.result()[0]  # Trace_id from `submit_job` endpoint submission
         job_state.future = None

-    def fetch_job_result_async( ...
+    def fetch_job_result_async(
+        self,
+        job_ids: Union[str, List[str]],
+        data_only: bool = True,
+        timeout: Optional[Tuple[int, Optional[float]]] = None,
+    ) -> Dict[Future, str]:
         """
         Fetches job results for a list or a single job ID asynchronously and returns a mapping of futures to job IDs.

@@ -1423,7 +1353,7 @@ class NvIngestClient:
         future_to_job_id = {}
         for job_id in job_ids:
             job_state = self._get_and_check_job_state(job_id)
-            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only)
+            future = self._worker_pool.submit(self.fetch_job_result_cli, job_id, data_only, timeout)
             job_state.future = future
             future_to_job_id[future] = job_id

@@ -1707,7 +1637,9 @@ class NvIngestClient:

         return results

-    def create_jobs_for_batch( ...
+    def create_jobs_for_batch(
+        self, files_batch: List[str], tasks: Dict[str, Any], pdf_split_page_count: int = None
+    ) -> List[str]:
         """
         Create and submit job specifications (JobSpecs) for a batch of files, returning the job IDs.
         This function takes a batch of files, processes each file to extract its content and type,

@@ -1723,6 +1655,9 @@ class NvIngestClient:
             A dictionary of tasks to be added to each job. The keys represent task names, and the
             values represent task specifications or configurations. Standard tasks include "split",
             "extract", "store", "caption", "dedup", "filter", "embed".
+        pdf_split_page_count : int, optional
+            Number of pages per PDF chunk for splitting (1-128). If provided, this will be added
+            to the job spec's extended_options for PDF files.

         Returns
         -------

@@ -1769,6 +1704,10 @@ class NvIngestClient:

         job_specs = create_job_specs_for_batch(files_batch)

+        # Apply PDF split config if provided
+        if pdf_split_page_count is not None:
+            apply_pdf_split_config_to_job_specs(job_specs, pdf_split_page_count)
+
         job_ids = []
         for job_spec in job_specs:
             logger.debug(f"Tasks: {tasks.keys()}")
src/nv_ingest_client/client/ingest_job_handler.py (+6 -1)

@@ -45,6 +45,7 @@ class IngestJobHandler:
         show_progress: bool = True,
         show_telemetry: bool = False,
         job_queue_id: str = "ingest_task_queue",
+        pdf_split_page_count: int = None,
     ) -> None:
         self.client = client
         self.files = files

@@ -56,6 +57,7 @@ class IngestJobHandler:
         self.show_progress = show_progress
         self.show_telemetry = show_telemetry
         self.job_queue_id = job_queue_id
+        self.pdf_split_page_count = pdf_split_page_count
         self._pbar = None
         # Internal state used across iterations
         self._retry_job_ids: List[str] = []

@@ -144,7 +146,9 @@ class IngestJobHandler:
         new_job_count: int = min(self.batch_size - cur_job_count, len(self.files) - self._processed)
         batch_files: List[str] = self.files[self._processed : self._processed + new_job_count]

-        new_job_indices: List[str] = self.client.create_jobs_for_batch( ...
+        new_job_indices: List[str] = self.client.create_jobs_for_batch(
+            batch_files, self.tasks, pdf_split_page_count=self.pdf_split_page_count
+        )
         if len(new_job_indices) != new_job_count:
             missing_jobs: int = new_job_count - len(new_job_indices)
             error_msg: str = (

@@ -304,6 +308,7 @@ class IngestJobHandler:
         trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
         failed_jobs: List[str] = []
         retry_counts: Dict[str, int] = defaultdict(int)
+        pages_per_sec: float = None

         start_time_ns: int = time.time_ns()
         self._init_progress_bar(total_files)
src/nv_ingest_client/client/interface.py (+39 -1)

@@ -54,7 +54,7 @@ from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm

@@ -1237,6 +1237,44 @@ class Ingestor:

         return self

+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
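Based on the new pdf_split_config method, a chained call might look like the sketch below; the host, file glob, and default extract() task are placeholders, and a running nv-ingest service exposing the v2 API is assumed:

from nv_ingest_client.client import Ingestor

# Sketch only: pdf_split_config() must follow files() so job specs exist.
ingestor = (
    Ingestor(message_client_hostname="localhost", message_client_port=7670)
    .files("./data/*.pdf")                 # placeholder corpus
    .extract()                             # default extraction task
    .pdf_split_config(pages_per_chunk=64)  # values outside [1, 128] are clamped server-side
)
results = ingestor.ingest()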
src/nv_ingest_client/nv_ingest_cli.py (+22 -1)

@@ -74,6 +74,12 @@ logger = logging.getLogger(__name__)
 @click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
 @click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
+@click.option(
+    "--api_version",
+    default="v1",
+    type=click.Choice(["v1", "v2"], case_sensitive=False),
+    help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
+)
 @click.option(
     "--client_type",
     default="rest",

@@ -119,6 +125,8 @@ Example:
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
     --task 'caption:{}'
+    --pdf_split_page_count 64  # Configure PDF splitting (requires --api_version v2)
+    --api_version v2  # Use V2 API for PDF splitting support

     \b
     Tasks and Options:

@@ -207,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall ru
 )
 @click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
 @click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
+@click.option(
+    "--pdf_split_page_count",
+    default=None,
+    type=int,
+    help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
+)
 @click.option("--version", is_flag=True, help="Show version.")
 @click.pass_context
 def main(

@@ -215,6 +229,7 @@ def main(
     client_host: str,
     client_kwargs: str,
     client_port: int,
+    api_version: str,
     client_type: str,
     concurrency_n: int,
     dataset: str,

@@ -228,6 +243,7 @@ def main(
     collect_profiling_traces: bool,
     zipkin_host: str,
     zipkin_port: int,
+    pdf_split_page_count: int,
     task: [str],
     version: [bool],
 ):

@@ -268,6 +284,10 @@ def main(
         _client_kwargs_obj = json.loads(client_kwargs)
     except Exception:
         _client_kwargs_obj = {"raw": client_kwargs}
+
+    # Merge api_version into client_kwargs
+    _client_kwargs_obj["api_version"] = api_version
+
     _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
     logging.debug(
         f"Creating message client: {client_host} and port: {client_port} -> "

@@ -285,7 +305,7 @@ def main(
         message_client_allocator=client_allocator,
         message_client_hostname=client_host,
         message_client_port=client_port,
-        message_client_kwargs= ...
+        message_client_kwargs=_client_kwargs_obj,
         worker_pool_size=concurrency_n,
     )

@@ -300,6 +320,7 @@ def main(
         save_images_separately=save_images_separately,
         show_progress=True,
         show_telemetry=True,
+        pdf_split_page_count=pdf_split_page_count,
     )
     (total_files, trace_times, pages_processed, trace_ids) = handler.run()

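Putting the new flags together, a v2 invocation could look like the sketch below; the --doc path, output directory, and task payload are placeholders, and the remaining options follow the existing CLI help:

nv-ingest-cli \
  --doc ./data/sample.pdf \
  --output_directory ./processed_docs \
  --client_host localhost \
  --client_port 7670 \
  --api_version v2 \
  --pdf_split_page_count 64 \
  --task 'extract:{"document_type": "pdf", "extract_text": true}'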
src/nv_ingest_client/primitives/jobs/job_spec.py (+1 -0)

@@ -110,6 +110,7 @@ class JobSpec:
             "job_id": str(self._job_id),
             "tasks": [task.to_dict() for task in self._tasks],
             "tracing_options": self._extended_options.get("tracing_options", {}),
+            "pdf_config": self._extended_options.get("pdf_config", {}),
         }

     @property
src/nv_ingest_client/util/document_analysis.py (+1 -1)

@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)


 def analyze_document_chunks(
-    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
 ) -> Dict[str, Dict[str, Dict[str, int]]]:
     """
     Analyze ingestor results to count elements by type and page for each document.
src/nv_ingest_client/util/util.py (+26 -0)

@@ -350,6 +350,32 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     return job_specs


+def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
+    """
+    Apply PDF split configuration to a list of JobSpec objects.
+
+    Modifies job specs in-place by adding pdf_config to extended_options for PDF files only.
+
+    Parameters
+    ----------
+    job_specs : List[JobSpec]
+        List of job specifications to potentially modify
+    pages_per_chunk : int
+        Number of pages per PDF chunk (will be stored as-is; server performs clamping)
+
+    Notes
+    -----
+    - Only modifies job specs with document_type == "pdf" (case-insensitive)
+    - Modifies job specs in-place
+    - Safe to call on mixed document types (only PDFs are affected)
+    """
+    for job_spec in job_specs:
+        if job_spec.document_type.lower() == "pdf":
+            if "pdf_config" not in job_spec._extended_options:
+                job_spec._extended_options["pdf_config"] = {}
+            job_spec._extended_options["pdf_config"]["split_page_count"] = pages_per_chunk
+
+
 def filter_function_kwargs(func, **kwargs):
     """
     Filters and returns keyword arguments that match the parameters of a given function.
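The new helper mutates extended_options in place and skips non-PDF specs. A self-contained illustration of that behavior using a stand-in spec object (not the real JobSpec constructor):

from types import SimpleNamespace


def apply_pdf_split_config(job_specs, pages_per_chunk):
    # Same logic as the new utility: only PDF specs get a pdf_config entry.
    for spec in job_specs:
        if spec.document_type.lower() == "pdf":
            spec.extended_options.setdefault("pdf_config", {})["split_page_count"] = pages_per_chunk


specs = [
    SimpleNamespace(document_type="pdf", extended_options={}),
    SimpleNamespace(document_type="docx", extended_options={}),
]
apply_pdf_split_config(specs, 64)
print(specs[0].extended_options)  # {'pdf_config': {'split_page_count': 64}}
print(specs[1].extended_options)  # {} (non-PDF spec left unchanged)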
src/nv_ingest_client/util/vdb/milvus.py (+8 -5)

@@ -917,7 +917,9 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
             break
         # check if indexed_rows is staying the same, too many times means something is wrong
         if new_indexed_rows == indexed_rows:
-            pos_movement ...
+            pos_movement -= 1
+        else:
+            pos_movement = 10
         # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
         if pos_movement == 0:
             raise ValueError("Rows are not getting indexed as expected")
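The fix above turns pos_movement into a stall counter that is reset whenever indexing makes progress. A generic, runnable sketch of that watchdog pattern (names are illustrative, not the Milvus helper itself):

def watch_progress(samples, patience=10):
    """Raise if the observed value stops increasing for `patience` consecutive samples."""
    budget = patience
    previous = None
    for value in samples:
        if previous is not None and value == previous:
            budget -= 1           # no progress: spend one unit of patience
        else:
            budget = patience     # progress observed: reset the counter (the behavior added in this diff)
        if budget == 0:
            raise ValueError("Rows are not getting indexed as expected")
        previous = value
    return True


print(watch_progress([10, 20, 20, 30, 40]))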
@@ -1046,9 +1048,10 @@ def write_to_nvingest_collection(
             client,
             collection_name,
         )
-
-
-
+        if not local_index:
+            # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
+            # know how long this should take, it is num_elements dependent.
+            wait_for_index(collection_name, num_elements, client)
     else:
         minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
         bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
@@ -1349,7 +1352,7 @@ def nvingest_retrieval(
         nvidia_api_key=nvidia_api_key,
         input_type="query",
         output_names=["embeddings"],
-        grpc=not (urlparse(embedding_endpoint).scheme ...
+        grpc=not ("http" in urlparse(embedding_endpoint).scheme),
     )
     client = client or MilvusClient(milvus_uri, token=f"{username}:{password}")
     final_top_k = top_k