nv-ingest 2025.10.9.dev20251009__py3-none-any.whl → 2025.10.10.dev20251010__py3-none-any.whl
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +104 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +816 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +192 -10
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/METADATA +1 -1
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/RECORD +12 -8
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/top_level.txt +0 -0
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py
CHANGED
@@ -7,9 +7,7 @@ import json
 import logging
 import os
 from json import JSONDecodeError
-from typing import Optional, Dict, Any
-
-from typing import List
+from typing import Optional, Dict, Any, List
 
 import redis
 
@@ -133,6 +131,8 @@ class RedisIngestService(IngestServiceMeta):
         self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
         self._cache_prefix: str = "processing_cache:"
         self._state_prefix: str = "job_state:"
+        # Bound async-to-thread concurrency slightly below Redis connection pool
+        self._async_operation_semaphore: Optional[asyncio.Semaphore] = None
 
         self._ingest_client = RedisClient(
             host=self._redis_hostname,
@@ -151,6 +151,16 @@
             f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
         )
 
+    def _get_async_semaphore(self) -> asyncio.Semaphore:
+        if self._async_operation_semaphore is None:
+            semaphore_limit = max(1, self._concurrency_level - 2)
+            self._async_operation_semaphore = asyncio.Semaphore(semaphore_limit)
+        return self._async_operation_semaphore
+
+    async def _run_bounded_to_thread(self, func, *args, **kwargs):
+        async with self._get_async_semaphore():
+            return await asyncio.to_thread(func, *args, **kwargs)
+
     async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
         """
         Validates, prepares, and submits a job specification to the Redis task queue.
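The hunk above lazily builds a single asyncio.Semaphore sized to _concurrency_level - 2, so blocking Redis calls dispatched through asyncio.to_thread can never saturate the client's connection pool. A minimal standalone sketch of the same gate follows; blocking_call, run_bounded, and the pool size are hypothetical stand-ins for the service's Redis client and configuration, not code from this package:

import asyncio

def blocking_call(x: int) -> int:
    # Placeholder for a synchronous Redis operation (e.g. GET/SET).
    return x * 2

async def run_bounded(sem: asyncio.Semaphore, func, *args, **kwargs):
    # Same shape as _run_bounded_to_thread: gate entry first, then hand the
    # blocking function to the default thread-pool executor.
    async with sem:
        return await asyncio.to_thread(func, *args, **kwargs)

async def main():
    pool_size = 10  # stand-in for the service's _concurrency_level
    sem = asyncio.Semaphore(max(1, pool_size - 2))
    # 25 callers contend for at most 8 concurrent threads; the rest queue on the semaphore.
    results = await asyncio.gather(*(run_bounded(sem, blocking_call, i) for i in range(25)))
    print(results[:5])  # [0, 2, 4, 6, 8]

asyncio.run(main())

Creating the semaphore on first use rather than in __init__ presumably keeps it tied to the event loop that actually runs the service, which matters on older asyncio versions where synchronization primitives bind to a loop at construction.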
@@ -211,7 +221,7 @@
         logger.debug(
             f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
         )
-        await
+        await self._run_bounded_to_thread(
             self._ingest_client.submit_message,
             channel_name=self._redis_task_queue,
             message=job_spec_json,
@@ -252,7 +262,7 @@
         try:
             result_channel: str = f"{job_id}"
             logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
-            message = await
+            message = await self._run_bounded_to_thread(
                 self._ingest_client.fetch_message,
                 channel_name=result_channel,
                 timeout=10,
@@ -264,7 +274,7 @@
                 logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                 raise TimeoutError("No data found (unexpected None response).")
         except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
-            logger.
+            logger.debug(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
             raise e
         except Exception as e:
             logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
@@ -289,7 +299,7 @@
         ttl_to_set: Optional[int] = self._state_ttl_seconds
         try:
             logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
-            await
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 state_key,
                 state,
@@ -317,7 +327,10 @@
         """
         state_key: str = f"{self._state_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                state_key,
+            )
             if data_bytes:
                 state: str = data_bytes.decode("utf-8")
                 logger.debug(f"Retrieved state for {job_id}: {state}")
@@ -350,7 +363,7 @@
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
             data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
-            await
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 cache_key,
                 data_to_store,
@@ -375,7 +388,10 @@
         """
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                cache_key,
+            )
             if data_bytes is None:
                 return []
             return [ProcessingJob(**job) for job in json.loads(data_bytes)]
@@ -393,3 +409,169 @@
         The current fetch mode.
         """
         return self._fetch_mode
+
+    async def set_parent_job_mapping(
+        self,
+        parent_job_id: str,
+        subjob_ids: List[str],
+        metadata: Dict[str, Any],
+        *,
+        subjob_descriptors: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
+        """
+        Store parent-subjob mapping in Redis for V2 PDF splitting.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+        subjob_ids : List[str]
+            List of subjob identifiers
+        metadata : Dict[str, Any]
+            Metadata about the parent job (total_pages, original_source_id, etc.)
+        subjob_descriptors : List[Dict[str, Any]], optional
+            Detailed descriptors (job_id, chunk_index, start/end pages) for subjobs
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Store subjob IDs as a set
+            await self._run_bounded_to_thread(
+                self._ingest_client.get_client().sadd,
+                parent_key,
+                *subjob_ids,
+            )
+
+            # Store metadata as hash (including original subjob ordering for deterministic fetches)
+            metadata_to_store = dict(metadata)
+            try:
+                metadata_to_store["subjob_order"] = json.dumps(subjob_ids)
+            except (TypeError, ValueError):
+                logger.warning(
+                    "Unable to serialize subjob ordering for parent %s; falling back to Redis set ordering",
+                    parent_job_id,
+                )
+                metadata_to_store.pop("subjob_order", None)
+
+            if subjob_descriptors:
+                metadata_to_store["subjob_descriptors"] = json.dumps(subjob_descriptors)
+
+            await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hset,
+                metadata_key,
+                mapping=metadata_to_store,
+            )
+
+            # Set TTL on both keys to match state TTL
+            if self._state_ttl_seconds:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    parent_key,
+                    self._state_ttl_seconds,
+                )
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    metadata_key,
+                    self._state_ttl_seconds,
+                )
+
+            logger.debug(f"Stored parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+
+        except Exception as err:
+            logger.exception(f"Error storing parent job mapping for {parent_job_id}: {err}")
+            raise
+
+    async def get_parent_job_info(self, parent_job_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve parent job information including subjob IDs and metadata.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+
+        Returns
+        -------
+        Dict[str, Any] or None
+            Dictionary with 'subjob_ids' and 'metadata' keys, or None if not a parent job
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Check if this is a parent job
+            exists = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().exists,
+                parent_key,
+            )
+
+            if not exists:
+                return None
+
+            # Get subjob IDs
+            subjob_ids_bytes = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().smembers,
+                parent_key,
+            )
+            subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes}
+
+            # Get metadata
+            metadata_dict = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hgetall,
+                metadata_key,
+            )
+            metadata = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata_dict.items()}
+
+            # Convert numeric strings back to numbers
+            if "total_pages" in metadata:
+                metadata["total_pages"] = int(metadata["total_pages"])
+            if "pages_per_chunk" in metadata:
+                try:
+                    metadata["pages_per_chunk"] = int(metadata["pages_per_chunk"])
+                except ValueError:
+                    metadata.pop("pages_per_chunk", None)
+
+            ordered_ids: Optional[List[str]] = None
+            stored_order = metadata.pop("subjob_order", None)
+            if stored_order:
+                try:
+                    candidate_order = json.loads(stored_order)
+                    if isinstance(candidate_order, list):
+                        ordered_ids = [sid for sid in candidate_order if sid in subjob_id_set]
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob order for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            if ordered_ids is None:
+                ordered_ids = sorted(subjob_id_set)
+            else:
+                remaining_ids = sorted(subjob_id_set - set(ordered_ids))
+                ordered_ids.extend(remaining_ids)
+
+            subjob_descriptors: Optional[List[Dict[str, Any]]] = None
+            stored_descriptors = metadata.pop("subjob_descriptors", None)
+            if stored_descriptors:
+                try:
+                    decoded = json.loads(stored_descriptors)
+                    if isinstance(decoded, list):
+                        subjob_descriptors = decoded
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob descriptors for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            return {
+                "subjob_ids": ordered_ids,
+                "metadata": metadata,
+                "subjob_descriptors": subjob_descriptors or [],
+            }
+
+        except Exception as err:
+            logger.error(f"Error retrieving parent job info for {parent_job_id}: {err}")
+            return None
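set_parent_job_mapping fans a split PDF out into a Redis set of subjob IDs plus a metadata hash, and get_parent_job_info reverses it; because Redis sets are unordered, submission order is recovered from the subjob_order JSON stored in the hash. A round-trip sketch of that key layout against a local Redis follows; the IDs and metadata values are made up, and redis.Redis() stands in for the service's pooled client:

import json
import redis  # assumes the redis-py client used elsewhere in this module

r = redis.Redis()                      # hypothetical local connection
parent_job_id = "parent-123"           # hypothetical IDs for illustration
subjob_ids = ["sub-2", "sub-0", "sub-1"]

# Write side: mirrors set_parent_job_mapping's key layout.
r.sadd(f"parent:{parent_job_id}:subjobs", *subjob_ids)
r.hset(
    f"parent:{parent_job_id}:metadata",
    mapping={"total_pages": 30, "subjob_order": json.dumps(subjob_ids)},
)

# Read side: recover the submission order from the stored JSON list,
# falling back to a sorted set when it is absent or unparseable.
members = {m.decode() for m in r.smembers(f"parent:{parent_job_id}:subjobs")}
meta = {k.decode(): v.decode() for k, v in r.hgetall(f"parent:{parent_job_id}:metadata").items()}
order = json.loads(meta.pop("subjob_order", "null"))
ordered = [sid for sid in order if sid in members] if isinstance(order, list) else sorted(members)
print(ordered)  # ['sub-2', 'sub-0', 'sub-1']

Filtering the stored order through the live set membership, as the method above also does, keeps the result consistent even if individual subjob entries have expired or been removed.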
{nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/RECORD
RENAMED
@@ -1,11 +1,15 @@
 nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
 nv_ingest/version.py,sha256=MG7DxlzpnoJI56vqxwzs9WeMAEI3uPhfDiNLs6GN6wI,986
-nv_ingest/api/__init__.py,sha256=
-nv_ingest/api/main.py,sha256=
+nv_ingest/api/__init__.py,sha256=ED07QUqwVyJalH0ahhnnjvc2W_in6TpZZ5nJ6NWU9-Y,271
+nv_ingest/api/main.py,sha256=uCCkUNLS1xE9TDYKDOdxEfo_9jQWumpQAPWrxj5m9Go,1706
+nv_ingest/api/tracing.py,sha256=NkqMuUiB6ixGU5MYp3TrODsZDQepJ1kbH8JFHsYjuE0,2940
 nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/api/v1/health.py,sha256=pV-RoVq5y0iBPp0qZoLzd1xKpd0JiHAi0UMyMj99LqU,4740
 nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
+nv_ingest/api/v2/README.md,sha256=eJHe-AXOczH1FH0qOsQ4PNR1UCkt3nPFcAPcZ6PEDjk,4307
+nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
+nv_ingest/api/v2/ingest.py,sha256=XuEMgc1iRNOux83xRuTA5X9drPAR_vGDKEhyHXf9D5Q,32203
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -99,7 +103,7 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
 nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=
+nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=OuGC3FFhkLQLR3x4s-tyxGguYYn8ORKr2xkzMy2br0g,22552
 nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
@@ -113,8 +117,8 @@ nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3Z
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.10.
-nv_ingest-2025.10.
-nv_ingest-2025.10.
-nv_ingest-2025.10.
-nv_ingest-2025.10.
+nv_ingest-2025.10.10.dev20251010.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.10.10.dev20251010.dist-info/METADATA,sha256=tflK_t8mmrF2P-mZYhoa673OVzPhqQKOXTFEEJ_77ng,15122
+nv_ingest-2025.10.10.dev20251010.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.10.10.dev20251010.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.10.10.dev20251010.dist-info/RECORD,,
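For reference when reading the rows above: each RECORD entry is `path,sha256=<urlsafe base64 digest, unpadded>,<size in bytes>`, and the trailing `RECORD,,` row is the record file itself, which carries no hash. A small sketch of how such an entry can be recomputed for verification, using a hypothetical local path:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # RECORD rows are "path,sha256=<urlsafe-b64 digest without padding>,<size>".
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

print(record_entry("nv_ingest/version.py"))  # hypothetical path inside an unpacked wheel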
{nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/WHEEL
RENAMED
File without changes
{nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/licenses/LICENSE
RENAMED
File without changes
{nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.10.dev20251010.dist-info}/top_level.txt
RENAMED
File without changes