nv-ingest 2025.10.8.dev20251008__py3-none-any.whl → 2025.10.10.dev20251010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -7,9 +7,7 @@ import json
 import logging
 import os
 from json import JSONDecodeError
-from typing import Optional, Dict, Any
-
-from typing import List
+from typing import Optional, Dict, Any, List
 
 import redis
 
@@ -133,6 +131,8 @@ class RedisIngestService(IngestServiceMeta):
         self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
         self._cache_prefix: str = "processing_cache:"
         self._state_prefix: str = "job_state:"
+        # Bound async-to-thread concurrency slightly below Redis connection pool
+        self._async_operation_semaphore: Optional[asyncio.Semaphore] = None
 
         self._ingest_client = RedisClient(
             host=self._redis_hostname,
@@ -151,6 +151,16 @@ class RedisIngestService(IngestServiceMeta):
             f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
         )
 
+    def _get_async_semaphore(self) -> asyncio.Semaphore:
+        if self._async_operation_semaphore is None:
+            semaphore_limit = max(1, self._concurrency_level - 2)
+            self._async_operation_semaphore = asyncio.Semaphore(semaphore_limit)
+        return self._async_operation_semaphore
+
+    async def _run_bounded_to_thread(self, func, *args, **kwargs):
+        async with self._get_async_semaphore():
+            return await asyncio.to_thread(func, *args, **kwargs)
+
     async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
         """
         Validates, prepares, and submits a job specification to the Redis task queue.
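
The helper pair above gates every blocking redis-py call behind a lazily created asyncio.Semaphore sized to max(1, self._concurrency_level - 2), so the number of asyncio.to_thread workers holding a Redis connection stays slightly below the connection-pool size. A minimal, self-contained sketch of the same pattern follows; the pool size of 10 and the blocking_call stand-in are illustrative assumptions, not values taken from the package:

    import asyncio
    import time

    POOL_SIZE = 10  # assumed connection-pool size for this sketch
    semaphore = asyncio.Semaphore(max(1, POOL_SIZE - 2))  # leave head-room below the pool

    def blocking_call(i: int) -> int:
        # Stand-in for a synchronous redis-py operation that occupies a pooled connection.
        time.sleep(0.1)
        return i

    async def run_bounded(i: int) -> int:
        # At most POOL_SIZE - 2 of these run in worker threads at any moment.
        async with semaphore:
            return await asyncio.to_thread(blocking_call, i)

    async def main() -> None:
        results = await asyncio.gather(*(run_bounded(i) for i in range(50)))
        print(f"{len(results)} calls completed without exhausting the pool")

    asyncio.run(main())
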
@@ -211,7 +221,7 @@ class RedisIngestService(IngestServiceMeta):
             logger.debug(
                 f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
             )
-            await asyncio.to_thread(
+            await self._run_bounded_to_thread(
                 self._ingest_client.submit_message,
                 channel_name=self._redis_task_queue,
                 message=job_spec_json,
@@ -252,7 +262,7 @@ class RedisIngestService(IngestServiceMeta):
         try:
             result_channel: str = f"{job_id}"
             logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
-            message = await asyncio.to_thread(
+            message = await self._run_bounded_to_thread(
                 self._ingest_client.fetch_message,
                 channel_name=result_channel,
                 timeout=10,
@@ -264,7 +274,7 @@ class RedisIngestService(IngestServiceMeta):
                 logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                 raise TimeoutError("No data found (unexpected None response).")
         except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
-            logger.info(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
+            logger.debug(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
             raise e
         except Exception as e:
             logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
@@ -289,7 +299,7 @@ class RedisIngestService(IngestServiceMeta):
         ttl_to_set: Optional[int] = self._state_ttl_seconds
         try:
             logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
-            await asyncio.to_thread(
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 state_key,
                 state,
@@ -317,7 +327,10 @@ class RedisIngestService(IngestServiceMeta):
         """
         state_key: str = f"{self._state_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, state_key)
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                state_key,
+            )
             if data_bytes:
                 state: str = data_bytes.decode("utf-8")
                 logger.debug(f"Retrieved state for {job_id}: {state}")
@@ -350,7 +363,7 @@ class RedisIngestService(IngestServiceMeta):
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
             data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
-            await asyncio.to_thread(
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 cache_key,
                 data_to_store,
@@ -375,7 +388,10 @@ class RedisIngestService(IngestServiceMeta):
         """
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, cache_key)
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                cache_key,
+            )
             if data_bytes is None:
                 return []
             return [ProcessingJob(**job) for job in json.loads(data_bytes)]
@@ -393,3 +409,169 @@ class RedisIngestService(IngestServiceMeta):
             The current fetch mode.
         """
         return self._fetch_mode
+
+    async def set_parent_job_mapping(
+        self,
+        parent_job_id: str,
+        subjob_ids: List[str],
+        metadata: Dict[str, Any],
+        *,
+        subjob_descriptors: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
+        """
+        Store parent-subjob mapping in Redis for V2 PDF splitting.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+        subjob_ids : List[str]
+            List of subjob identifiers
+        metadata : Dict[str, Any]
+            Metadata about the parent job (total_pages, original_source_id, etc.)
+        subjob_descriptors : List[Dict[str, Any]], optional
+            Detailed descriptors (job_id, chunk_index, start/end pages) for subjobs
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Store subjob IDs as a set
+            await self._run_bounded_to_thread(
+                self._ingest_client.get_client().sadd,
+                parent_key,
+                *subjob_ids,
+            )
+
+            # Store metadata as hash (including original subjob ordering for deterministic fetches)
+            metadata_to_store = dict(metadata)
+            try:
+                metadata_to_store["subjob_order"] = json.dumps(subjob_ids)
+            except (TypeError, ValueError):
+                logger.warning(
+                    "Unable to serialize subjob ordering for parent %s; falling back to Redis set ordering",
+                    parent_job_id,
+                )
+                metadata_to_store.pop("subjob_order", None)
+
+            if subjob_descriptors:
+                metadata_to_store["subjob_descriptors"] = json.dumps(subjob_descriptors)
+
+            await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hset,
+                metadata_key,
+                mapping=metadata_to_store,
+            )
+
+            # Set TTL on both keys to match state TTL
+            if self._state_ttl_seconds:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    parent_key,
+                    self._state_ttl_seconds,
+                )
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    metadata_key,
+                    self._state_ttl_seconds,
+                )
+
+            logger.debug(f"Stored parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+
+        except Exception as err:
+            logger.exception(f"Error storing parent job mapping for {parent_job_id}: {err}")
+            raise
+
+    async def get_parent_job_info(self, parent_job_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve parent job information including subjob IDs and metadata.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+
+        Returns
+        -------
+        Dict[str, Any] or None
+            Dictionary with 'subjob_ids' and 'metadata' keys, or None if not a parent job
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Check if this is a parent job
+            exists = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().exists,
+                parent_key,
+            )
+
+            if not exists:
+                return None
+
+            # Get subjob IDs
+            subjob_ids_bytes = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().smembers,
+                parent_key,
+            )
+            subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes}
+
+            # Get metadata
+            metadata_dict = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hgetall,
+                metadata_key,
+            )
+            metadata = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata_dict.items()}
+
+            # Convert numeric strings back to numbers
+            if "total_pages" in metadata:
+                metadata["total_pages"] = int(metadata["total_pages"])
+            if "pages_per_chunk" in metadata:
+                try:
+                    metadata["pages_per_chunk"] = int(metadata["pages_per_chunk"])
+                except ValueError:
+                    metadata.pop("pages_per_chunk", None)
+
+            ordered_ids: Optional[List[str]] = None
+            stored_order = metadata.pop("subjob_order", None)
+            if stored_order:
+                try:
+                    candidate_order = json.loads(stored_order)
+                    if isinstance(candidate_order, list):
+                        ordered_ids = [sid for sid in candidate_order if sid in subjob_id_set]
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob order for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            if ordered_ids is None:
+                ordered_ids = sorted(subjob_id_set)
+            else:
+                remaining_ids = sorted(subjob_id_set - set(ordered_ids))
+                ordered_ids.extend(remaining_ids)
+
+            subjob_descriptors: Optional[List[Dict[str, Any]]] = None
+            stored_descriptors = metadata.pop("subjob_descriptors", None)
+            if stored_descriptors:
+                try:
+                    decoded = json.loads(stored_descriptors)
+                    if isinstance(decoded, list):
+                        subjob_descriptors = decoded
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob descriptors for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            return {
+                "subjob_ids": ordered_ids,
+                "metadata": metadata,
+                "subjob_descriptors": subjob_descriptors or [],
+            }
+
+        except Exception as err:
+            logger.error(f"Error retrieving parent job info for {parent_job_id}: {err}")
+            return None
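
Taken together, the two new methods define a small Redis key layout for split PDFs: a set at parent:<parent_job_id>:subjobs holding the subjob IDs, and a hash at parent:<parent_job_id>:metadata whose fields carry the parent metadata plus a JSON-encoded subjob_order (and optional subjob_descriptors) used to return subjobs in deterministic order. The sketch below shows that layout using redis-py directly; the connection settings, example IDs, metadata values, and the 3600-second TTL are illustrative assumptions, not values from the package:

    import json
    import redis

    r = redis.Redis(host="localhost", port=6379)  # placeholder connection for this sketch
    parent_job_id = "parent-123"                  # hypothetical IDs
    subjob_ids = ["parent-123-chunk-0", "parent-123-chunk-1"]

    # Roughly what set_parent_job_mapping writes:
    r.sadd(f"parent:{parent_job_id}:subjobs", *subjob_ids)   # unordered membership set
    r.hset(
        f"parent:{parent_job_id}:metadata",
        mapping={
            "total_pages": 12,                                # example metadata fields
            "original_source_id": "report.pdf",
            "subjob_order": json.dumps(subjob_ids),           # preserves submission order
        },
    )
    r.expire(f"parent:{parent_job_id}:subjobs", 3600)         # TTL mirrors the job-state TTL
    r.expire(f"parent:{parent_job_id}:metadata", 3600)

    # Roughly what get_parent_job_info reads back:
    members = {m.decode("utf-8") for m in r.smembers(f"parent:{parent_job_id}:subjobs")}
    meta = {k.decode("utf-8"): v.decode("utf-8") for k, v in r.hgetall(f"parent:{parent_job_id}:metadata").items()}
    ordered = [sid for sid in json.loads(meta.pop("subjob_order")) if sid in members]
    print(ordered, meta)
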
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.10.8.dev20251008
+Version: 2025.10.10.dev20251010
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -230,6 +230,7 @@ Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
+Requires-Dist: psutil>=7.1.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
 Requires-Dist: pypdfium2==4.30.0
@@ -1,11 +1,15 @@
 nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
 nv_ingest/version.py,sha256=MG7DxlzpnoJI56vqxwzs9WeMAEI3uPhfDiNLs6GN6wI,986
-nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
+nv_ingest/api/__init__.py,sha256=ED07QUqwVyJalH0ahhnnjvc2W_in6TpZZ5nJ6NWU9-Y,271
+nv_ingest/api/main.py,sha256=uCCkUNLS1xE9TDYKDOdxEfo_9jQWumpQAPWrxj5m9Go,1706
+nv_ingest/api/tracing.py,sha256=NkqMuUiB6ixGU5MYp3TrODsZDQepJ1kbH8JFHsYjuE0,2940
 nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/api/v1/health.py,sha256=pV-RoVq5y0iBPp0qZoLzd1xKpd0JiHAi0UMyMj99LqU,4740
 nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
+nv_ingest/api/v2/README.md,sha256=eJHe-AXOczH1FH0qOsQ4PNR1UCkt3nPFcAPcZ6PEDjk,4307
+nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
+nv_ingest/api/v2/ingest.py,sha256=XuEMgc1iRNOux83xRuTA5X9drPAR_vGDKEhyHXf9D5Q,32203
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -99,7 +103,7 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
 nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=KbzQFo7qVbCITiKYVPcGN0x4NI8piJy70Dz-8jf59Xs,15415
+nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=OuGC3FFhkLQLR3x4s-tyxGguYYn8ORKr2xkzMy2br0g,22552
 nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
@@ -113,8 +117,8 @@ nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3Z
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.10.8.dev20251008.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.10.8.dev20251008.dist-info/METADATA,sha256=hhFBvw9s5nCRjXnd5DFOANjkNoUlLLnjJ1M3A9mk2Kk,15092
-nv_ingest-2025.10.8.dev20251008.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.10.8.dev20251008.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.10.8.dev20251008.dist-info/RECORD,,
+nv_ingest-2025.10.10.dev20251010.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.10.10.dev20251010.dist-info/METADATA,sha256=tflK_t8mmrF2P-mZYhoa673OVzPhqQKOXTFEEJ_77ng,15122
+nv_ingest-2025.10.10.dev20251010.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.10.10.dev20251010.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.10.10.dev20251010.dist-info/RECORD,,