nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,352 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import functools
+ import hashlib
+ import inspect
+ import logging
+ import os
+ import time
+ from typing import Dict, List, Any, Optional, Callable
+ from dataclasses import dataclass
+
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
+ from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class CachedUDF:
+     """Cached UDF function with metadata"""
+
+     function: callable
+     function_name: str
+     signature_validated: bool
+     created_at: float
+     last_used: float
+     use_count: int
+
+
+ class UDFCache:
+     """LRU cache for compiled and validated UDF functions"""
+
+     def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
+         self.max_size = max_size
+         self.ttl_seconds = ttl_seconds
+         self.cache: Dict[str, CachedUDF] = {}
+         self.access_order: List[str] = []  # For LRU tracking
+
+     def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
+         """Generate cache key from UDF string and function name"""
+         content = f"{udf_function_str.strip()}:{udf_function_name}"
+         return hashlib.sha256(content.encode()).hexdigest()
+
+     def _evict_lru(self):
+         """Remove least recently used item"""
+         if self.access_order:
+             lru_key = self.access_order.pop(0)
+             self.cache.pop(lru_key, None)
+
+     def _cleanup_expired(self):
+         """Remove expired entries if TTL is configured"""
+         if not self.ttl_seconds:
+             return
+
+         current_time = time.time()
+         expired_keys = [
+             key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
+         ]
+
+         for key in expired_keys:
+             self.cache.pop(key, None)
+             if key in self.access_order:
+                 self.access_order.remove(key)
+
+     def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
+         """Get cached UDF function if available"""
+         self._cleanup_expired()
+
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         if cache_key in self.cache:
+             # Update access tracking
+             if cache_key in self.access_order:
+                 self.access_order.remove(cache_key)
+             self.access_order.append(cache_key)
+
+             # Update usage stats
+             cached_udf = self.cache[cache_key]
+             cached_udf.last_used = time.time()
+             cached_udf.use_count += 1
+
+             return cached_udf
+
+         return None
+
+     def put(
+         self, udf_function_str: str, udf_function_name: str, function: callable, signature_validated: bool = True
+     ) -> str:
+         """Cache a compiled and validated UDF function"""
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         # Evict LRU if at capacity
+         while len(self.cache) >= self.max_size:
+             self._evict_lru()
+
+         current_time = time.time()
+         cached_udf = CachedUDF(
+             function=function,
+             function_name=udf_function_name,
+             signature_validated=signature_validated,
+             created_at=current_time,
+             last_used=current_time,
+             use_count=1,
+         )
+
+         self.cache[cache_key] = cached_udf
+         self.access_order.append(cache_key)
+
+         return cache_key
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get cache statistics"""
+         total_uses = sum(udf.use_count for udf in self.cache.values())
+         most_used = max(self.cache.values(), key=lambda x: x.use_count, default=None)
+         return {
+             "size": len(self.cache),
+             "max_size": self.max_size,
+             "total_uses": total_uses,
+             "most_used_function": most_used.function_name if most_used else None,
+             "most_used_count": most_used.use_count if most_used else 0,
+         }
+
+
+ # Global cache instance
+ _udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
+
+
+ def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> callable:
+     """Compile and validate UDF function (extracted for caching)"""
+     # Execute the UDF function string in a controlled namespace
+     namespace: Dict[str, Any] = {}
+     try:
+         exec(udf_function_str, namespace)
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}")
+
+     # Extract the specified function from the namespace
+     if udf_function_name in namespace and callable(namespace[udf_function_name]):
+         udf_function = namespace[udf_function_name]
+     else:
+         raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")
+
+     # Validate the UDF function signature
+     try:
+         ingest_callable_signature(inspect.signature(udf_function))
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}")
+
+     return udf_function
+
+
+ def execute_targeted_udfs(
+     control_message: IngestControlMessage, stage_name: str, directive: str
+ ) -> IngestControlMessage:
+     """Execute UDFs that target this stage with the given directive."""
+     # Early exit if no UDF tasks exist - check by task type, not task ID
+     udf_tasks_exist = any(task.type == "udf" for task in control_message.get_tasks())
+     if not udf_tasks_exist:
+         return control_message
+
+     # Remove all UDF tasks and get them - handle case where no tasks found
+     try:
+         all_udf_tasks = remove_all_tasks_by_type(control_message, "udf")
+     except ValueError:
+         # No UDF tasks found - this can happen due to race conditions
+         logger.debug(f"No UDF tasks found for stage '{stage_name}' directive '{directive}'")
+         return control_message
+
+     # Execute applicable UDFs and collect remaining ones
+     remaining_tasks = []
+
+     for task_properties in all_udf_tasks:
+         # Check if this UDF targets this stage with the specified directive
+         target_stage = task_properties.get("target_stage", "")
+         run_before = task_properties.get("run_before", False)
+         run_after = task_properties.get("run_after", False)
+
+         # Determine if this UDF should execute
+         should_execute = False
+         if directive == "run_before" and run_before and target_stage == stage_name:
+             should_execute = True
+         elif directive == "run_after" and run_after and target_stage == stage_name:
+             should_execute = True
+
+         if should_execute:
+             try:
+                 # Get UDF function details
+                 udf_function_str = task_properties.get("udf_function", "").strip()
+                 udf_function_name = task_properties.get("udf_function_name", "").strip()
+                 task_id = task_properties.get("task_id", "unknown")
+
+                 # Skip empty UDF functions
+                 if not udf_function_str:
+                     logger.debug(f"UDF task {task_id} has empty function, skipping")
+                     remaining_tasks.append(task_properties)
+                     continue
+
+                 # Validate function name
+                 if not udf_function_name:
+                     raise ValueError(f"UDF task {task_id} missing required 'udf_function_name' property")
+
+                 # Get or compile UDF function
+                 cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
+                 if cached_udf:
+                     udf_function = cached_udf.function
+                     logger.debug(f"UDF task {task_id}: Using cached function '{udf_function_name}'")
+                 else:
+                     udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_id)
+                     _udf_cache.put(udf_function_str, udf_function_name, udf_function)
+                     logger.debug(f"UDF task {task_id}: Cached function '{udf_function_name}'")
+
+                 # Execute the UDF
+                 control_message = udf_function(control_message)
+
+                 # Validate return type
+                 if not isinstance(control_message, IngestControlMessage):
+                     raise ValueError(f"UDF task {task_id} must return IngestControlMessage")
+
+                 logger.info(f"Executed UDF {task_id} '{udf_function_name}' {directive} stage '{stage_name}'")
+
+             except Exception as e:
+                 logger.error(f"UDF {task_id} failed {directive} stage '{stage_name}': {e}")
+                 # Keep failed task for next stage
+                 remaining_tasks.append(task_properties)
+         else:
+             # Keep non-applicable task for next stage
+             remaining_tasks.append(task_properties)
+
+     # Re-add all remaining UDF tasks
+     for task_properties in remaining_tasks:
+         from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
+
+         task = ControlMessageTask(type="udf", id=task_properties.get("task_id", "unknown"), properties=task_properties)
+         control_message.add_task(task)
+
+     return control_message
+
+
+ def remove_task_by_id(control_message: IngestControlMessage, task_id: str) -> IngestControlMessage:
+     """Remove a specific task by ID from the control message"""
+     try:
+         control_message.remove_task(task_id)
+     except RuntimeError as e:
+         logger.warning(f"Could not remove task {task_id}: {e}")
+
+     return control_message
+
+
+ def udf_intercept_hook(stage_name: Optional[str] = None, enable_run_before: bool = True, enable_run_after: bool = True):
+     """
+     Decorator that executes UDFs targeted at this stage.
+
+     This decorator integrates with the existing UDF system, providing full
+     UDF compilation, caching, and execution capabilities. UDFs can target
+     specific stages using run_before or run_after directives.
+
+     Args:
+         stage_name: Name of the stage (e.g., "image_dedup", "text_extract").
+             If None, will attempt to use self.stage_name from the decorated method's instance.
+         enable_run_before: Whether to execute UDFs with run_before=True (default: True)
+         enable_run_after: Whether to execute UDFs with run_after=True (default: True)
+
+     Examples:
+         # Automatic stage name detection (recommended)
+         @traceable("image_deduplication")
+         @udf_intercept_hook()  # Uses self.stage_name automatically
+         @filter_by_task(required_tasks=["dedup"])
+         def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+             return control_message
+
+         # Explicit stage name (fallback/override)
+         @traceable("data_sink")
+         @udf_intercept_hook("data_sink", enable_run_after=False)
+         @filter_by_task(required_tasks=["store"])
+         def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+             return control_message
+
+         # Only run_after UDFs (e.g., for source stages)
+         @traceable("data_source")
+         @udf_intercept_hook(enable_run_before=False)  # Uses self.stage_name automatically
+         def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+             return control_message
+     """
+
+     def decorator(func: Callable) -> Callable:
+         @functools.wraps(func)
+         def wrapper(*args: Any, **kwargs: Any) -> Any:
+             # Check if UDF processing is globally disabled
+             if os.getenv("INGEST_DISABLE_UDF_PROCESSING"):
+                 logger.debug("UDF processing is disabled via INGEST_DISABLE_UDF_PROCESSING environment variable")
+                 return func(*args, **kwargs)
+
+             # Determine the stage name to use
+             resolved_stage_name = stage_name
+
+             # If no explicit stage_name provided, try to get it from self.stage_name
+             if resolved_stage_name is None and len(args) >= 1:
+                 stage_instance = args[0]  # 'self' in method calls
+                 if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
+                     resolved_stage_name = stage_instance.stage_name
+                     logger.debug(f"Using auto-detected stage name: '{resolved_stage_name}'")
+                 else:
+                     logger.warning(
+                         "No stage_name provided and could not auto-detect from instance. Skipping UDF intercept."
+                     )
+                     return func(*args, **kwargs)
+             elif resolved_stage_name is None:
+                 logger.warning(
+                     "No stage_name provided and no instance available for auto-detection. Skipping UDF intercept."
+                 )
+                 return func(*args, **kwargs)
+
+             # Extract control_message from args (handle both self.method and function cases)
+             control_message = None
+             if len(args) >= 2 and hasattr(args[1], "get_tasks"):
+                 control_message = args[1]  # self.method case
+                 args_list = list(args)
+             elif len(args) >= 1 and hasattr(args[0], "get_tasks"):
+                 control_message = args[0]  # function case
+                 args_list = list(args)
+
+             if control_message:
+                 # Execute UDFs that should run before this stage (if enabled)
+                 if enable_run_before:
+                     control_message = execute_targeted_udfs(control_message, resolved_stage_name, "run_before")
+                     # Update args with modified control_message
+                     if len(args) >= 2 and hasattr(args[1], "get_tasks"):
+                         args_list[1] = control_message
+                     else:
+                         args_list[0] = control_message
+
+                 # Execute the original stage logic
+                 result = func(*tuple(args_list), **kwargs)
+
+                 # Execute UDFs that should run after this stage (if enabled)
+                 if enable_run_after and hasattr(result, "get_tasks"):  # Result is control_message
+                     result = execute_targeted_udfs(result, resolved_stage_name, "run_after")
+
+                 return result
+             else:
+                 return func(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
+
+
+ def get_udf_cache_stats() -> Dict[str, Any]:
+     """Get UDF cache performance statistics"""
+     return _udf_cache.get_stats()
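
The `UDFCache` above keys entries on a SHA-256 of the UDF source text plus the entry-point name, evicts the least-recently-used entry at capacity, and optionally expires entries by TTL. Below is a minimal sketch of the LRU behavior, exercising only the `UDFCache` class added in this file; the lambdas stand in for compiled UDFs, and the source strings are illustrative:

```python
# Sketch of UDFCache LRU semantics (lambdas stand in for compiled UDFs).
from nv_ingest.framework.util.flow_control.udf_intercept import UDFCache

cache = UDFCache(max_size=2, ttl_seconds=None)  # no TTL: pure LRU
cache.put("def a(m): return m", "a", lambda m: m)
cache.put("def b(m): return m", "b", lambda m: m)

cache.get("def a(m): return m", "a")               # touch 'a'; 'b' is now LRU
cache.put("def c(m): return m", "c", lambda m: m)  # at capacity: evicts 'b'

assert cache.get("def b(m): return m", "b") is None  # 'b' was evicted
print(cache.get_stats())  # e.g. {'size': 2, 'max_size': 2, 'total_uses': 3, ...}
```

Because the key covers the full source text, editing a UDF in place produces a new cache entry rather than a stale hit.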
@@ -7,9 +7,7 @@ import json
  import logging
  import os
  from json import JSONDecodeError
- from typing import Optional, Dict, Any
-
- from typing import List
+ from typing import Optional, Dict, Any, List

  import redis

@@ -133,6 +131,8 @@ class RedisIngestService(IngestServiceMeta):
          self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
          self._cache_prefix: str = "processing_cache:"
          self._state_prefix: str = "job_state:"
+         # Bound async-to-thread concurrency slightly below Redis connection pool
+         self._async_operation_semaphore: Optional[asyncio.Semaphore] = None

          self._ingest_client = RedisClient(
              host=self._redis_hostname,
@@ -151,6 +151,16 @@ class RedisIngestService(IngestServiceMeta):
              f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
          )

+     def _get_async_semaphore(self) -> asyncio.Semaphore:
+         if self._async_operation_semaphore is None:
+             semaphore_limit = max(1, self._concurrency_level - 2)
+             self._async_operation_semaphore = asyncio.Semaphore(semaphore_limit)
+         return self._async_operation_semaphore
+
+     async def _run_bounded_to_thread(self, func, *args, **kwargs):
+         async with self._get_async_semaphore():
+             return await asyncio.to_thread(func, *args, **kwargs)
+
      async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
          """
          Validates, prepares, and submits a job specification to the Redis task queue.
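
`_run_bounded_to_thread` is the concurrency valve for every blocking Redis call in this service: a lazily created semaphore caps in-flight `asyncio.to_thread` work at two below the connection-pool size, so blocking I/O cannot exhaust the thread or connection pool. A standalone sketch of the same pattern, where `POOL_SIZE` is an assumed pool size and `blocking_call` stands in for the `self._ingest_client` methods:

```python
import asyncio
import time

POOL_SIZE = 10
# Mirror _get_async_semaphore(): limit = max(1, pool size - 2).
semaphore = asyncio.Semaphore(max(1, POOL_SIZE - 2))

def blocking_call(i: int) -> int:
    time.sleep(0.01)  # pretend Redis round-trip
    return i

async def run_bounded(func, *args):
    # Acquire before dispatching to a worker thread, release when done.
    async with semaphore:
        return await asyncio.to_thread(func, *args)

async def main():
    results = await asyncio.gather(*(run_bounded(blocking_call, i) for i in range(100)))
    print(len(results))  # 100 results, but never more than 8 calls in flight

asyncio.run(main())
```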
@@ -208,12 +218,33 @@ class RedisIngestService(IngestServiceMeta):
          ttl_for_result: Optional[int] = (
              self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
          )
+         # Determine target queue based on optional QoS hint
+         queue_hint = None
+         try:
+             routing_opts = job_spec.get("routing_options") or {}
+             tracing_opts = job_spec.get("tracing_options") or {}
+             queue_hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
+         except Exception:
+             queue_hint = None
+         allowed = {"default", "immediate", "micro", "small", "medium", "large"}
+         if isinstance(queue_hint, str) and queue_hint in allowed:
+             if queue_hint == "default":
+                 channel_name = self._redis_task_queue
+             else:
+                 channel_name = f"{self._redis_task_queue}_{queue_hint}"
+         else:
+             channel_name = self._redis_task_queue
+         logger.debug(
+             f"Submitting job {trace_id} to queue '{channel_name}' (hint={queue_hint}) "
+             f"with result TTL: {ttl_for_result}"
+         )
+
          logger.debug(
              f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
          )
-         await asyncio.to_thread(
+         await self._run_bounded_to_thread(
              self._ingest_client.submit_message,
-             channel_name=self._redis_task_queue,
+             channel_name=channel_name,
              message=job_spec_json,
              ttl_seconds=ttl_for_result,
          )
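
The queue hint is a QoS escape hatch: a whitelisted hint from `routing_options` (with `tracing_options` as a fallback) selects a suffixed Redis queue, while anything unrecognized falls through to the default queue. The same resolution logic rewritten as a pure function for clarity; `ingest_task_queue` is a placeholder base queue name, not taken from the package:

```python
# Sketch of the queue-hint resolution above; base_queue is a placeholder.
ALLOWED_HINTS = {"default", "immediate", "micro", "small", "medium", "large"}

def resolve_channel(base_queue: str, job_spec: dict) -> str:
    hint = (job_spec.get("routing_options") or {}).get("queue_hint") or (
        job_spec.get("tracing_options") or {}
    ).get("queue_hint")
    if isinstance(hint, str) and hint in ALLOWED_HINTS and hint != "default":
        return f"{base_queue}_{hint}"
    return base_queue

print(resolve_channel("ingest_task_queue", {"routing_options": {"queue_hint": "large"}}))
# -> ingest_task_queue_large
print(resolve_channel("ingest_task_queue", {"tracing_options": {"queue_hint": "bogus"}}))
# -> ingest_task_queue (unrecognized hints fall back to the default queue)
```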
@@ -252,7 +283,7 @@ class RedisIngestService(IngestServiceMeta):
          try:
              result_channel: str = f"{job_id}"
              logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
-             message = await asyncio.to_thread(
+             message = await self._run_bounded_to_thread(
                  self._ingest_client.fetch_message,
                  channel_name=result_channel,
                  timeout=10,
@@ -264,7 +295,7 @@ class RedisIngestService(IngestServiceMeta):
                  logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                  raise TimeoutError("No data found (unexpected None response).")
          except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
-             logger.info(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
+             logger.debug(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
              raise e
          except Exception as e:
              logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
@@ -289,7 +320,7 @@ class RedisIngestService(IngestServiceMeta):
          ttl_to_set: Optional[int] = self._state_ttl_seconds
          try:
              logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
-             await asyncio.to_thread(
+             await self._run_bounded_to_thread(
                  self._ingest_client.get_client().set,
                  state_key,
                  state,
@@ -317,7 +348,10 @@ class RedisIngestService(IngestServiceMeta):
          """
          state_key: str = f"{self._state_prefix}{job_id}"
          try:
-             data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, state_key)
+             data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().get,
+                 state_key,
+             )
              if data_bytes:
                  state: str = data_bytes.decode("utf-8")
                  logger.debug(f"Retrieved state for {job_id}: {state}")
@@ -350,7 +384,7 @@ class RedisIngestService(IngestServiceMeta):
          cache_key: str = f"{self._cache_prefix}{job_id}"
          try:
              data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
-             await asyncio.to_thread(
+             await self._run_bounded_to_thread(
                  self._ingest_client.get_client().set,
                  cache_key,
                  data_to_store,
@@ -375,7 +409,10 @@ class RedisIngestService(IngestServiceMeta):
          """
          cache_key: str = f"{self._cache_prefix}{job_id}"
          try:
-             data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, cache_key)
+             data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().get,
+                 cache_key,
+             )
              if data_bytes is None:
                  return []
              return [ProcessingJob(**job) for job in json.loads(data_bytes)]
@@ -393,3 +430,170 @@ class RedisIngestService(IngestServiceMeta):
              The current fetch mode.
          """
          return self._fetch_mode
+
+     async def set_parent_job_mapping(
+         self,
+         parent_job_id: str,
+         subjob_ids: List[str],
+         metadata: Dict[str, Any],
+         *,
+         subjob_descriptors: Optional[List[Dict[str, Any]]] = None,
+     ) -> None:
+         """
+         Store parent-subjob mapping in Redis for V2 PDF splitting.
+
+         Parameters
+         ----------
+         parent_job_id : str
+             The parent job identifier
+         subjob_ids : List[str]
+             List of subjob identifiers
+         metadata : Dict[str, Any]
+             Metadata about the parent job (total_pages, original_source_id, etc.)
+         subjob_descriptors : List[Dict[str, Any]], optional
+             Detailed descriptors (job_id, chunk_index, start/end pages) for subjobs
+         """
+         parent_key = f"parent:{parent_job_id}:subjobs"
+         metadata_key = f"parent:{parent_job_id}:metadata"
+
+         try:
+             # Store subjob IDs as a set (only if there are subjobs)
+             if subjob_ids:
+                 await self._run_bounded_to_thread(
+                     self._ingest_client.get_client().sadd,
+                     parent_key,
+                     *subjob_ids,
+                 )
+
+             # Store metadata as hash (including original subjob ordering for deterministic fetches)
+             metadata_to_store = dict(metadata)
+             try:
+                 metadata_to_store["subjob_order"] = json.dumps(subjob_ids)
+             except (TypeError, ValueError):
+                 logger.warning(
+                     "Unable to serialize subjob ordering for parent %s; falling back to Redis set ordering",
+                     parent_job_id,
+                 )
+                 metadata_to_store.pop("subjob_order", None)
+
+             if subjob_descriptors:
+                 metadata_to_store["subjob_descriptors"] = json.dumps(subjob_descriptors)
+
+             await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().hset,
+                 metadata_key,
+                 mapping=metadata_to_store,
+             )
+
+             # Set TTL on both keys to match state TTL
+             if self._state_ttl_seconds:
+                 await self._run_bounded_to_thread(
+                     self._ingest_client.get_client().expire,
+                     parent_key,
+                     self._state_ttl_seconds,
+                 )
+                 await self._run_bounded_to_thread(
+                     self._ingest_client.get_client().expire,
+                     metadata_key,
+                     self._state_ttl_seconds,
+                 )
+
+             logger.debug(f"Stored parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+
+         except Exception as err:
+             logger.exception(f"Error storing parent job mapping for {parent_job_id}: {err}")
+             raise
+
+     async def get_parent_job_info(self, parent_job_id: str) -> Optional[Dict[str, Any]]:
+         """
+         Retrieve parent job information including subjob IDs and metadata.
+
+         Parameters
+         ----------
+         parent_job_id : str
+             The parent job identifier
+
+         Returns
+         -------
+         Dict[str, Any] or None
+             Dictionary with 'subjob_ids' and 'metadata' keys, or None if not a parent job
+         """
+         parent_key = f"parent:{parent_job_id}:subjobs"
+         metadata_key = f"parent:{parent_job_id}:metadata"
+
+         try:
+             # Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
+             exists = await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().exists,
+                 metadata_key,  # Check metadata instead of parent_key for non-split PDF support
+             )
+
+             if not exists:
+                 return None
+
+             # Get subjob IDs (may be empty for non-split PDFs)
+             subjob_ids_bytes = await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().smembers,
+                 parent_key,
+             )
+             subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
+
+             # Get metadata
+             metadata_dict = await self._run_bounded_to_thread(
+                 self._ingest_client.get_client().hgetall,
+                 metadata_key,
+             )
+             metadata = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata_dict.items()}
+
+             # Convert numeric strings back to numbers
+             if "total_pages" in metadata:
+                 metadata["total_pages"] = int(metadata["total_pages"])
+             if "pages_per_chunk" in metadata:
+                 try:
+                     metadata["pages_per_chunk"] = int(metadata["pages_per_chunk"])
+                 except ValueError:
+                     metadata.pop("pages_per_chunk", None)
+
+             ordered_ids: Optional[List[str]] = None
+             stored_order = metadata.pop("subjob_order", None)
+             if stored_order:
+                 try:
+                     candidate_order = json.loads(stored_order)
+                     if isinstance(candidate_order, list):
+                         ordered_ids = [sid for sid in candidate_order if sid in subjob_id_set]
+                 except (ValueError, TypeError) as exc:
+                     logger.warning(
+                         "Failed to parse stored subjob order for parent %s: %s",
+                         parent_job_id,
+                         exc,
+                     )
+
+             if ordered_ids is None:
+                 ordered_ids = sorted(subjob_id_set)
+             else:
+                 remaining_ids = sorted(subjob_id_set - set(ordered_ids))
+                 ordered_ids.extend(remaining_ids)
+
+             subjob_descriptors: Optional[List[Dict[str, Any]]] = None
+             stored_descriptors = metadata.pop("subjob_descriptors", None)
+             if stored_descriptors:
+                 try:
+                     decoded = json.loads(stored_descriptors)
+                     if isinstance(decoded, list):
+                         subjob_descriptors = decoded
+                 except (ValueError, TypeError) as exc:
+                     logger.warning(
+                         "Failed to parse stored subjob descriptors for parent %s: %s",
+                         parent_job_id,
+                         exc,
+                     )
+
+             return {
+                 "subjob_ids": ordered_ids,
+                 "metadata": metadata,
+                 "subjob_descriptors": subjob_descriptors or [],
+             }
+
+         except Exception as err:
+             logger.error(f"Error retrieving parent job info for {parent_job_id}: {err}")
+             return None
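
For reference, the mapping lives in two Redis keys per parent: a set `parent:<id>:subjobs` and a hash `parent:<id>:metadata`, whose `subjob_order` and `subjob_descriptors` fields are JSON-encoded strings. A sketch of that layout using a raw redis-py client; the job IDs and field values here are illustrative only, and a reachable local Redis is assumed:

```python
# Sketch of the parent/subjob key layout written by set_parent_job_mapping().
import json
import redis

r = redis.Redis()  # assumes a local Redis instance
parent_id = "parent-123"            # illustrative parent job ID
subjobs = ["parent-123_0", "parent-123_1"]  # illustrative subjob IDs

r.sadd(f"parent:{parent_id}:subjobs", *subjobs)
r.hset(
    f"parent:{parent_id}:metadata",
    mapping={
        "total_pages": "7",                  # numeric fields stored as strings
        "pages_per_chunk": "4",
        "subjob_order": json.dumps(subjobs), # preserves deterministic fetch order
    },
)

# get_parent_job_info() decodes this back into:
# {"subjob_ids": [...original order...], "metadata": {"total_pages": 7, ...},
#  "subjob_descriptors": [...]}
print(r.smembers(f"parent:{parent_id}:subjobs"))
```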
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0