nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/framework/util/flow_control/udf_intercept.py (new file):

```diff
@@ -0,0 +1,352 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import functools
+import hashlib
+import inspect
+import logging
+import os
+import time
+from typing import Dict, List, Any, Optional, Callable
+from dataclasses import dataclass
+
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
+from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CachedUDF:
+    """Cached UDF function with metadata"""
+
+    function: callable
+    function_name: str
+    signature_validated: bool
+    created_at: float
+    last_used: float
+    use_count: int
+
+
+class UDFCache:
+    """LRU cache for compiled and validated UDF functions"""
+
+    def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
+        self.max_size = max_size
+        self.ttl_seconds = ttl_seconds
+        self.cache: Dict[str, CachedUDF] = {}
+        self.access_order: List[str] = []  # For LRU tracking
+
+    def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
+        """Generate cache key from UDF string and function name"""
+        content = f"{udf_function_str.strip()}:{udf_function_name}"
+        return hashlib.sha256(content.encode()).hexdigest()
+
+    def _evict_lru(self):
+        """Remove least recently used item"""
+        if self.access_order:
+            lru_key = self.access_order.pop(0)
+            self.cache.pop(lru_key, None)
+
+    def _cleanup_expired(self):
+        """Remove expired entries if TTL is configured"""
+        if not self.ttl_seconds:
+            return
+
+        current_time = time.time()
+        expired_keys = [
+            key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
+        ]
+
+        for key in expired_keys:
+            self.cache.pop(key, None)
+            if key in self.access_order:
+                self.access_order.remove(key)
+
+    def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
+        """Get cached UDF function if available"""
+        self._cleanup_expired()
+
+        cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+        if cache_key in self.cache:
+            # Update access tracking
+            if cache_key in self.access_order:
+                self.access_order.remove(cache_key)
+            self.access_order.append(cache_key)
+
+            # Update usage stats
+            cached_udf = self.cache[cache_key]
+            cached_udf.last_used = time.time()
+            cached_udf.use_count += 1
+
+            return cached_udf
+
+        return None
+
+    def put(
+        self, udf_function_str: str, udf_function_name: str, function: callable, signature_validated: bool = True
+    ) -> str:
+        """Cache a compiled and validated UDF function"""
+        cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+        # Evict LRU if at capacity
+        while len(self.cache) >= self.max_size:
+            self._evict_lru()
+
+        current_time = time.time()
+        cached_udf = CachedUDF(
+            function=function,
+            function_name=udf_function_name,
+            signature_validated=signature_validated,
+            created_at=current_time,
+            last_used=current_time,
+            use_count=1,
+        )
+
+        self.cache[cache_key] = cached_udf
+        self.access_order.append(cache_key)
+
+        return cache_key
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics"""
+        total_uses = sum(udf.use_count for udf in self.cache.values())
+        most_used = max(self.cache.values(), key=lambda x: x.use_count, default=None)
+        return {
+            "size": len(self.cache),
+            "max_size": self.max_size,
+            "total_uses": total_uses,
+            "most_used_function": most_used.function_name if most_used else None,
+            "most_used_count": most_used.use_count if most_used else 0,
+        }
+
+
+# Global cache instance
+_udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
```
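Taken on its own, `UDFCache` is a plain dict-plus-access-list LRU with optional TTL expiry, keyed on a SHA-256 of the UDF source and function name. A minimal sketch of its behavior under the reconstruction above; the UDF source string and names are invented for illustration:

```python
# Illustrative only: exercising UDFCache directly. The UDF source below is
# made up for this sketch; it is not shipped code.
udf_source = '''
def my_udf(control_message):
    return control_message
'''

cache = UDFCache(max_size=2, ttl_seconds=60)

namespace = {}
exec(udf_source, namespace)  # same compilation step compile_and_validate_udf (below) uses
cache.put(udf_source, "my_udf", namespace["my_udf"])  # put() seeds use_count=1

hit = cache.get(udf_source, "my_udf")  # LRU-refreshes the entry and bumps use_count to 2
assert hit is not None and hit.use_count == 2

# Two more entries push the cache past max_size=2; the least recently used
# entry ("my_udf", untouched since the get above) is evicted first.
cache.put("def a(m):\n    return m", "a", lambda m: m)
cache.put("def b(m):\n    return m", "b", lambda m: m)
assert cache.get(udf_source, "my_udf") is None
```

The module continues with compilation, validation, and targeted execution: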
```diff
+def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> callable:
+    """Compile and validate UDF function (extracted for caching)"""
+    # Execute the UDF function string in a controlled namespace
+    namespace: Dict[str, Any] = {}
+    try:
+        exec(udf_function_str, namespace)
+    except Exception as e:
+        raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}")
+
+    # Extract the specified function from the namespace
+    if udf_function_name in namespace and callable(namespace[udf_function_name]):
+        udf_function = namespace[udf_function_name]
+    else:
+        raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")
+
+    # Validate the UDF function signature
+    try:
+        ingest_callable_signature(inspect.signature(udf_function))
+    except Exception as e:
+        raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}")
+
+    return udf_function
+
+
+def execute_targeted_udfs(
+    control_message: IngestControlMessage, stage_name: str, directive: str
+) -> IngestControlMessage:
+    """Execute UDFs that target this stage with the given directive."""
+    # Early exit if no UDF tasks exist - check by task type, not task ID
+    udf_tasks_exist = any(task.type == "udf" for task in control_message.get_tasks())
+    if not udf_tasks_exist:
+        return control_message
+
+    # Remove all UDF tasks and get them - handle case where no tasks found
+    try:
+        all_udf_tasks = remove_all_tasks_by_type(control_message, "udf")
+    except ValueError:
+        # No UDF tasks found - this can happen due to race conditions
+        logger.debug(f"No UDF tasks found for stage '{stage_name}' directive '{directive}'")
+        return control_message
+
+    # Execute applicable UDFs and collect remaining ones
+    remaining_tasks = []
+
+    for task_properties in all_udf_tasks:
+        # Check if this UDF targets this stage with the specified directive
+        target_stage = task_properties.get("target_stage", "")
+        run_before = task_properties.get("run_before", False)
+        run_after = task_properties.get("run_after", False)
+
+        # Determine if this UDF should execute
+        should_execute = False
+        if directive == "run_before" and run_before and target_stage == stage_name:
+            should_execute = True
+        elif directive == "run_after" and run_after and target_stage == stage_name:
+            should_execute = True
+
+        if should_execute:
+            try:
+                # Get UDF function details
+                udf_function_str = task_properties.get("udf_function", "").strip()
+                udf_function_name = task_properties.get("udf_function_name", "").strip()
+                task_id = task_properties.get("task_id", "unknown")
+
+                # Skip empty UDF functions
+                if not udf_function_str:
+                    logger.debug(f"UDF task {task_id} has empty function, skipping")
+                    remaining_tasks.append(task_properties)
+                    continue
+
+                # Validate function name
+                if not udf_function_name:
+                    raise ValueError(f"UDF task {task_id} missing required 'udf_function_name' property")
+
+                # Get or compile UDF function
+                cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
+                if cached_udf:
+                    udf_function = cached_udf.function
+                    logger.debug(f"UDF task {task_id}: Using cached function '{udf_function_name}'")
+                else:
+                    udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_id)
+                    _udf_cache.put(udf_function_str, udf_function_name, udf_function)
+                    logger.debug(f"UDF task {task_id}: Cached function '{udf_function_name}'")
+
+                # Execute the UDF
+                control_message = udf_function(control_message)
+
+                # Validate return type
+                if not isinstance(control_message, IngestControlMessage):
+                    raise ValueError(f"UDF task {task_id} must return IngestControlMessage")
+
+                logger.info(f"Executed UDF {task_id} '{udf_function_name}' {directive} stage '{stage_name}'")
+
+            except Exception as e:
+                logger.error(f"UDF {task_id} failed {directive} stage '{stage_name}': {e}")
+                # Keep failed task for next stage
+                remaining_tasks.append(task_properties)
+        else:
+            # Keep non-applicable task for next stage
+            remaining_tasks.append(task_properties)
+
+    # Re-add all remaining UDF tasks
+    for task_properties in remaining_tasks:
+        from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
+
+        task = ControlMessageTask(type="udf", id=task_properties.get("task_id", "unknown"), properties=task_properties)
+        control_message.add_task(task)
+
+    return control_message
+
+
+def remove_task_by_id(control_message: IngestControlMessage, task_id: str) -> IngestControlMessage:
+    """Remove a specific task by ID from the control message"""
+    try:
+        control_message.remove_task(task_id)
+    except RuntimeError as e:
+        logger.warning(f"Could not remove task {task_id}: {e}")
+
+    return control_message
```
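`execute_targeted_udfs` reads a fixed set of task properties. The dict below illustrates the expected shape under that reading; every value is invented:

```python
# Hypothetical UDF task properties, matching the fields execute_targeted_udfs
# reads above: task_id, target_stage, run_before/run_after, udf_function,
# and udf_function_name.
udf_task_properties = {
    "task_id": "udf-0001",                 # used for logging and task re-adding
    "target_stage": "text_splitter",       # must match the stage's resolved name
    "run_before": True,                    # fire before the stage's own logic...
    "run_after": False,                    # ...and/or after it
    "udf_function_name": "redact_emails",  # name looked up in the exec() namespace
    "udf_function": (
        "def redact_emails(control_message):\n"
        "    # mutate the payload here; must return an IngestControlMessage\n"
        "    return control_message\n"
    ),
}
```

Failed or non-matching tasks are appended back onto the message, so a UDF aimed at a later stage survives earlier stages untouched. The decorator that wires interception into each stage follows: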
```diff
+def udf_intercept_hook(stage_name: Optional[str] = None, enable_run_before: bool = True, enable_run_after: bool = True):
+    """
+    Decorator that executes UDFs targeted at this stage.
+
+    This decorator integrates with the existing UDF system, providing full
+    UDF compilation, caching, and execution capabilities. UDFs can target
+    specific stages using run_before or run_after directives.
+
+    Args:
+        stage_name: Name of the stage (e.g., "image_dedup", "text_extract").
+            If None, will attempt to use self.stage_name from the decorated method's instance.
+        enable_run_before: Whether to execute UDFs with run_before=True (default: True)
+        enable_run_after: Whether to execute UDFs with run_after=True (default: True)
+
+    Examples:
+        # Automatic stage name detection (recommended)
+        @traceable("image_deduplication")
+        @udf_intercept_hook()  # Uses self.stage_name automatically
+        @filter_by_task(required_tasks=["dedup"])
+        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+            return control_message
+
+        # Explicit stage name (fallback/override)
+        @traceable("data_sink")
+        @udf_intercept_hook("data_sink", enable_run_after=False)
+        @filter_by_task(required_tasks=["store"])
+        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+            return control_message
+
+        # Only run_after UDFs (e.g., for source stages)
+        @traceable("data_source")
+        @udf_intercept_hook(enable_run_before=False)  # Uses self.stage_name automatically
+        def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+            return control_message
+    """
+
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            # Check if UDF processing is globally disabled
+            if os.getenv("INGEST_DISABLE_UDF_PROCESSING"):
+                logger.debug("UDF processing is disabled via INGEST_DISABLE_UDF_PROCESSING environment variable")
+                return func(*args, **kwargs)
+
+            # Determine the stage name to use
+            resolved_stage_name = stage_name
+
+            # If no explicit stage_name provided, try to get it from self.stage_name
+            if resolved_stage_name is None and len(args) >= 1:
+                stage_instance = args[0]  # 'self' in method calls
+                if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
+                    resolved_stage_name = stage_instance.stage_name
+                    logger.debug(f"Using auto-detected stage name: '{resolved_stage_name}'")
+                else:
+                    logger.warning(
+                        "No stage_name provided and could not auto-detect from instance. Skipping UDF intercept."
+                    )
+                    return func(*args, **kwargs)
+            elif resolved_stage_name is None:
+                logger.warning(
+                    "No stage_name provided and no instance available for auto-detection. Skipping UDF intercept."
+                )
+                return func(*args, **kwargs)
+
+            # Extract control_message from args (handle both self.method and function cases)
+            control_message = None
+            if len(args) >= 2 and hasattr(args[1], "get_tasks"):
+                control_message = args[1]  # self.method case
+                args_list = list(args)
+            elif len(args) >= 1 and hasattr(args[0], "get_tasks"):
+                control_message = args[0]  # function case
+                args_list = list(args)
+
+            if control_message:
+                # Execute UDFs that should run before this stage (if enabled)
+                if enable_run_before:
+                    control_message = execute_targeted_udfs(control_message, resolved_stage_name, "run_before")
+                    # Update args with modified control_message
+                    if len(args) >= 2 and hasattr(args[1], "get_tasks"):
+                        args_list[1] = control_message
+                    else:
+                        args_list[0] = control_message
+
+                # Execute the original stage logic
+                result = func(*tuple(args_list), **kwargs)
+
+                # Execute UDFs that should run after this stage (if enabled)
+                if enable_run_after and hasattr(result, "get_tasks"):  # Result is control_message
+                    result = execute_targeted_udfs(result, resolved_stage_name, "run_after")
+
+                return result
+            else:
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def get_udf_cache_stats() -> Dict[str, Any]:
+    """Get UDF cache performance statistics"""
+    return _udf_cache.get_stats()
```
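Because the wrapper probes `args[0]` and `args[1]` for `get_tasks`, it also works on free functions, not just stage methods; the stage name must then be explicit. A hypothetical sketch:

```python
# Hypothetical: applying the hook to a free function. With no self.stage_name
# to auto-detect, the stage name is passed explicitly.
@udf_intercept_hook("text_splitter")
def process(control_message: IngestControlMessage) -> IngestControlMessage:
    # ... stage logic ...
    return control_message

# The environment kill switch bypasses all UDF interception:
#   export INGEST_DISABLE_UDF_PROCESSING=1
```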
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py:

```diff
@@ -7,9 +7,7 @@ import json
 import logging
 import os
 from json import JSONDecodeError
-from typing import Optional, Dict, Any
-
-from typing import List
+from typing import Optional, Dict, Any, List
 
 import redis
 
@@ -133,6 +131,8 @@ class RedisIngestService(IngestServiceMeta):
         self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
         self._cache_prefix: str = "processing_cache:"
         self._state_prefix: str = "job_state:"
+        # Bound async-to-thread concurrency slightly below Redis connection pool
+        self._async_operation_semaphore: Optional[asyncio.Semaphore] = None
 
         self._ingest_client = RedisClient(
             host=self._redis_hostname,
@@ -151,6 +151,16 @@ class RedisIngestService(IngestServiceMeta):
             f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
         )
 
+    def _get_async_semaphore(self) -> asyncio.Semaphore:
+        if self._async_operation_semaphore is None:
+            semaphore_limit = max(1, self._concurrency_level - 2)
+            self._async_operation_semaphore = asyncio.Semaphore(semaphore_limit)
+        return self._async_operation_semaphore
+
+    async def _run_bounded_to_thread(self, func, *args, **kwargs):
+        async with self._get_async_semaphore():
+            return await asyncio.to_thread(func, *args, **kwargs)
+
     async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
         """
         Validates, prepares, and submits a job specification to the Redis task queue.
```
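`_run_bounded_to_thread` is the load-bearing change here: every blocking Redis call now funnels through a lazily created semaphore sized just under the connection pool, so `asyncio.to_thread` cannot exhaust pool connections or default-executor threads. A self-contained sketch of the same pattern, with generic names in place of the service's:

```python
import asyncio
import time
from typing import Optional


# Generic sketch of the bounded asyncio.to_thread pattern used above.
class BoundedToThread:
    def __init__(self, limit: int):
        self._limit = max(1, limit)
        self._semaphore: Optional[asyncio.Semaphore] = None

    def _get_semaphore(self) -> asyncio.Semaphore:
        # Created lazily, like _get_async_semaphore, so the semaphore
        # binds to the running event loop on first use.
        if self._semaphore is None:
            self._semaphore = asyncio.Semaphore(self._limit)
        return self._semaphore

    async def run(self, func, *args, **kwargs):
        # Gate entry before handing the blocking call to a worker thread.
        async with self._get_semaphore():
            return await asyncio.to_thread(func, *args, **kwargs)


async def main():
    runner = BoundedToThread(limit=2)
    # Ten blocking calls, but never more than two threads occupied at once.
    await asyncio.gather(*(runner.run(time.sleep, 0.1) for _ in range(10)))


asyncio.run(main())
```

The remaining hunks route submissions through this helper and add QoS-aware queue selection: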
```diff
@@ -208,12 +218,33 @@ class RedisIngestService(IngestServiceMeta):
         ttl_for_result: Optional[int] = (
             self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
         )
+        # Determine target queue based on optional QoS hint
+        queue_hint = None
+        try:
+            routing_opts = job_spec.get("routing_options") or {}
+            tracing_opts = job_spec.get("tracing_options") or {}
+            queue_hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
+        except Exception:
+            queue_hint = None
+        allowed = {"default", "immediate", "micro", "small", "medium", "large"}
+        if isinstance(queue_hint, str) and queue_hint in allowed:
+            if queue_hint == "default":
+                channel_name = self._redis_task_queue
+            else:
+                channel_name = f"{self._redis_task_queue}_{queue_hint}"
+        else:
+            channel_name = self._redis_task_queue
+        logger.debug(
+            f"Submitting job {trace_id} to queue '{channel_name}' (hint={queue_hint}) "
+            f"with result TTL: {ttl_for_result}"
+        )
+
         logger.debug(
             f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
         )
-        await asyncio.to_thread(
+        await self._run_bounded_to_thread(
             self._ingest_client.submit_message,
-            channel_name=self._redis_task_queue,
+            channel_name=channel_name,
             message=job_spec_json,
             ttl_seconds=ttl_for_result,
         )
@@ -252,7 +283,7 @@ class RedisIngestService(IngestServiceMeta):
         try:
             result_channel: str = f"{job_id}"
             logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
-            message = await asyncio.to_thread(
+            message = await self._run_bounded_to_thread(
                 self._ingest_client.fetch_message,
                 channel_name=result_channel,
                 timeout=10,
@@ -264,7 +295,7 @@ class RedisIngestService(IngestServiceMeta):
                 logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                 raise TimeoutError("No data found (unexpected None response).")
         except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
-            logger.
+            logger.debug(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
             raise e
         except Exception as e:
             logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
@@ -289,7 +320,7 @@ class RedisIngestService(IngestServiceMeta):
         ttl_to_set: Optional[int] = self._state_ttl_seconds
         try:
             logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
-            await asyncio.to_thread(
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 state_key,
                 state,
@@ -317,7 +348,10 @@ class RedisIngestService(IngestServiceMeta):
         """
         state_key: str = f"{self._state_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                state_key,
+            )
             if data_bytes:
                 state: str = data_bytes.decode("utf-8")
                 logger.debug(f"Retrieved state for {job_id}: {state}")
@@ -350,7 +384,7 @@ class RedisIngestService(IngestServiceMeta):
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
             data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
-            await asyncio.to_thread(
+            await self._run_bounded_to_thread(
                 self._ingest_client.get_client().set,
                 cache_key,
                 data_to_store,
@@ -375,7 +409,10 @@ class RedisIngestService(IngestServiceMeta):
         """
         cache_key: str = f"{self._cache_prefix}{job_id}"
         try:
-            data_bytes: Optional[bytes] = await
+            data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().get,
+                cache_key,
+            )
             if data_bytes is None:
                 return []
             return [ProcessingJob(**job) for job in json.loads(data_bytes)]
```
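On the submit path above, a job may carry an optional QoS `queue_hint` in its `routing_options` (or, as a fallback, `tracing_options`); anything outside the allow-list falls back to the default queue. A worked example of the selection logic, with an assumed base queue name:

```python
# Mirrors the channel-selection logic in submit_job. The base queue name is
# an assumption for illustration; the service reads its own configured queue.
base_queue = "ingest_task_queue"
allowed = {"default", "immediate", "micro", "small", "medium", "large"}


def select_channel(job_spec: dict) -> str:
    routing_opts = job_spec.get("routing_options") or {}
    tracing_opts = job_spec.get("tracing_options") or {}
    hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
    if isinstance(hint, str) and hint in allowed and hint != "default":
        return f"{base_queue}_{hint}"  # e.g. ingest_task_queue_small
    return base_queue


assert select_channel({"routing_options": {"queue_hint": "small"}}) == "ingest_task_queue_small"
assert select_channel({"tracing_options": {"queue_hint": "immediate"}}) == "ingest_task_queue_immediate"
assert select_channel({"routing_options": {"queue_hint": "bogus"}}) == "ingest_task_queue"
assert select_channel({}) == "ingest_task_queue"
```

The final hunk adds the parent/subjob bookkeeping used by V2 PDF splitting: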
```diff
@@ -393,3 +430,170 @@ class RedisIngestService(IngestServiceMeta):
         The current fetch mode.
         """
         return self._fetch_mode
+
+    async def set_parent_job_mapping(
+        self,
+        parent_job_id: str,
+        subjob_ids: List[str],
+        metadata: Dict[str, Any],
+        *,
+        subjob_descriptors: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
+        """
+        Store parent-subjob mapping in Redis for V2 PDF splitting.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+        subjob_ids : List[str]
+            List of subjob identifiers
+        metadata : Dict[str, Any]
+            Metadata about the parent job (total_pages, original_source_id, etc.)
+        subjob_descriptors : List[Dict[str, Any]], optional
+            Detailed descriptors (job_id, chunk_index, start/end pages) for subjobs
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Store subjob IDs as a set (only if there are subjobs)
+            if subjob_ids:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().sadd,
+                    parent_key,
+                    *subjob_ids,
+                )
+
+            # Store metadata as hash (including original subjob ordering for deterministic fetches)
+            metadata_to_store = dict(metadata)
+            try:
+                metadata_to_store["subjob_order"] = json.dumps(subjob_ids)
+            except (TypeError, ValueError):
+                logger.warning(
+                    "Unable to serialize subjob ordering for parent %s; falling back to Redis set ordering",
+                    parent_job_id,
+                )
+                metadata_to_store.pop("subjob_order", None)
+
+            if subjob_descriptors:
+                metadata_to_store["subjob_descriptors"] = json.dumps(subjob_descriptors)
+
+            await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hset,
+                metadata_key,
+                mapping=metadata_to_store,
+            )
+
+            # Set TTL on both keys to match state TTL
+            if self._state_ttl_seconds:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    parent_key,
+                    self._state_ttl_seconds,
+                )
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().expire,
+                    metadata_key,
+                    self._state_ttl_seconds,
+                )
+
+            logger.debug(f"Stored parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+
+        except Exception as err:
+            logger.exception(f"Error storing parent job mapping for {parent_job_id}: {err}")
+            raise
+
+    async def get_parent_job_info(self, parent_job_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve parent job information including subjob IDs and metadata.
+
+        Parameters
+        ----------
+        parent_job_id : str
+            The parent job identifier
+
+        Returns
+        -------
+        Dict[str, Any] or None
+            Dictionary with 'subjob_ids' and 'metadata' keys, or None if not a parent job
+        """
+        parent_key = f"parent:{parent_job_id}:subjobs"
+        metadata_key = f"parent:{parent_job_id}:metadata"
+
+        try:
+            # Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
+            exists = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().exists,
+                metadata_key,  # Check metadata instead of parent_key for non-split PDF support
+            )
+
+            if not exists:
+                return None
+
+            # Get subjob IDs (may be empty for non-split PDFs)
+            subjob_ids_bytes = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().smembers,
+                parent_key,
+            )
+            subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
+
+            # Get metadata
+            metadata_dict = await self._run_bounded_to_thread(
+                self._ingest_client.get_client().hgetall,
+                metadata_key,
+            )
+            metadata = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata_dict.items()}
+
+            # Convert numeric strings back to numbers
+            if "total_pages" in metadata:
+                metadata["total_pages"] = int(metadata["total_pages"])
+            if "pages_per_chunk" in metadata:
+                try:
+                    metadata["pages_per_chunk"] = int(metadata["pages_per_chunk"])
+                except ValueError:
+                    metadata.pop("pages_per_chunk", None)
+
+            ordered_ids: Optional[List[str]] = None
+            stored_order = metadata.pop("subjob_order", None)
+            if stored_order:
+                try:
+                    candidate_order = json.loads(stored_order)
+                    if isinstance(candidate_order, list):
+                        ordered_ids = [sid for sid in candidate_order if sid in subjob_id_set]
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob order for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            if ordered_ids is None:
+                ordered_ids = sorted(subjob_id_set)
+            else:
+                remaining_ids = sorted(subjob_id_set - set(ordered_ids))
+                ordered_ids.extend(remaining_ids)
+
+            subjob_descriptors: Optional[List[Dict[str, Any]]] = None
+            stored_descriptors = metadata.pop("subjob_descriptors", None)
+            if stored_descriptors:
+                try:
+                    decoded = json.loads(stored_descriptors)
+                    if isinstance(decoded, list):
+                        subjob_descriptors = decoded
+                except (ValueError, TypeError) as exc:
+                    logger.warning(
+                        "Failed to parse stored subjob descriptors for parent %s: %s",
+                        parent_job_id,
+                        exc,
+                    )
+
+            return {
+                "subjob_ids": ordered_ids,
+                "metadata": metadata,
+                "subjob_descriptors": subjob_descriptors or [],
+            }
+
+        except Exception as err:
+            logger.error(f"Error retrieving parent job info for {parent_job_id}: {err}")
+            return None
```
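The parent/subjob bookkeeping stores two keys per parent job, both expiring with the state TTL. The layout below shows a hypothetical three-chunk split (all IDs and values invented), matching the fields `set_parent_job_mapping` writes and `get_parent_job_info` reads back:

```python
# Hypothetical Redis layout after set_parent_job_mapping("job-123", ...):
#
#   parent:job-123:subjobs   SET   {"job-123-0", "job-123-1", "job-123-2"}
#   parent:job-123:metadata  HASH  {
#       "total_pages": "30",          # hashes store strings; re-int'd on read
#       "pages_per_chunk": "10",
#       "original_source_id": "report.pdf",
#       "subjob_order": '["job-123-0", "job-123-1", "job-123-2"]',
#       "subjob_descriptors": '[{"job_id": "job-123-0", "chunk_index": 0, ...}]',
#   }
#
# get_parent_job_info("job-123") then reconstructs:
#   {
#       "subjob_ids": ["job-123-0", "job-123-1", "job-123-2"],  # original order
#       "metadata": {"total_pages": 30, "pages_per_chunk": 10, ...},
#       "subjob_descriptors": [...],
#   }
# with any set members missing from subjob_order appended in sorted order.
```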