dory-processor-sdk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +101 -0
- dory/auth/__init__.py +10 -0
- dory/auth/oauth2.py +153 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +137 -0
- dory/cli/templates.py +123 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +24 -0
- dory/config/loader.py +430 -0
- dory/config/presets.py +73 -0
- dory/config/schema.py +84 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +434 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +564 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +644 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +419 -0
- dory/errors/__init__.py +139 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +498 -0
- dory/geo/__init__.py +40 -0
- dory/geo/geolocalizer.py +1034 -0
- dory/health/__init__.py +12 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +635 -0
- dory/k8s/__init__.py +80 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/labels.py +505 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +148 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +46 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +325 -0
- dory/middleware/request_tracker.py +511 -0
- dory/migration/__init__.py +33 -0
- dory/migration/configmap.py +232 -0
- dory/migration/s3_store.py +594 -0
- dory/migration/serialization.py +135 -0
- dory/migration/state_manager.py +286 -0
- dory/migration/transfer.py +382 -0
- dory/monitoring/__init__.py +29 -0
- dory/monitoring/opentelemetry.py +489 -0
- dory/output/__init__.py +31 -0
- dory/output/envelope.py +137 -0
- dory/output/formatter.py +113 -0
- dory/output/rabbitmq.py +632 -0
- dory/output/routing.py +318 -0
- dory/output/validator.py +199 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +487 -0
- dory/recovery/golden_snapshot.py +713 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +482 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +183 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +457 -0
- dory/resilience/retry.py +389 -0
- dory/simple.py +342 -0
- dory/types.py +68 -0
- dory/utils/__init__.py +31 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_processor_sdk-0.0.1.dist-info/METADATA +424 -0
- dory_processor_sdk-0.0.1.dist-info/RECORD +86 -0
- dory_processor_sdk-0.0.1.dist-info/WHEEL +5 -0
- dory_processor_sdk-0.0.1.dist-info/entry_points.txt +2 -0
- dory_processor_sdk-0.0.1.dist-info/licenses/LICENSE +201 -0
- dory_processor_sdk-0.0.1.dist-info/top_level.txt +1 -0
dory/core/app.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DoryApp - Main entry point for processor applications.
|
|
3
|
+
|
|
4
|
+
Orchestrates the entire processor lifecycle including:
|
|
5
|
+
- Configuration loading
|
|
6
|
+
- Health server startup
|
|
7
|
+
- Signal handling
|
|
8
|
+
- State restoration
|
|
9
|
+
- Processor lifecycle management
|
|
10
|
+
- Graceful shutdown
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
import sys
|
|
16
|
+
from typing import Type
|
|
17
|
+
|
|
18
|
+
from dory.core.processor import BaseProcessor
|
|
19
|
+
from dory.core.context import ExecutionContext
|
|
20
|
+
from dory.core.lifecycle import LifecycleManager
|
|
21
|
+
from dory.core.signals import SignalHandler
|
|
22
|
+
from dory.config.loader import ConfigLoader
|
|
23
|
+
from dory.config.schema import DoryConfig
|
|
24
|
+
from dory.health.server import HealthServer
|
|
25
|
+
from dory.migration.state_manager import StateManager
|
|
26
|
+
from dory.recovery.recovery_decision import RecoveryDecisionMaker
|
|
27
|
+
from dory.recovery.restart_detector import RestartDetector
|
|
28
|
+
from dory.logging.logger import setup_logging
|
|
29
|
+
from dory.metrics.collector import MetricsCollector
|
|
30
|
+
from dory.utils.errors import DoryStartupError, DoryStateError
|
|
31
|
+
|
|
32
|
+
# Module-level logger named after this module; every DoryApp method logs through it.
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DoryApp:
    """
    Main entry point for Dory processor applications.

    Usage:
        from dory import DoryApp, BaseProcessor

        class MyProcessor(BaseProcessor):
            ...

        if __name__ == '__main__':
            DoryApp().run(MyProcessor)
    """

    # Distribution name used to look up the installed SDK version for logging.
    _DISTRIBUTION_NAME = "dory-processor-sdk"

    def __init__(
        self,
        config_file: str | None = None,
        log_level: str | None = None,
    ):
        """
        Initialize DoryApp.

        Only stores the arguments; every component is created later by
        _initialize() / _start_health_server().

        Args:
            config_file: Optional path to YAML config file
            log_level: Optional log level override
        """
        self._config_file = config_file
        self._log_level_override = log_level

        # Components (initialized in _initialize)
        self._config: DoryConfig | None = None
        self._context: ExecutionContext | None = None
        self._processor: BaseProcessor | None = None
        self._lifecycle: LifecycleManager | None = None
        self._signals: SignalHandler | None = None
        self._health_server: HealthServer | None = None
        self._state_manager: StateManager | None = None
        self._metrics: MetricsCollector | None = None
        self._restart_detector: RestartDetector | None = None
        self._recovery_decision: RecoveryDecisionMaker | None = None

    def run(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Run the processor application.

        This is the main entry point that blocks until shutdown. It exits the
        process with status 0 on KeyboardInterrupt and status 1 on any other
        exception; on a clean finish it simply returns.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        try:
            asyncio.run(self._run_async(processor_class))
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt received")
            sys.exit(0)
        except Exception as e:
            # logger.exception keeps the traceback, which a plain error() drops.
            logger.exception(f"Fatal error: {e}")
            sys.exit(1)

    async def _run_async(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Async implementation of the run loop.

        Runs initialization, health server startup and the processor
        lifecycle in order, and always runs _cleanup() afterwards. Errors are
        logged and re-raised to the caller.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        exit_code = 0

        try:
            # Phase 1: Initialize SDK components
            await self._initialize(processor_class)

            # Phase 2: Start health server
            await self._start_health_server()

            # Phase 3: Run processor lifecycle
            await self._run_processor_lifecycle()

        except DoryStartupError as e:
            logger.error(f"Startup failed: {e}")
            exit_code = 1
            raise

        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            exit_code = 1
            raise

        finally:
            # Phase 4: Cleanup
            await self._cleanup()
            logger.info(f"DoryApp exiting with code {exit_code}")

    @staticmethod
    def _sdk_version() -> str:
        """Return the installed SDK version, or "unknown" if it is not installed."""
        from importlib import metadata

        try:
            return metadata.version(DoryApp._DISTRIBUTION_NAME)
        except metadata.PackageNotFoundError:
            # e.g. running from a source checkout without installed metadata
            return "unknown"

    async def _initialize(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Initialize all SDK components.

        Loads configuration, sets up logging, builds the execution context
        and helper components, instantiates the processor and registers the
        signal callbacks.

        Args:
            processor_class: Class implementing BaseProcessor; it is called
                with the execution context as its only argument
        """
        logger.debug("Initializing DoryApp components")

        # Load configuration
        config_loader = ConfigLoader(config_file=self._config_file)
        self._config = config_loader.load()

        # Apply log level override if provided
        if self._log_level_override:
            self._config.log_level = self._log_level_override

        # Setup logging
        setup_logging(
            level=self._config.log_level,
        )

        # NOTE(review): the full config is attached to this record; confirm it
        # carries no credentials before shipping these logs off-host.
        logger.info("Dory SDK initializing", extra={
            "version": self._sdk_version(),
            "config": self._config.model_dump(),
        })

        # Create execution context from environment
        self._context = ExecutionContext.from_environment()

        # Initialize components
        self._lifecycle = LifecycleManager()
        self._signals = SignalHandler()
        self._state_manager = StateManager(
            backend=self._config.state_backend,
            config=self._config,
        )
        self._metrics = MetricsCollector()
        self._restart_detector = RestartDetector()
        self._recovery_decision = RecoveryDecisionMaker()

        # Detect restart count
        restart_info = await self._restart_detector.detect()
        self._context.set_attempt_number(restart_info.restart_count)

        logger.info(
            f"Execution context: pod={self._context.pod_name}, "
            f"processor_id={self._context.processor_id}, "
            f"attempt={self._context.attempt_number}, "
            f"is_migrating={self._context.is_migrating}"
        )

        # Create processor instance
        self._processor = processor_class(self._context)

        # Setup signal handlers
        self._signals.setup(
            shutdown_callback=self._trigger_shutdown,
            snapshot_callback=self._trigger_snapshot,
        )

        # Record startup metric
        self._metrics.record_startup_started()

    async def _start_health_server(self) -> None:
        """Start the health/metrics HTTP server."""
        self._health_server = HealthServer(
            port=self._config.health_port,
            metrics_collector=self._metrics,
            state_getter=self._get_processor_state,
            state_restorer=self._restore_processor_state,
            prestop_handler=self._handle_prestop,
        )
        await self._health_server.start()
        logger.info(f"Health server started on port {self._config.health_port}")

    def _get_processor_state(self) -> dict:
        """
        Get processor state for /state GET endpoint (state capture).

        Returns:
            The processor state wrapped in the ApplicationState envelope,
            an empty dict when no processor exists yet, or
            {"error": "<message>"} when capturing the state fails.
        """
        if self._processor is None:
            logger.warning("Processor not initialized, returning empty state")
            return {}

        import os
        import time

        try:
            state = self._processor.get_state()

            # One timestamp for both fields so they cannot straddle a second boundary.
            now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

            # Wrap state in ApplicationState format expected by Orchestrator
            return {
                "pod_name": self._context.pod_name if self._context else "unknown",
                "app_name": os.environ.get("APP_NAME", "dory-processor"),
                "captured_at": now,
                "state_version": "1.0",
                "data": state,
                "metrics": {},
                "connections": [],
                "active_sessions": 0,
                "session_data": {},
                "uptime_seconds": self._metrics.get_uptime_seconds() if self._metrics else 0.0,
                "request_count": self._metrics.get_request_count() if self._metrics else 0,
                "last_health_time": now,
            }
        except Exception as e:
            logger.error(f"Failed to get processor state: {e}")
            return {"error": str(e)}

    async def _restore_processor_state(self, state: dict) -> None:
        """
        Restore processor state from /state POST endpoint (state transfer).

        Args:
            state: Either an ApplicationState envelope (processor data under
                the "data" key) or the bare processor state itself.

        Raises:
            RuntimeError: If the processor has not been created yet.
        """
        if self._processor is None:
            raise RuntimeError("Processor not initialized, cannot restore state")

        # Extract processor data from ApplicationState format
        processor_data = state.get("data", state)

        logger.info("Restoring state from transfer", extra={
            "pod_name": state.get("pod_name", "unknown"),
            "state_version": state.get("state_version", "unknown"),
        })

        await self._processor.restore_state(processor_data)

    async def _save_state_snapshot(self, success_message: str, failure_message: str) -> bool:
        """
        Capture the processor state and persist it through the state manager.

        Failures are logged, never raised, so callers on shutdown paths keep
        going.

        Args:
            success_message: Logged at INFO level after a successful save.
            failure_message: Prefix of the ERROR log line on failure.

        Returns:
            True if the state was saved; False if a component is missing or
            the save failed.
        """
        if not (self._processor and self._state_manager and self._context):
            return False

        try:
            state = self._processor.get_state()
            await self._state_manager.save_state(
                processor_id=self._context.processor_id,
                state=state,
            )
            logger.info(success_message)
            return True
        except Exception as e:
            logger.error(f"{failure_message}: {e}")
            return False

    async def _handle_prestop(self) -> None:
        """Handle PreStop hook - prepare for graceful shutdown."""
        logger.info("PreStop hook: initiating graceful shutdown preparation")

        # Signal context that shutdown is coming
        if self._context:
            self._context.request_shutdown()

        # Mark health server as not ready to stop receiving traffic
        if self._health_server:
            self._health_server.mark_not_ready()

        # Save state before pod terminates - this is critical because
        # SIGTERM may arrive after the app has already started exiting
        await self._save_state_snapshot(
            "State snapshot saved during PreStop",
            "Failed to save state during PreStop",
        )

    async def _run_processor_lifecycle(self) -> None:
        """
        Run the complete processor lifecycle.

        Looks for saved state, asks the recovery decision maker for a
        strategy, runs processor startup, connects the optional publisher,
        restores state when there is any, marks the health server ready and
        then runs the main loop.

        Raises:
            DoryStateError: If state restore fails and the processor's
                on_state_restore_failed() hook declines to continue.
        """
        # Check if saved state exists before deciding recovery strategy
        # This is important for detecting pod replacement (new pod, existing state)
        existing_state = None
        state_exists = False
        try:
            existing_state = await self._state_manager.load_state(
                processor_id=self._context.processor_id,
            )
            state_exists = existing_state is not None
            if state_exists:
                logger.info("Existing state found in checkpoint")
        except Exception as e:
            logger.debug(f"No existing state found: {e}")

        # Determine recovery strategy
        strategy = self._recovery_decision.decide(
            restart_count=self._context.attempt_number,
            is_migrating=self._context.is_migrating,
            state_exists=state_exists,
        )

        logger.info(f"Recovery strategy: {strategy.name}")

        # Load state if needed; reuse the probe result instead of reading the
        # backend a second time.
        state = None
        if strategy.should_restore_state:
            try:
                if state_exists:
                    state = existing_state
                else:
                    state = await self._state_manager.load_state(
                        processor_id=self._context.processor_id,
                    )
                if state:
                    logger.info("State loaded from checkpoint")
            except Exception as e:
                logger.warning(f"Failed to load state: {e}")
                state = None

        # Run startup
        await self._lifecycle.run_startup(
            processor=self._processor,
            timeout=self._config.startup_timeout_sec,
        )

        # Connect publisher if available (after startup, before state restore)
        publisher = getattr(self._processor, 'publisher', None)
        if publisher is not None:
            try:
                await publisher.connect()
                logger.info("Publisher connected")
            except Exception as e:
                logger.warning(f"Publisher connection failed (non-fatal): {e}")

        # Restore state if available
        if state:
            try:
                await self._processor.restore_state(state)
                logger.info("State restored successfully")
            except Exception as e:
                logger.error(f"State restore failed: {e}")
                should_continue = await self._processor.on_state_restore_failed(e)
                if not should_continue:
                    raise DoryStateError(
                        "State restore failed and recovery declined", cause=e
                    ) from e

        # Mark as ready
        self._health_server.mark_ready()
        self._metrics.record_startup_completed()
        logger.info("Processor ready")

        # Run main loop
        try:
            await self._lifecycle.run_main_loop(
                processor=self._processor,
                context=self._context,
            )
        except asyncio.CancelledError:
            logger.info("Main loop cancelled")

    async def _trigger_shutdown(self) -> None:
        """
        Trigger graceful shutdown sequence.

        Registered as the shutdown callback of the signal handler. Flags the
        context, marks the health server not ready, runs processor shutdown,
        closes the publisher and finally saves a state snapshot.
        """
        logger.info("Shutdown triggered")

        # Signal context
        if self._context:
            self._context.request_shutdown()

        # Mark health server as not ready
        if self._health_server:
            self._health_server.mark_not_ready()

        # Wait briefly for run() to exit
        await asyncio.sleep(0.5)

        # Run shutdown
        if self._lifecycle and self._processor:
            await self._lifecycle.run_shutdown(
                processor=self._processor,
                timeout=self._config.shutdown_timeout_sec if self._config else 30.0,
            )

        # Close publisher (flush buffer) before state snapshot
        publisher = getattr(self._processor, 'publisher', None)
        if publisher is not None:
            try:
                await publisher.close()
                logger.info("Publisher closed")
            except Exception as e:
                logger.warning(f"Publisher close failed: {e}")

        # Snapshot state
        await self._save_state_snapshot("State snapshot saved", "Failed to save state")

        if self._metrics:
            self._metrics.record_shutdown_completed()

    async def _trigger_snapshot(self) -> None:
        """Trigger state snapshot (SIGUSR1 handler)."""
        logger.info("State snapshot triggered")
        if not (self._processor and self._state_manager and self._context):
            logger.warning("Cannot snapshot: components not initialized")
            return
        await self._save_state_snapshot(
            "State snapshot saved (debug)",
            "Failed to save state snapshot",
        )

    @staticmethod
    def _flush_log_handlers() -> None:
        """Flush every handler that receives this module's log records."""
        current = logger
        while current is not None:
            for handler in current.handlers:
                try:
                    handler.flush()
                except (OSError, ValueError):
                    # A handler whose stream is already closed cannot be
                    # flushed, and a logging failure cannot usefully be logged.
                    pass
            current = current.parent

    async def _cleanup(self) -> None:
        """Cleanup all components."""
        logger.debug("Cleaning up DoryApp components")

        # Remove signal handlers
        if self._signals:
            self._signals.remove_handlers()

        # Stop health server
        if self._health_server:
            await self._health_server.stop()

        # Ensure publisher is closed
        publisher = getattr(getattr(self, '_processor', None), 'publisher', None)
        if publisher is not None:
            try:
                await publisher.close()
            except Exception as e:
                # Best effort: the publisher may already have been closed by
                # the shutdown sequence.
                logger.debug(f"Publisher close during cleanup failed: {e}")

        # Flush metrics
        if self._metrics:
            self._metrics.flush()

        # Flush logs. logging.shutdown() is deliberately not called here: the
        # caller still logs after cleanup, the host process may keep running
        # after run() returns, and the logging module already shuts itself
        # down at interpreter exit.
        self._flush_log_handlers()
|
dory/core/context.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ExecutionContext - Runtime context passed to processors.
|
|
3
|
+
|
|
4
|
+
Contains pod metadata, events, and utility methods. The context is
|
|
5
|
+
created by DoryApp and passed to the processor constructor.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class ExecutionContext:
|
|
17
|
+
"""
|
|
18
|
+
Execution context containing pod metadata and utilities.
|
|
19
|
+
|
|
20
|
+
Attributes:
|
|
21
|
+
pod_name: Kubernetes pod name (e.g., "camera-processor-1")
|
|
22
|
+
pod_namespace: Kubernetes namespace (e.g., "default")
|
|
23
|
+
processor_id: Unique processor ID from Dory DB
|
|
24
|
+
attempt_number: Pod restart count (1, 2, 3...)
|
|
25
|
+
is_migrating: True if this is a restart due to migration
|
|
26
|
+
previous_pod_name: Name of pod we're migrating from (if applicable)
|
|
27
|
+
shutdown_requested: Event that fires when SIGTERM received
|
|
28
|
+
migration_imminent: Event that fires 10s before forced shutdown
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Pod metadata (read from K8s/env)
|
|
32
|
+
pod_name: str
|
|
33
|
+
pod_namespace: str
|
|
34
|
+
processor_id: str
|
|
35
|
+
attempt_number: int = 1
|
|
36
|
+
is_migrating: bool = False
|
|
37
|
+
previous_pod_name: str | None = None
|
|
38
|
+
|
|
39
|
+
# Async events for coordination
|
|
40
|
+
shutdown_requested: asyncio.Event = field(default_factory=asyncio.Event)
|
|
41
|
+
migration_imminent: asyncio.Event = field(default_factory=asyncio.Event)
|
|
42
|
+
|
|
43
|
+
# Internal config cache
|
|
44
|
+
_config: dict[str, Any] = field(default_factory=dict)
|
|
45
|
+
_logger: logging.Logger | None = field(default=None, repr=False)
|
|
46
|
+
|
|
47
|
+
def is_shutdown_requested(self) -> bool:
|
|
48
|
+
"""
|
|
49
|
+
Check if graceful shutdown is in progress.
|
|
50
|
+
|
|
51
|
+
Processors should poll this in their run() loop to exit gracefully.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
True if SIGTERM received and shutdown initiated
|
|
55
|
+
"""
|
|
56
|
+
return self.shutdown_requested.is_set()
|
|
57
|
+
|
|
58
|
+
def is_migration_imminent(self) -> bool:
|
|
59
|
+
"""
|
|
60
|
+
Check if migration is about to happen.
|
|
61
|
+
|
|
62
|
+
If True, processor should finish in-flight operations quickly.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
True if migration scheduled within next 10s
|
|
66
|
+
"""
|
|
67
|
+
return self.migration_imminent.is_set()
|
|
68
|
+
|
|
69
|
+
def config(self) -> dict[str, Any]:
|
|
70
|
+
"""
|
|
71
|
+
Get application configuration from environment/ConfigMap.
|
|
72
|
+
|
|
73
|
+
Only returns app-specific config (CAMERA_FEED_URL, MODEL_PATH, etc.),
|
|
74
|
+
not SDK internals (DORY_* vars are filtered out).
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
Dict with app configuration
|
|
78
|
+
"""
|
|
79
|
+
return self._config
|
|
80
|
+
|
|
81
|
+
def logger(self) -> logging.Logger:
|
|
82
|
+
"""
|
|
83
|
+
Get pre-configured logger with pod context.
|
|
84
|
+
|
|
85
|
+
Logger automatically includes pod_name, processor_id, namespace
|
|
86
|
+
in all log messages.
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
Logger configured with pod context
|
|
90
|
+
"""
|
|
91
|
+
if self._logger is None:
|
|
92
|
+
self._logger = logging.getLogger(f"dory.processor.{self.processor_id}")
|
|
93
|
+
return self._logger
|
|
94
|
+
|
|
95
|
+
def get_env(self, key: str, default: str | None = None) -> str | None:
|
|
96
|
+
"""
|
|
97
|
+
Get environment variable value.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
key: Environment variable name
|
|
101
|
+
default: Default value if not set
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
Environment variable value or default
|
|
105
|
+
"""
|
|
106
|
+
return os.environ.get(key, default)
|
|
107
|
+
|
|
108
|
+
@classmethod
|
|
109
|
+
def from_environment(cls) -> "ExecutionContext":
|
|
110
|
+
"""
|
|
111
|
+
Create ExecutionContext from environment variables.
|
|
112
|
+
|
|
113
|
+
Reads DORY_* environment variables set by Dory orchestrator.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
ExecutionContext populated from environment
|
|
117
|
+
"""
|
|
118
|
+
# Read Dory system env vars
|
|
119
|
+
pod_name = os.environ.get("DORY_POD_NAME", os.environ.get("POD_NAME", "unknown"))
|
|
120
|
+
pod_namespace = os.environ.get(
|
|
121
|
+
"DORY_POD_NAMESPACE", os.environ.get("POD_NAMESPACE", "default")
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# Get processor_id from env or derive from pod name
|
|
125
|
+
processor_id = os.environ.get("DORY_PROCESSOR_ID", os.environ.get("PROCESSOR_ID"))
|
|
126
|
+
if not processor_id:
|
|
127
|
+
# Derive from pod name (e.g., "myapp-7f8d9c6b-x4h2j" -> "myapp")
|
|
128
|
+
processor_id = cls._derive_processor_id_from_pod_name(pod_name)
|
|
129
|
+
|
|
130
|
+
is_migrating = os.environ.get("DORY_IS_MIGRATING", "false").lower() == "true"
|
|
131
|
+
previous_pod = os.environ.get("DORY_MIGRATED_FROM")
|
|
132
|
+
|
|
133
|
+
# Parse restart count (will be updated from K8s later)
|
|
134
|
+
attempt_number = 1
|
|
135
|
+
|
|
136
|
+
# Load app config (non-DORY_ env vars)
|
|
137
|
+
app_config = {}
|
|
138
|
+
for key, value in os.environ.items():
|
|
139
|
+
if not key.startswith("DORY_") and not key.startswith("KUBERNETES_"):
|
|
140
|
+
app_config[key] = value
|
|
141
|
+
|
|
142
|
+
return cls(
|
|
143
|
+
pod_name=pod_name,
|
|
144
|
+
pod_namespace=pod_namespace,
|
|
145
|
+
processor_id=processor_id,
|
|
146
|
+
attempt_number=attempt_number,
|
|
147
|
+
is_migrating=is_migrating,
|
|
148
|
+
previous_pod_name=previous_pod,
|
|
149
|
+
_config=app_config,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
@staticmethod
|
|
153
|
+
def _derive_processor_id_from_pod_name(pod_name: str) -> str:
|
|
154
|
+
"""
|
|
155
|
+
Derive processor ID from Kubernetes pod name.
|
|
156
|
+
|
|
157
|
+
Pod names typically follow the pattern:
|
|
158
|
+
- Deployment: <deployment>-<replicaset-hash>-<pod-hash> (e.g., "myapp-7f8d9c6b-x4h2j")
|
|
159
|
+
- StatefulSet: <statefulset>-<ordinal> (e.g., "myapp-0")
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
pod_name: Kubernetes pod name
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
Derived processor ID or "unknown" if cannot be derived
|
|
166
|
+
"""
|
|
167
|
+
if not pod_name or pod_name == "unknown":
|
|
168
|
+
return "unknown"
|
|
169
|
+
|
|
170
|
+
parts = pod_name.split("-")
|
|
171
|
+
if len(parts) >= 3:
|
|
172
|
+
# Deployment format: name-replicaset-pod
|
|
173
|
+
# Try to find where the hash parts start (typically 8+ chars of alphanumeric)
|
|
174
|
+
for i in range(len(parts) - 1, 0, -1):
|
|
175
|
+
part = parts[i]
|
|
176
|
+
# If this looks like a hash (short alphanumeric), skip it
|
|
177
|
+
if len(part) <= 10 and part.isalnum():
|
|
178
|
+
continue
|
|
179
|
+
# Otherwise, include up to this point
|
|
180
|
+
return "-".join(parts[: i + 1])
|
|
181
|
+
# If all parts look like hashes, take the first part
|
|
182
|
+
return parts[0]
|
|
183
|
+
elif len(parts) == 2:
|
|
184
|
+
# StatefulSet format: name-ordinal or simple name-hash
|
|
185
|
+
if parts[1].isdigit():
|
|
186
|
+
return parts[0] # StatefulSet
|
|
187
|
+
return parts[0] # Simple deployment
|
|
188
|
+
else:
|
|
189
|
+
return pod_name
|
|
190
|
+
|
|
191
|
+
def request_shutdown(self) -> None:
|
|
192
|
+
"""Signal that shutdown has been requested."""
|
|
193
|
+
self.shutdown_requested.set()
|
|
194
|
+
|
|
195
|
+
def signal_migration(self) -> None:
|
|
196
|
+
"""Signal that migration will happen soon."""
|
|
197
|
+
self.migration_imminent.set()
|
|
198
|
+
|
|
199
|
+
def signal_migration_imminent(self) -> None:
|
|
200
|
+
"""Signal that migration will happen soon (alias for signal_migration)."""
|
|
201
|
+
self.migration_imminent.set()
|
|
202
|
+
|
|
203
|
+
def update_config(self, config: dict[str, Any]) -> None:
|
|
204
|
+
"""Update app configuration (internal use)."""
|
|
205
|
+
self._config.update(config)
|
|
206
|
+
|
|
207
|
+
def set_attempt_number(self, attempt: int) -> None:
|
|
208
|
+
"""Set restart attempt number (internal use)."""
|
|
209
|
+
self.attempt_number = attempt
|