dory_sdk-2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +70 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +290 -0
- dory/cli/templates.py +333 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +50 -0
- dory/config/loader.py +361 -0
- dory/config/presets.py +325 -0
- dory/config/schema.py +152 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +404 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +654 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/errors/__init__.py +117 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +495 -0
- dory/health/__init__.py +10 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +306 -0
- dory/k8s/__init__.py +11 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +175 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +36 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +321 -0
- dory/middleware/request_tracker.py +501 -0
- dory/migration/__init__.py +11 -0
- dory/migration/configmap.py +260 -0
- dory/migration/serialization.py +167 -0
- dory/migration/state_manager.py +301 -0
- dory/monitoring/__init__.py +23 -0
- dory/monitoring/opentelemetry.py +462 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +480 -0
- dory/recovery/golden_snapshot.py +561 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +479 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +187 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +454 -0
- dory/resilience/retry.py +389 -0
- dory/sidecar/__init__.py +6 -0
- dory/sidecar/main.py +75 -0
- dory/sidecar/server.py +329 -0
- dory/simple.py +342 -0
- dory/types.py +75 -0
- dory/utils/__init__.py +25 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_sdk-2.1.0.dist-info/METADATA +663 -0
- dory_sdk-2.1.0.dist-info/RECORD +69 -0
- dory_sdk-2.1.0.dist-info/WHEEL +5 -0
- dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
- dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/config/schema.py
ADDED
@@ -0,0 +1,152 @@

"""
Configuration schema for Dory SDK.

Uses Pydantic for validation and type coercion.
"""

from typing import Optional
from pydantic import BaseModel, Field, field_validator

from dory.types import StateBackend, LogFormat
from dory.config.defaults import DEFAULT_CONFIG


class DoryConfig(BaseModel):
    """
    Dory SDK configuration schema.

    All configuration can be set via:
    1. YAML config file
    2. Environment variables (DORY_ prefix)
    3. Constructor arguments
    """

    # Lifecycle timeouts
    startup_timeout_sec: int = Field(
        default=DEFAULT_CONFIG["startup_timeout_sec"],
        ge=1,
        le=300,
        description="Maximum time for startup in seconds",
    )
    shutdown_timeout_sec: int = Field(
        default=DEFAULT_CONFIG["shutdown_timeout_sec"],
        ge=1,
        le=300,
        description="Maximum time for shutdown in seconds",
    )
    health_check_interval_sec: int = Field(
        default=DEFAULT_CONFIG["health_check_interval_sec"],
        ge=1,
        le=60,
        description="Interval between health checks",
    )

    # Health server
    health_port: int = Field(
        default=DEFAULT_CONFIG["health_port"],
        ge=1,
        le=65535,
        description="Port for health/metrics HTTP server",
    )
    health_path: str = Field(
        default=DEFAULT_CONFIG["health_path"],
        description="Path for liveness probe",
    )
    ready_path: str = Field(
        default=DEFAULT_CONFIG["ready_path"],
        description="Path for readiness probe",
    )
    metrics_path: str = Field(
        default=DEFAULT_CONFIG["metrics_path"],
        description="Path for Prometheus metrics",
    )

    # State management
    state_backend: str = Field(
        default=DEFAULT_CONFIG["state_backend"],
        description="Backend for state persistence",
    )
    state_pvc_mount: str = Field(
        default=DEFAULT_CONFIG["state_pvc_mount"],
        description="Mount path for PVC state backend",
    )
    state_s3_bucket: Optional[str] = Field(
        default=DEFAULT_CONFIG["state_s3_bucket"],
        description="S3 bucket for state (if using S3 backend)",
    )
    state_s3_prefix: str = Field(
        default=DEFAULT_CONFIG["state_s3_prefix"],
        description="S3 key prefix for state objects",
    )

    # Recovery
    max_restart_attempts: int = Field(
        default=DEFAULT_CONFIG["max_restart_attempts"],
        ge=1,
        le=10,
        description="Max restarts before golden image reset",
    )
    restart_backoff_sec: int = Field(
        default=DEFAULT_CONFIG["restart_backoff_sec"],
        ge=0,
        le=300,
        description="Backoff delay between restarts",
    )
    golden_image_threshold: int = Field(
        default=DEFAULT_CONFIG["golden_image_threshold"],
        ge=1,
        le=10,
        description="Restart count triggering golden image reset",
    )

    # Logging
    log_level: str = Field(
        default=DEFAULT_CONFIG["log_level"],
        description="Logging level (DEBUG, INFO, WARNING, ERROR)",
    )
    log_format: str = Field(
        default=DEFAULT_CONFIG["log_format"],
        description="Log format (json or text)",
    )

    # Metrics
    metrics_enabled: bool = Field(
        default=DEFAULT_CONFIG["metrics_enabled"],
        description="Enable Prometheus metrics",
    )
    metrics_prefix: str = Field(
        default=DEFAULT_CONFIG["metrics_prefix"],
        description="Prefix for metric names",
    )

    @field_validator("state_backend")
    @classmethod
    def validate_state_backend(cls, v: str) -> str:
        """Validate state backend value."""
        valid_backends = [b.value for b in StateBackend]
        if v not in valid_backends:
            raise ValueError(f"state_backend must be one of {valid_backends}")
        return v

    @field_validator("log_level")
    @classmethod
    def validate_log_level(cls, v: str) -> str:
        """Validate log level."""
        valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        v_upper = v.upper()
        if v_upper not in valid_levels:
            raise ValueError(f"log_level must be one of {valid_levels}")
        return v_upper

    @field_validator("log_format")
    @classmethod
    def validate_log_format(cls, v: str) -> str:
        """Validate log format."""
        valid_formats = [f.value for f in LogFormat]
        if v not in valid_formats:
            raise ValueError(f"log_format must be one of {valid_formats}")
        return v

    model_config = {
        "extra": "ignore",  # Ignore unknown fields
    }
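Editor's note: a minimal sketch of how this schema behaves on its own, assuming the defaults in dory/config/defaults.py are themselves valid (that module and the StateBackend/LogFormat enums in dory/types.py are not shown in this diff):

    from dory.config.schema import DoryConfig

    # Defaults come from DEFAULT_CONFIG; explicit overrides are validated and coerced.
    cfg = DoryConfig(health_port=9090, log_level="debug")
    assert cfg.log_level == "DEBUG"        # validate_log_level upper-cases the value
    assert 1 <= cfg.health_port <= 65535   # Field(ge=..., le=...) enforces the range

    # Invalid values raise pydantic.ValidationError, for example:
    #   DoryConfig(startup_timeout_sec=0)              # violates ge=1
    #   DoryConfig(state_backend="not-a-real-backend")  # rejected by validate_state_backend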
dory/core/__init__.py
ADDED
@@ -0,0 +1,27 @@

"""Core modules for Dory SDK."""

from dory.core.processor import BaseProcessor
from dory.core.context import ExecutionContext
from dory.core.app import DoryApp
from dory.core.lifecycle import LifecycleManager
from dory.core.signals import SignalHandler
from dory.core.modes import (
    ModeManager,
    ProcessingMode,
    ModeTransition,
    ModeTransitionReason,
    ModeConfig,
)

__all__ = [
    "BaseProcessor",
    "ExecutionContext",
    "DoryApp",
    "LifecycleManager",
    "SignalHandler",
    "ModeManager",
    "ProcessingMode",
    "ModeTransition",
    "ModeTransitionReason",
    "ModeConfig",
]
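Editor's note: these re-exports define the public surface of the dory.core subpackage, so consumers can pull the main entry points from one place, e.g.:

    from dory.core import DoryApp, BaseProcessor, ExecutionContext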
dory/core/app.py
ADDED
@@ -0,0 +1,404 @@

"""
DoryApp - Main entry point for processor applications.

Orchestrates the entire processor lifecycle including:
- Configuration loading
- Health server startup
- Signal handling
- State restoration
- Processor lifecycle management
- Graceful shutdown
"""

import asyncio
import logging
import sys
from typing import Type

from dory.core.processor import BaseProcessor
from dory.core.context import ExecutionContext
from dory.core.lifecycle import LifecycleManager
from dory.core.signals import SignalHandler
from dory.config.loader import ConfigLoader
from dory.config.schema import DoryConfig
from dory.health.server import HealthServer
from dory.migration.state_manager import StateManager
from dory.recovery.recovery_decision import RecoveryDecisionMaker
from dory.recovery.restart_detector import RestartDetector
from dory.logging.logger import setup_logging
from dory.metrics.collector import MetricsCollector
from dory.utils.errors import DoryStartupError, DoryStateError

logger = logging.getLogger(__name__)


class DoryApp:
    """
    Main entry point for Dory processor applications.

    Usage:
        from dory import DoryApp, BaseProcessor

        class MyProcessor(BaseProcessor):
            ...

        if __name__ == '__main__':
            DoryApp().run(MyProcessor)
    """

    def __init__(
        self,
        config_file: str | None = None,
        log_level: str | None = None,
    ):
        """
        Initialize DoryApp.

        Args:
            config_file: Optional path to YAML config file
            log_level: Optional log level override
        """
        self._config_file = config_file
        self._log_level_override = log_level

        # Components (initialized in _initialize)
        self._config: DoryConfig | None = None
        self._context: ExecutionContext | None = None
        self._processor: BaseProcessor | None = None
        self._lifecycle: LifecycleManager | None = None
        self._signals: SignalHandler | None = None
        self._health_server: HealthServer | None = None
        self._state_manager: StateManager | None = None
        self._metrics: MetricsCollector | None = None
        self._restart_detector: RestartDetector | None = None
        self._recovery_decision: RecoveryDecisionMaker | None = None

    def run(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Run the processor application.

        This is the main entry point that blocks until shutdown.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        try:
            asyncio.run(self._run_async(processor_class))
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt received")
            sys.exit(0)
        except Exception as e:
            logger.error(f"Fatal error: {e}")
            sys.exit(1)

    async def _run_async(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Async implementation of the run loop.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        exit_code = 0

        try:
            # Phase 1: Initialize SDK components
            await self._initialize(processor_class)

            # Phase 2: Start health server
            await self._start_health_server()

            # Phase 3: Run processor lifecycle
            await self._run_processor_lifecycle()

        except DoryStartupError as e:
            logger.error(f"Startup failed: {e}")
            exit_code = 1
            raise

        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            exit_code = 1
            raise

        finally:
            # Phase 4: Cleanup
            await self._cleanup()
            logger.info(f"DoryApp exiting with code {exit_code}")

    async def _initialize(self, processor_class: Type[BaseProcessor]) -> None:
        """Initialize all SDK components."""
        logger.debug("Initializing DoryApp components")

        # Load configuration
        config_loader = ConfigLoader(config_file=self._config_file)
        self._config = config_loader.load()

        # Apply log level override if provided
        if self._log_level_override:
            self._config.log_level = self._log_level_override

        # Setup logging
        setup_logging(
            level=self._config.log_level,
            format=self._config.log_format,
        )

        logger.info("Dory SDK initializing", extra={
            "version": "1.0.0",
            "config": self._config.model_dump(),
        })

        # Create execution context from environment
        self._context = ExecutionContext.from_environment()

        # Initialize components
        self._lifecycle = LifecycleManager()
        self._signals = SignalHandler()
        self._state_manager = StateManager(
            backend=self._config.state_backend,
            config=self._config,
        )
        self._metrics = MetricsCollector()
        self._restart_detector = RestartDetector()
        self._recovery_decision = RecoveryDecisionMaker()

        # Detect restart count
        restart_info = await self._restart_detector.detect()
        self._context.set_attempt_number(restart_info.restart_count)

        logger.info(
            f"Execution context: pod={self._context.pod_name}, "
            f"processor_id={self._context.processor_id}, "
            f"attempt={self._context.attempt_number}, "
            f"is_migrating={self._context.is_migrating}"
        )

        # Create processor instance
        self._processor = processor_class(self._context)

        # Setup signal handlers
        self._signals.setup(
            shutdown_callback=self._trigger_shutdown,
            snapshot_callback=self._trigger_snapshot,
        )

        # Record startup metric
        self._metrics.record_startup_started()

    async def _start_health_server(self) -> None:
        """Start the health/metrics HTTP server."""
        self._health_server = HealthServer(
            port=self._config.health_port,
            metrics_collector=self._metrics,
            state_getter=self._get_processor_state,
            state_restorer=self._restore_processor_state,
            prestop_handler=self._handle_prestop,
        )
        await self._health_server.start()
        logger.info(f"Health server started on port {self._config.health_port}")

    def _get_processor_state(self) -> dict:
        """Get processor state for /state GET endpoint (state capture)."""
        if self._processor is None:
            logger.warning("Processor not initialized, returning empty state")
            return {}

        try:
            import os
            import time
            state = self._processor.get_state()

            # Wrap state in ApplicationState format expected by Orchestrator
            return {
                "pod_name": self._context.pod_name if self._context else "unknown",
                "app_name": os.environ.get("APP_NAME", "dory-processor"),
                "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "state_version": "1.0",
                "data": state,
                "metrics": {},
                "connections": [],
                "active_sessions": 0,
                "session_data": {},
                "uptime_seconds": self._metrics.get_uptime_seconds() if self._metrics else 0.0,
                "request_count": self._metrics.get_request_count() if self._metrics else 0,
                "last_health_time": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            }
        except Exception as e:
            logger.error(f"Failed to get processor state: {e}")
            return {"error": str(e)}

    async def _restore_processor_state(self, state: dict) -> None:
        """Restore processor state from /state POST endpoint (state transfer)."""
        if self._processor is None:
            raise RuntimeError("Processor not initialized, cannot restore state")

        # Extract processor data from ApplicationState format
        processor_data = state.get("data", state)

        logger.info("Restoring state from transfer", extra={
            "pod_name": state.get("pod_name", "unknown"),
            "state_version": state.get("state_version", "unknown"),
        })

        await self._processor.restore_state(processor_data)

    async def _handle_prestop(self) -> None:
        """Handle PreStop hook - prepare for graceful shutdown."""
        logger.info("PreStop hook: initiating graceful shutdown preparation")

        # Signal context that shutdown is coming
        if self._context:
            self._context.request_shutdown()

        # Mark health server as not ready to stop receiving traffic
        if self._health_server:
            self._health_server.mark_not_ready()

        # Save state before pod terminates - this is critical because
        # SIGTERM may arrive after the app has already started exiting
        if self._processor and self._state_manager and self._context:
            try:
                state = self._processor.get_state()
                await self._state_manager.save_state(
                    processor_id=self._context.processor_id,
                    state=state,
                )
                logger.info("State snapshot saved during PreStop")
            except Exception as e:
                logger.error(f"Failed to save state during PreStop: {e}")

    async def _run_processor_lifecycle(self) -> None:
        """Run the complete processor lifecycle."""
        # Check if saved state exists before deciding recovery strategy
        # This is important for detecting pod replacement (new pod, existing state)
        state_exists = False
        existing_state = None
        try:
            existing_state = await self._state_manager.load_state(
                processor_id=self._context.processor_id,
            )
            state_exists = existing_state is not None
            if state_exists:
                logger.info("Existing state found in checkpoint")
        except Exception as e:
            logger.debug(f"No existing state found: {e}")

        # Determine recovery strategy
        strategy = self._recovery_decision.decide(
            restart_count=self._context.attempt_number,
            is_migrating=self._context.is_migrating,
            state_exists=state_exists,
        )

        logger.info(f"Recovery strategy: {strategy.name}")

        # Load state if needed (may already have it from check above)
        state = None
        if strategy.should_restore_state:
            try:
                if state_exists and existing_state:
                    state = existing_state
                else:
                    state = await self._state_manager.load_state(
                        processor_id=self._context.processor_id,
                    )
                if state:
                    logger.info("State loaded from checkpoint")
            except Exception as e:
                logger.warning(f"Failed to load state: {e}")
                state = None

        # Run startup
        await self._lifecycle.run_startup(
            processor=self._processor,
            timeout=self._config.startup_timeout_sec,
        )

        # Restore state if available
        if state:
            try:
                await self._processor.restore_state(state)
                logger.info("State restored successfully")
            except Exception as e:
                logger.error(f"State restore failed: {e}")
                should_continue = await self._processor.on_state_restore_failed(e)
                if not should_continue:
                    raise DoryStateError("State restore failed and recovery declined", cause=e)

        # Mark as ready
        self._health_server.mark_ready()
        self._metrics.record_startup_completed()
        logger.info("Processor ready")

        # Run main loop
        try:
            await self._lifecycle.run_main_loop(
                processor=self._processor,
                context=self._context,
            )
        except asyncio.CancelledError:
            logger.info("Main loop cancelled")

    async def _trigger_shutdown(self) -> None:
        """Trigger graceful shutdown sequence."""
        logger.info("Shutdown triggered")

        # Signal context
        self._context.request_shutdown()

        # Mark health server as not ready
        if self._health_server:
            self._health_server.mark_not_ready()

        # Wait briefly for run() to exit
        await asyncio.sleep(0.5)

        # Run shutdown
        await self._lifecycle.run_shutdown(
            processor=self._processor,
            timeout=self._config.shutdown_timeout_sec,
        )

        # Snapshot state
        try:
            state = self._processor.get_state()
            await self._state_manager.save_state(
                processor_id=self._context.processor_id,
                state=state,
            )
            logger.info("State snapshot saved")
        except Exception as e:
            logger.error(f"Failed to save state: {e}")

        self._metrics.record_shutdown_completed()

    async def _trigger_snapshot(self) -> None:
        """Trigger state snapshot (SIGUSR1 handler)."""
        logger.info("State snapshot triggered")
        try:
            state = self._processor.get_state()
            await self._state_manager.save_state(
                processor_id=self._context.processor_id,
                state=state,
            )
            logger.info("State snapshot saved (debug)")
        except Exception as e:
            logger.error(f"Failed to save state snapshot: {e}")

    async def _cleanup(self) -> None:
        """Cleanup all components."""
        logger.debug("Cleaning up DoryApp components")

        # Remove signal handlers
        if self._signals:
            self._signals.remove_handlers()

        # Stop health server
        if self._health_server:
            await self._health_server.stop()

        # Flush metrics
        if self._metrics:
            self._metrics.flush()

        # Flush logs
        logging.shutdown()
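Editor's note: a minimal sketch of the processor contract DoryApp drives, using only the calls visible in app.py above (a constructor receiving the ExecutionContext, get_state(), restore_state(), and on_state_restore_failed()). The startup, main-loop, and shutdown hooks are dispatched through LifecycleManager and BaseProcessor, whose method names are not shown in this diff, and CounterProcessor itself is purely illustrative:

    from dory import DoryApp, BaseProcessor

    class CounterProcessor(BaseProcessor):
        """Toy processor that keeps a single counter across restarts and migrations."""

        count = 0  # toy state; a real processor would build its state during startup

        def get_state(self) -> dict:
            # Captured by the /state GET endpoint and by PreStop/shutdown snapshots;
            # DoryApp wraps this dict as the "data" field of the ApplicationState payload.
            return {"count": self.count}

        async def restore_state(self, state: dict) -> None:
            # Receives the unwrapped "data" payload on transfer or checkpoint restore.
            self.count = state.get("count", 0)

        async def on_state_restore_failed(self, error: Exception) -> bool:
            # Returning True tells DoryApp to continue with fresh state instead of aborting.
            return True

    if __name__ == "__main__":
        DoryApp(log_level="INFO").run(CounterProcessor)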