dory_sdk-2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +70 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +290 -0
- dory/cli/templates.py +333 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +50 -0
- dory/config/loader.py +361 -0
- dory/config/presets.py +325 -0
- dory/config/schema.py +152 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +404 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +654 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/errors/__init__.py +117 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +495 -0
- dory/health/__init__.py +10 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +306 -0
- dory/k8s/__init__.py +11 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +175 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +36 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +321 -0
- dory/middleware/request_tracker.py +501 -0
- dory/migration/__init__.py +11 -0
- dory/migration/configmap.py +260 -0
- dory/migration/serialization.py +167 -0
- dory/migration/state_manager.py +301 -0
- dory/monitoring/__init__.py +23 -0
- dory/monitoring/opentelemetry.py +462 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +480 -0
- dory/recovery/golden_snapshot.py +561 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +479 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +187 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +454 -0
- dory/resilience/retry.py +389 -0
- dory/sidecar/__init__.py +6 -0
- dory/sidecar/main.py +75 -0
- dory/sidecar/server.py +329 -0
- dory/simple.py +342 -0
- dory/types.py +75 -0
- dory/utils/__init__.py +25 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_sdk-2.1.0.dist-info/METADATA +663 -0
- dory_sdk-2.1.0.dist-info/RECORD +69 -0
- dory_sdk-2.1.0.dist-info/WHEEL +5 -0
- dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
- dory_sdk-2.1.0.dist-info/top_level.txt +1 -0

dory/recovery/recovery_decision.py
@@ -0,0 +1,242 @@
"""
Recovery decision maker.

Determines the appropriate recovery strategy based on
restart count, failure type, and migration status.
"""

import logging
from dataclasses import dataclass
from enum import Enum

from dory.types import RecoveryStrategy, FaultType

logger = logging.getLogger(__name__)


class DecisionReason(Enum):
    """Reasons for recovery decisions."""
    FIRST_START = "first_start"
    MIGRATION = "migration"
    NORMAL_RESTART = "normal_restart"
    RAPID_RESTART = "rapid_restart"
    THRESHOLD_EXCEEDED = "threshold_exceeded"
    STATE_CORRUPTION = "state_corruption"
    CRASH_LOOP = "crash_loop"


@dataclass
class RecoveryDecision:
    """Result of recovery decision making."""
    strategy: RecoveryStrategy
    reason: DecisionReason
    should_restore_state: bool
    should_clear_caches: bool
    backoff_seconds: int = 0
    message: str = ""

    @property
    def name(self) -> str:
        """Get strategy name for logging."""
        return self.strategy.value


class RecoveryDecisionMaker:
    """
    Decides recovery strategy based on context.

    Strategies:
    1. RESTORE_STATE - Normal recovery, restore from checkpoint
    2. GOLDEN_IMAGE - Full reset, start fresh
    3. GOLDEN_WITH_BACKOFF - Reset with delay to prevent rapid cycling
    """

    def __init__(
        self,
        golden_image_threshold: int = 3,
        rapid_restart_window_sec: int = 60,
        max_backoff_sec: int = 300,
    ):
        """
        Initialize decision maker.

        Args:
            golden_image_threshold: Restart count triggering golden image
            rapid_restart_window_sec: Window for detecting rapid restarts
            max_backoff_sec: Maximum backoff delay
        """
        self._golden_threshold = golden_image_threshold
        self._rapid_window = rapid_restart_window_sec
        self._max_backoff = max_backoff_sec

    def decide(
        self,
        restart_count: int,
        is_migrating: bool = False,
        fault_type: FaultType | None = None,
        state_valid: bool = True,
        state_exists: bool = False,
    ) -> RecoveryDecision:
        """
        Decide recovery strategy.

        Args:
            restart_count: Current restart count
            is_migrating: Whether this is a migration restart
            fault_type: Type of fault that caused restart
            state_valid: Whether existing state is valid
            state_exists: Whether saved state exists (e.g., in ConfigMap)

        Returns:
            RecoveryDecision with strategy and details
        """
        # First start - but if state exists, it means this is a pod replacement
        # (orchestrator created a new pod after deleting the old one)
        if restart_count == 0:
            if state_exists:
                # State exists from previous pod - treat as migration/replacement
                return RecoveryDecision(
                    strategy=RecoveryStrategy.RESTORE_STATE,
                    reason=DecisionReason.MIGRATION,
                    should_restore_state=True,
                    should_clear_caches=False,
                    message="Pod replacement detected (state exists), restoring state",
                )
            # Truly first start with no prior state
            return RecoveryDecision(
                strategy=RecoveryStrategy.RESTORE_STATE,
                reason=DecisionReason.FIRST_START,
                should_restore_state=False,
                should_clear_caches=False,
                message="First start, no state to restore",
            )

        # Migration - always restore state
        if is_migrating:
            return RecoveryDecision(
                strategy=RecoveryStrategy.RESTORE_STATE,
                reason=DecisionReason.MIGRATION,
                should_restore_state=True,
                should_clear_caches=False,
                message="Migration restart, restoring state",
            )

        # State corruption - golden image reset
        if not state_valid or fault_type == FaultType.STATE_CORRUPTION:
            return RecoveryDecision(
                strategy=RecoveryStrategy.GOLDEN_IMAGE,
                reason=DecisionReason.STATE_CORRUPTION,
                should_restore_state=False,
                should_clear_caches=True,
                message="State corruption detected, performing golden image reset",
            )

        # Threshold exceeded - golden image with backoff
        if restart_count >= self._golden_threshold:
            backoff = self._calculate_backoff(restart_count)
            return RecoveryDecision(
                strategy=RecoveryStrategy.GOLDEN_WITH_BACKOFF,
                reason=DecisionReason.THRESHOLD_EXCEEDED,
                should_restore_state=False,
                should_clear_caches=True,
                backoff_seconds=backoff,
                message=f"Restart threshold exceeded ({restart_count} >= {self._golden_threshold}), "
                        f"golden image reset with {backoff}s backoff",
            )

        # Normal restart - try to restore state
        return RecoveryDecision(
            strategy=RecoveryStrategy.RESTORE_STATE,
            reason=DecisionReason.NORMAL_RESTART,
            should_restore_state=True,
            should_clear_caches=True,
            message=f"Normal restart (attempt {restart_count + 1}), restoring state",
        )

    def _calculate_backoff(self, restart_count: int) -> int:
        """
        Calculate backoff delay based on restart count.

        Uses exponential backoff with jitter.
        """
        base_backoff = 10  # seconds
        # Exponential: 10, 20, 40, 80, 160, ... capped at max
        backoff = min(
            base_backoff * (2 ** (restart_count - self._golden_threshold)),
            self._max_backoff,
        )
        return int(backoff)

    def should_trigger_alert(self, restart_count: int) -> bool:
        """
        Check if restart count should trigger alerting.

        Args:
            restart_count: Current restart count

        Returns:
            True if alert should be triggered
        """
        # Alert on first golden image reset and every N restarts after
        if restart_count == self._golden_threshold:
            return True
        if restart_count > self._golden_threshold and restart_count % 3 == 0:
            return True
        return False


class RecoveryExecutor:
    """
    Executes recovery decisions.

    Coordinates the actual recovery steps based on decision.
    """

    def __init__(self, state_manager, golden_image_manager):
        """
        Initialize recovery executor.

        Args:
            state_manager: State manager for state operations
            golden_image_manager: Golden image manager for resets
        """
        self._state_manager = state_manager
        self._golden_manager = golden_image_manager

    async def execute(
        self,
        decision: RecoveryDecision,
        processor_id: str,
    ) -> dict | None:
        """
        Execute recovery decision.

        Args:
            decision: Recovery decision to execute
            processor_id: Processor ID

        Returns:
            Restored state dict, or None if golden image reset
        """
        logger.info(f"Executing recovery: {decision.strategy.value} - {decision.message}")

        # Apply backoff if needed
        if decision.backoff_seconds > 0:
            logger.info(f"Applying backoff: {decision.backoff_seconds}s")
            import asyncio
            await asyncio.sleep(decision.backoff_seconds)

        # Golden image reset
        if decision.strategy in (
            RecoveryStrategy.GOLDEN_IMAGE,
            RecoveryStrategy.GOLDEN_WITH_BACKOFF,
        ):
            await self._golden_manager.reset(processor_id)
            return None

        # Restore state
        if decision.should_restore_state:
            state = await self._state_manager.load_state(processor_id)
            return state

        return None
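
A minimal usage sketch (not part of the wheel) of how the decision maker and executor above might be wired together. The two stub managers are hypothetical stand-ins for the SDK's real state and golden-image managers; only their load_state() and reset() methods are assumed, mirroring the calls RecoveryExecutor makes.

import asyncio

from dory.recovery.recovery_decision import RecoveryDecisionMaker, RecoveryExecutor


class StubStateManager:
    async def load_state(self, processor_id: str) -> dict:
        # Pretend a checkpoint was found for this processor.
        return {"processor_id": processor_id, "offset": 42}


class StubGoldenImageManager:
    async def reset(self, processor_id: str) -> None:
        print(f"golden image reset for {processor_id}")


async def recover(processor_id: str, restart_count: int, state_exists: bool) -> dict | None:
    maker = RecoveryDecisionMaker(golden_image_threshold=3, max_backoff_sec=300)
    decision = maker.decide(restart_count=restart_count, state_exists=state_exists)
    executor = RecoveryExecutor(StubStateManager(), StubGoldenImageManager())
    return await executor.execute(decision, processor_id)


# restart_count=1 with saved state -> NORMAL_RESTART, the checkpoint is restored.
print(asyncio.run(recover("orders-processor", restart_count=1, state_exists=True)))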

dory/recovery/restart_detector.py
@@ -0,0 +1,142 @@
"""
Restart detection for pod lifecycle tracking.

Detects restarts by checking restart count from:
1. Kubernetes downward API (restart count annotation)
2. Local file marker
3. Environment variables
"""

import logging
import os
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class RestartInfo:
    """Information about restart status."""
    restart_count: int
    is_restart: bool
    previous_exit_code: int | None = None
    restart_reason: str | None = None

    @property
    def is_first_start(self) -> bool:
        """Check if this is the first start (not a restart)."""
        return not self.is_restart


class RestartDetector:
    """
    Detects pod restarts and tracks restart count.

    Uses multiple methods to detect restarts:
    1. RESTART_COUNT environment variable (set by init container)
    2. Local marker file with count
    3. Kubernetes pod annotation via downward API
    """

    MARKER_FILE_PATH = "/tmp/dory-restart-marker"
    RESTART_COUNT_ENV = "RESTART_COUNT"
    PREVIOUS_EXIT_CODE_ENV = "PREVIOUS_EXIT_CODE"
    RESTART_REASON_ENV = "RESTART_REASON"

    def __init__(self, marker_path: str | None = None):
        """
        Initialize restart detector.

        Args:
            marker_path: Optional custom path for marker file
        """
        self._marker_path = Path(marker_path or self.MARKER_FILE_PATH)

    async def detect(self) -> RestartInfo:
        """
        Detect restart status.

        Returns:
            RestartInfo with restart count and status
        """
        # Try environment variable first (most reliable in K8s)
        env_count = self._detect_from_env()
        if env_count is not None:
            logger.debug(f"Restart count from env: {env_count}")
            return RestartInfo(
                restart_count=env_count,
                is_restart=env_count > 0,
                previous_exit_code=self._get_previous_exit_code(),
                restart_reason=self._get_restart_reason(),
            )

        # Fall back to marker file
        marker_count = self._detect_from_marker()
        logger.debug(f"Restart count from marker: {marker_count}")

        # Increment and save marker for next restart
        self._save_marker(marker_count + 1)

        return RestartInfo(
            restart_count=marker_count,
            is_restart=marker_count > 0,
            previous_exit_code=self._get_previous_exit_code(),
            restart_reason=self._get_restart_reason(),
        )

    def _detect_from_env(self) -> int | None:
        """Detect restart count from environment variable."""
        count_str = os.environ.get(self.RESTART_COUNT_ENV)
        if count_str is None:
            return None

        try:
            return int(count_str)
        except ValueError:
            logger.warning(f"Invalid RESTART_COUNT value: {count_str}")
            return None

    def _detect_from_marker(self) -> int:
        """Detect restart count from marker file."""
        if not self._marker_path.exists():
            return 0

        try:
            content = self._marker_path.read_text().strip()
            return int(content)
        except (ValueError, IOError) as e:
            logger.warning(f"Failed to read marker file: {e}")
            return 0

    def _save_marker(self, count: int) -> None:
        """Save restart count to marker file."""
        try:
            self._marker_path.parent.mkdir(parents=True, exist_ok=True)
            self._marker_path.write_text(str(count))
        except IOError as e:
            logger.warning(f"Failed to save marker file: {e}")

    def _get_previous_exit_code(self) -> int | None:
        """Get previous exit code from environment."""
        code_str = os.environ.get(self.PREVIOUS_EXIT_CODE_ENV)
        if code_str is None:
            return None

        try:
            return int(code_str)
        except ValueError:
            return None

    def _get_restart_reason(self) -> str | None:
        """Get restart reason from environment."""
        return os.environ.get(self.RESTART_REASON_ENV)

    def reset(self) -> None:
        """Reset restart counter (for testing or golden image reset)."""
        if self._marker_path.exists():
            try:
                self._marker_path.unlink()
                logger.info("Restart marker reset")
            except IOError as e:
                logger.warning(f"Failed to reset marker: {e}")
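
A short, hypothetical sketch of calling RestartDetector at process startup; the custom marker path and the printed fields follow the constructor and dataclass defined above.

import asyncio

from dory.recovery.restart_detector import RestartDetector


async def main() -> None:
    # RESTART_COUNT in the environment wins if set; otherwise the marker file
    # is read and incremented for the next run, as detect() documents above.
    detector = RestartDetector(marker_path="/tmp/my-app-restart-marker")
    info = await detector.detect()
    if info.is_first_start:
        print("first start, nothing to recover")
    else:
        print(f"restart #{info.restart_count}, reason={info.restart_reason}")


asyncio.run(main())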

dory/recovery/state_validator.py
@@ -0,0 +1,187 @@
"""
State validation for integrity checking.

Validates restored state against schema and checksums.
"""

import logging
from typing import Any

from dory.utils.errors import DoryValidationError

logger = logging.getLogger(__name__)


class StateValidator:
    """
    Validates processor state for integrity and schema compliance.

    Performs:
    1. Schema validation (required fields, types)
    2. Integrity checks (checksums)
    3. Version compatibility checks
    """

    def __init__(self, schema: dict[str, type] | None = None):
        """
        Initialize validator.

        Args:
            schema: Optional schema mapping field names to expected types
        """
        self._schema = schema

    def validate(self, state: dict[str, Any]) -> bool:
        """
        Validate state dictionary.

        Args:
            state: State dictionary to validate

        Returns:
            True if valid

        Raises:
            DoryValidationError: If validation fails
        """
        if not isinstance(state, dict):
            raise DoryValidationError(f"State must be a dict, got {type(state)}")

        # Validate against schema if provided
        if self._schema:
            self._validate_schema(state)

        # Run integrity checks
        self._validate_integrity(state)

        logger.debug("State validation passed")
        return True

    def _validate_schema(self, state: dict[str, Any]) -> None:
        """Validate state against schema."""
        for field_name, expected_type in self._schema.items():
            if field_name not in state:
                raise DoryValidationError(
                    f"Required field '{field_name}' missing from state"
                )

            value = state[field_name]

            # Allow None for any type
            if value is None:
                continue

            if not isinstance(value, expected_type):
                raise DoryValidationError(
                    f"Field '{field_name}' has wrong type: "
                    f"expected {expected_type.__name__}, got {type(value).__name__}"
                )

    def _validate_integrity(self, state: dict[str, Any]) -> None:
        """
        Run integrity checks on state.

        Can be extended for custom integrity validation.
        """
        # Check for common corruption indicators
        if "__corrupted__" in state:
            raise DoryValidationError("State marked as corrupted")

        # Check metadata if present
        if "_metadata" in state:
            metadata = state["_metadata"]
            if not isinstance(metadata, dict):
                raise DoryValidationError("State metadata must be a dict")

    def validate_partial(self, state: dict[str, Any], required_fields: list[str]) -> bool:
        """
        Validate that specific fields exist and have correct types.

        Args:
            state: State dictionary
            required_fields: List of field names that must exist

        Returns:
            True if valid

        Raises:
            DoryValidationError: If validation fails
        """
        for field_name in required_fields:
            if field_name not in state:
                raise DoryValidationError(
                    f"Required field '{field_name}' missing from state"
                )

            if self._schema and field_name in self._schema:
                expected_type = self._schema[field_name]
                value = state[field_name]

                if value is not None and not isinstance(value, expected_type):
                    raise DoryValidationError(
                        f"Field '{field_name}' has wrong type"
                    )

        return True


class StateVersionChecker:
    """
    Checks state version compatibility.

    Ensures restored state is compatible with current processor version.
    """

    VERSION_FIELD = "_version"

    def __init__(self, current_version: str):
        """
        Initialize version checker.

        Args:
            current_version: Current processor state version
        """
        self._current_version = current_version

    def check_compatible(self, state: dict[str, Any]) -> bool:
        """
        Check if state version is compatible.

        Args:
            state: State dictionary

        Returns:
            True if compatible

        Raises:
            DoryValidationError: If incompatible
        """
        state_version = state.get(self.VERSION_FIELD)

        if state_version is None:
            # No version = assume compatible (v0)
            logger.warning("State has no version field, assuming compatible")
            return True

        if not self._is_compatible(state_version, self._current_version):
            raise DoryValidationError(
                f"State version {state_version} not compatible "
                f"with processor version {self._current_version}"
            )

        return True

    def _is_compatible(self, state_version: str, processor_version: str) -> bool:
        """
        Check version compatibility.

        Default: major version must match.
        Override for custom compatibility logic.
        """
        try:
            state_major = int(state_version.split(".")[0])
            processor_major = int(processor_version.split(".")[0])
            return state_major == processor_major
        except (ValueError, IndexError):
            # Invalid version format
            return False
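
A hedged sketch of checking a restored snapshot with both validators before handing it back to a processor. The example schema, the candidate payload, and the "2.1.0" state version are illustrative only; the exception type comes from dory.utils.errors, as imported at the top of this file.

from dory.recovery.state_validator import StateValidator, StateVersionChecker
from dory.utils.errors import DoryValidationError

validator = StateValidator(schema={"offset": int, "buffer": list})
version_checker = StateVersionChecker(current_version="2.1.0")

candidate = {"_version": "2.0.3", "offset": 1337, "buffer": []}

try:
    version_checker.check_compatible(candidate)  # major versions must match
    validator.validate(candidate)                # schema + integrity checks
    print("state accepted")
except DoryValidationError as exc:
    # A failure here is what the recovery decision maker treats as corruption.
    print(f"state rejected: {exc}")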

dory/resilience/__init__.py
@@ -0,0 +1,45 @@
"""
Resilience patterns for fault-tolerant processing.

This module provides production-ready resilience patterns:
- Retry with exponential backoff
- Circuit breaker pattern
- Rate limiting
- Bulkhead isolation

Example usage:
    from dory.resilience import retry_with_backoff, CircuitBreaker

    @retry_with_backoff(max_attempts=3)
    async def call_api():
        return await api.get()

    breaker = CircuitBreaker(name="database")
    result = await breaker.call(db.query)
"""

from .retry import (
    retry_with_backoff,
    RetryPolicy,
    RetryBudget,
    RetryExhaustedError,
)
from .circuit_breaker import (
    CircuitBreaker,
    CircuitState,
    CircuitOpenError,
    CircuitBreakerConfig,
)

__all__ = [
    # Retry
    "retry_with_backoff",
    "RetryPolicy",
    "RetryBudget",
    "RetryExhaustedError",
    # Circuit Breaker
    "CircuitBreaker",
    "CircuitState",
    "CircuitOpenError",
    "CircuitBreakerConfig",
]
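
A slightly fuller sketch than the docstring example, combining the retry decorator with a circuit breaker. The retry_with_backoff(max_attempts=...), CircuitBreaker(name=...), and breaker.call(...) shapes are taken from the module docstring above; query_balance and the exception handling are hypothetical illustration.

import asyncio

from dory.resilience import (
    CircuitBreaker,
    CircuitOpenError,
    RetryExhaustedError,
    retry_with_backoff,
)


async def query_balance() -> int:
    return 100  # stand-in for a real database call


breaker = CircuitBreaker(name="database")


@retry_with_backoff(max_attempts=3)
async def fetch_balance() -> int:
    # Route the call through the breaker so repeated failures eventually
    # short-circuit instead of hammering the dependency.
    return await breaker.call(query_balance)


async def main() -> None:
    try:
        print(await fetch_balance())
    except (CircuitOpenError, RetryExhaustedError) as exc:
        print(f"gave up: {exc}")


asyncio.run(main())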