dory-sdk 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,242 @@
1
+ """
2
+ Recovery decision maker.
3
+
4
+ Determines the appropriate recovery strategy based on
5
+ restart count, failure type, and migration status.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from enum import Enum
11
+
12
+ from dory.types import RecoveryStrategy, FaultType
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class DecisionReason(Enum):
    """Why a given recovery strategy was chosen.

    Every member's value is a lowercase snake_case string identifier.
    """

    # Start-up and hand-over situations.
    FIRST_START = "first_start"
    MIGRATION = "migration"

    # Restart classifications.
    NORMAL_RESTART = "normal_restart"
    RAPID_RESTART = "rapid_restart"
    THRESHOLD_EXCEEDED = "threshold_exceeded"

    # Failure conditions.
    STATE_CORRUPTION = "state_corruption"
    CRASH_LOOP = "crash_loop"
26
+
27
+
28
@dataclass
class RecoveryDecision:
    """Outcome of the recovery decision process."""

    # Strategy selected for this recovery.
    strategy: RecoveryStrategy
    # Why that strategy was selected.
    reason: DecisionReason
    # Whether previously saved state should be loaded.
    should_restore_state: bool
    # Whether caches should be cleared as part of the recovery.
    should_clear_caches: bool
    # Delay, in seconds, to wait before recovering (0 means no delay).
    backoff_seconds: int = 0
    # Human-readable explanation of the decision.
    message: str = ""

    @property
    def name(self) -> str:
        """Return the selected strategy's value, for use in log output."""
        chosen = self.strategy
        return chosen.value
42
+
43
+
44
class RecoveryDecisionMaker:
    """
    Decides recovery strategy based on context.

    Strategies:
    1. RESTORE_STATE - Normal recovery, restore from checkpoint
    2. GOLDEN_IMAGE - Full reset, start fresh
    3. GOLDEN_WITH_BACKOFF - Reset with delay to prevent rapid cycling
    """

    # Backoff (seconds) applied when the restart count first reaches the threshold.
    _BASE_BACKOFF_SEC = 10
    # Above the threshold, alert whenever the restart count is a multiple of this.
    _ALERT_INTERVAL = 3

    def __init__(
        self,
        golden_image_threshold: int = 3,
        rapid_restart_window_sec: int = 60,
        max_backoff_sec: int = 300,
    ):
        """
        Initialize decision maker.

        Args:
            golden_image_threshold: Restart count triggering golden image
            rapid_restart_window_sec: Window for detecting rapid restarts
            max_backoff_sec: Maximum backoff delay
        """
        self._golden_threshold = golden_image_threshold
        # Stored only; no method of this class reads it.
        self._rapid_window = rapid_restart_window_sec
        self._max_backoff = max_backoff_sec

    def decide(
        self,
        restart_count: int,
        is_migrating: bool = False,
        fault_type: "FaultType | None" = None,
        state_valid: bool = True,
        state_exists: bool = False,
    ) -> "RecoveryDecision":
        """
        Decide recovery strategy.

        The checks run in a fixed order and the first match wins:
        1. ``restart_count == 0`` - restore if ``state_exists`` (treated as a
           pod replacement), otherwise a plain first start.
        2. ``is_migrating`` - restore state.
        3. Invalid state or a STATE_CORRUPTION fault - golden image reset.
        4. ``restart_count`` at or above the threshold - golden image reset
           with backoff.
        5. Anything else - normal restart, restore state.

        Args:
            restart_count: Current restart count
            is_migrating: Whether this is a migration restart
            fault_type: Type of fault that caused restart
            state_valid: Whether existing state is valid
            state_exists: Whether saved state exists (e.g., in ConfigMap)

        Returns:
            RecoveryDecision with strategy and details
        """
        # First start - but if state exists, it means this is a pod replacement
        # (orchestrator created a new pod after deleting the old one).
        # NOTE(review): state_valid is not consulted on this path or on the
        # migration path below - confirm that restoring without that check
        # is intended.
        if restart_count == 0:
            if state_exists:
                # State exists from previous pod - treat as migration/replacement
                return RecoveryDecision(
                    strategy=RecoveryStrategy.RESTORE_STATE,
                    reason=DecisionReason.MIGRATION,
                    should_restore_state=True,
                    should_clear_caches=False,
                    message="Pod replacement detected (state exists), restoring state",
                )
            # Truly first start with no prior state
            return RecoveryDecision(
                strategy=RecoveryStrategy.RESTORE_STATE,
                reason=DecisionReason.FIRST_START,
                should_restore_state=False,
                should_clear_caches=False,
                message="First start, no state to restore",
            )

        # Migration - always restore state
        if is_migrating:
            return RecoveryDecision(
                strategy=RecoveryStrategy.RESTORE_STATE,
                reason=DecisionReason.MIGRATION,
                should_restore_state=True,
                should_clear_caches=False,
                message="Migration restart, restoring state",
            )

        # State corruption - golden image reset
        if not state_valid or fault_type == FaultType.STATE_CORRUPTION:
            return RecoveryDecision(
                strategy=RecoveryStrategy.GOLDEN_IMAGE,
                reason=DecisionReason.STATE_CORRUPTION,
                should_restore_state=False,
                should_clear_caches=True,
                message="State corruption detected, performing golden image reset",
            )

        # Threshold exceeded - golden image with backoff
        if restart_count >= self._golden_threshold:
            backoff = self._calculate_backoff(restart_count)
            return RecoveryDecision(
                strategy=RecoveryStrategy.GOLDEN_WITH_BACKOFF,
                reason=DecisionReason.THRESHOLD_EXCEEDED,
                should_restore_state=False,
                should_clear_caches=True,
                backoff_seconds=backoff,
                message=f"Restart threshold exceeded ({restart_count} >= {self._golden_threshold}), "
                f"golden image reset with {backoff}s backoff",
            )

        # Normal restart - try to restore state
        return RecoveryDecision(
            strategy=RecoveryStrategy.RESTORE_STATE,
            reason=DecisionReason.NORMAL_RESTART,
            should_restore_state=True,
            should_clear_caches=True,
            message=f"Normal restart (attempt {restart_count + 1}), restoring state",
        )

    def _calculate_backoff(self, restart_count: int) -> int:
        """
        Calculate backoff delay based on restart count.

        Plain exponential backoff (no jitter): the delay starts at the base
        value when ``restart_count`` equals the golden image threshold and
        doubles for every restart beyond it, capped at the maximum backoff.
        Counts below the threshold get the base delay.

        Args:
            restart_count: Current restart count

        Returns:
            Backoff delay in whole seconds
        """
        # Never go below zero doublings, so the result is never under the base.
        doublings = max(0, restart_count - self._golden_threshold)

        # Exponential: 10, 20, 40, 80, 160, ... capped at max.
        # Double step by step and stop once the cap is reached instead of
        # evaluating 2 ** doublings, which is needlessly huge for large counts.
        backoff = self._BASE_BACKOFF_SEC
        for _ in range(doublings):
            if backoff >= self._max_backoff:
                break
            backoff *= 2

        return int(min(backoff, self._max_backoff))

    def should_trigger_alert(self, restart_count: int) -> bool:
        """
        Check if restart count should trigger alerting.

        Alerts fire when the count equals the golden image threshold, and
        afterwards whenever a count above the threshold is a multiple of
        the alert interval (3).

        Args:
            restart_count: Current restart count

        Returns:
            True if alert should be triggered
        """
        if restart_count == self._golden_threshold:
            return True
        return (
            restart_count > self._golden_threshold
            and restart_count % self._ALERT_INTERVAL == 0
        )
186
+
187
+
188
class RecoveryExecutor:
    """
    Carries out a RecoveryDecision.

    Delegates the concrete work to a state manager (for loading state)
    and a golden image manager (for resets).
    """

    def __init__(self, state_manager, golden_image_manager):
        """
        Initialize recovery executor.

        Args:
            state_manager: Object providing an awaitable ``load_state(processor_id)``
            golden_image_manager: Object providing an awaitable ``reset(processor_id)``
        """
        self._golden_manager = golden_image_manager
        self._state_manager = state_manager

    async def execute(
        self,
        decision: RecoveryDecision,
        processor_id: str,
    ) -> dict | None:
        """
        Execute recovery decision.

        Steps, in order: sleep for the decision's backoff (if any), perform
        a golden image reset for the golden strategies, otherwise load the
        saved state when the decision asks for it.

        Args:
            decision: Recovery decision to execute
            processor_id: Processor ID

        Returns:
            Whatever the state manager's ``load_state`` returns when state is
            restored; None after a golden image reset or when no restore
            was requested
        """
        logger.info(f"Executing recovery: {decision.strategy.value} - {decision.message}")

        # Wait out the backoff before doing anything else.
        delay = decision.backoff_seconds
        if delay > 0:
            logger.info(f"Applying backoff: {delay}s")
            import asyncio
            await asyncio.sleep(delay)

        # Both golden strategies reset the processor and return no state.
        golden_strategies = (
            RecoveryStrategy.GOLDEN_IMAGE,
            RecoveryStrategy.GOLDEN_WITH_BACKOFF,
        )
        if decision.strategy in golden_strategies:
            await self._golden_manager.reset(processor_id)
            return None

        # Nothing to load unless the decision explicitly asks for a restore.
        if not decision.should_restore_state:
            return None

        restored = await self._state_manager.load_state(processor_id)
        return restored
@@ -0,0 +1,142 @@
1
+ """
2
+ Restart detection for pod lifecycle tracking.
3
+
4
+ Detects restarts by checking restart count from:
5
+ 1. Kubernetes downward API (restart count annotation)
6
+ 2. Local file marker
7
+ 3. Environment variables
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ @dataclass
19
+ class RestartInfo:
20
+ """Information about restart status."""
21
+ restart_count: int
22
+ is_restart: bool
23
+ previous_exit_code: int | None = None
24
+ restart_reason: str | None = None
25
+
26
+ @property
27
+ def is_first_start(self) -> bool:
28
+ """Check if this is the first start (not a restart)."""
29
+ return not self.is_restart
30
+
31
+
32
+ class RestartDetector:
33
+ """
34
+ Detects pod restarts and tracks restart count.
35
+
36
+ Uses multiple methods to detect restarts:
37
+ 1. RESTART_COUNT environment variable (set by init container)
38
+ 2. Local marker file with count
39
+ 3. Kubernetes pod annotation via downward API
40
+ """
41
+
42
+ MARKER_FILE_PATH = "/tmp/dory-restart-marker"
43
+ RESTART_COUNT_ENV = "RESTART_COUNT"
44
+ PREVIOUS_EXIT_CODE_ENV = "PREVIOUS_EXIT_CODE"
45
+ RESTART_REASON_ENV = "RESTART_REASON"
46
+
47
+ def __init__(self, marker_path: str | None = None):
48
+ """
49
+ Initialize restart detector.
50
+
51
+ Args:
52
+ marker_path: Optional custom path for marker file
53
+ """
54
+ self._marker_path = Path(marker_path or self.MARKER_FILE_PATH)
55
+
56
+ async def detect(self) -> RestartInfo:
57
+ """
58
+ Detect restart status.
59
+
60
+ Returns:
61
+ RestartInfo with restart count and status
62
+ """
63
+ # Try environment variable first (most reliable in K8s)
64
+ env_count = self._detect_from_env()
65
+ if env_count is not None:
66
+ logger.debug(f"Restart count from env: {env_count}")
67
+ return RestartInfo(
68
+ restart_count=env_count,
69
+ is_restart=env_count > 0,
70
+ previous_exit_code=self._get_previous_exit_code(),
71
+ restart_reason=self._get_restart_reason(),
72
+ )
73
+
74
+ # Fall back to marker file
75
+ marker_count = self._detect_from_marker()
76
+ logger.debug(f"Restart count from marker: {marker_count}")
77
+
78
+ # Increment and save marker for next restart
79
+ self._save_marker(marker_count + 1)
80
+
81
+ return RestartInfo(
82
+ restart_count=marker_count,
83
+ is_restart=marker_count > 0,
84
+ previous_exit_code=self._get_previous_exit_code(),
85
+ restart_reason=self._get_restart_reason(),
86
+ )
87
+
88
+ def _detect_from_env(self) -> int | None:
89
+ """Detect restart count from environment variable."""
90
+ count_str = os.environ.get(self.RESTART_COUNT_ENV)
91
+ if count_str is None:
92
+ return None
93
+
94
+ try:
95
+ return int(count_str)
96
+ except ValueError:
97
+ logger.warning(f"Invalid RESTART_COUNT value: {count_str}")
98
+ return None
99
+
100
+ def _detect_from_marker(self) -> int:
101
+ """Detect restart count from marker file."""
102
+ if not self._marker_path.exists():
103
+ return 0
104
+
105
+ try:
106
+ content = self._marker_path.read_text().strip()
107
+ return int(content)
108
+ except (ValueError, IOError) as e:
109
+ logger.warning(f"Failed to read marker file: {e}")
110
+ return 0
111
+
112
+ def _save_marker(self, count: int) -> None:
113
+ """Save restart count to marker file."""
114
+ try:
115
+ self._marker_path.parent.mkdir(parents=True, exist_ok=True)
116
+ self._marker_path.write_text(str(count))
117
+ except IOError as e:
118
+ logger.warning(f"Failed to save marker file: {e}")
119
+
120
+ def _get_previous_exit_code(self) -> int | None:
121
+ """Get previous exit code from environment."""
122
+ code_str = os.environ.get(self.PREVIOUS_EXIT_CODE_ENV)
123
+ if code_str is None:
124
+ return None
125
+
126
+ try:
127
+ return int(code_str)
128
+ except ValueError:
129
+ return None
130
+
131
+ def _get_restart_reason(self) -> str | None:
132
+ """Get restart reason from environment."""
133
+ return os.environ.get(self.RESTART_REASON_ENV)
134
+
135
+ def reset(self) -> None:
136
+ """Reset restart counter (for testing or golden image reset)."""
137
+ if self._marker_path.exists():
138
+ try:
139
+ self._marker_path.unlink()
140
+ logger.info("Restart marker reset")
141
+ except IOError as e:
142
+ logger.warning(f"Failed to reset marker: {e}")
@@ -0,0 +1,187 @@
1
+ """
2
+ State validation for integrity checking.
3
+
4
+ Validates restored state against schema and checksums.
5
+ """
6
+
7
+ import logging
8
+ from typing import Any
9
+
10
+ from dory.utils.errors import DoryValidationError
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class StateValidator:
16
+ """
17
+ Validates processor state for integrity and schema compliance.
18
+
19
+ Performs:
20
+ 1. Schema validation (required fields, types)
21
+ 2. Integrity checks (checksums)
22
+ 3. Version compatibility checks
23
+ """
24
+
25
+ def __init__(self, schema: dict[str, type] | None = None):
26
+ """
27
+ Initialize validator.
28
+
29
+ Args:
30
+ schema: Optional schema mapping field names to expected types
31
+ """
32
+ self._schema = schema
33
+
34
+ def validate(self, state: dict[str, Any]) -> bool:
35
+ """
36
+ Validate state dictionary.
37
+
38
+ Args:
39
+ state: State dictionary to validate
40
+
41
+ Returns:
42
+ True if valid
43
+
44
+ Raises:
45
+ DoryValidationError: If validation fails
46
+ """
47
+ if not isinstance(state, dict):
48
+ raise DoryValidationError(f"State must be a dict, got {type(state)}")
49
+
50
+ # Validate against schema if provided
51
+ if self._schema:
52
+ self._validate_schema(state)
53
+
54
+ # Run integrity checks
55
+ self._validate_integrity(state)
56
+
57
+ logger.debug("State validation passed")
58
+ return True
59
+
60
+ def _validate_schema(self, state: dict[str, Any]) -> None:
61
+ """Validate state against schema."""
62
+ for field_name, expected_type in self._schema.items():
63
+ if field_name not in state:
64
+ raise DoryValidationError(
65
+ f"Required field '{field_name}' missing from state"
66
+ )
67
+
68
+ value = state[field_name]
69
+
70
+ # Allow None for any type
71
+ if value is None:
72
+ continue
73
+
74
+ if not isinstance(value, expected_type):
75
+ raise DoryValidationError(
76
+ f"Field '{field_name}' has wrong type: "
77
+ f"expected {expected_type.__name__}, got {type(value).__name__}"
78
+ )
79
+
80
+ def _validate_integrity(self, state: dict[str, Any]) -> None:
81
+ """
82
+ Run integrity checks on state.
83
+
84
+ Can be extended for custom integrity validation.
85
+ """
86
+ # Check for common corruption indicators
87
+ if "__corrupted__" in state:
88
+ raise DoryValidationError("State marked as corrupted")
89
+
90
+ # Check metadata if present
91
+ if "_metadata" in state:
92
+ metadata = state["_metadata"]
93
+ if not isinstance(metadata, dict):
94
+ raise DoryValidationError("State metadata must be a dict")
95
+
96
+ def validate_partial(self, state: dict[str, Any], required_fields: list[str]) -> bool:
97
+ """
98
+ Validate that specific fields exist and have correct types.
99
+
100
+ Args:
101
+ state: State dictionary
102
+ required_fields: List of field names that must exist
103
+
104
+ Returns:
105
+ True if valid
106
+
107
+ Raises:
108
+ DoryValidationError: If validation fails
109
+ """
110
+ for field_name in required_fields:
111
+ if field_name not in state:
112
+ raise DoryValidationError(
113
+ f"Required field '{field_name}' missing from state"
114
+ )
115
+
116
+ if self._schema and field_name in self._schema:
117
+ expected_type = self._schema[field_name]
118
+ value = state[field_name]
119
+
120
+ if value is not None and not isinstance(value, expected_type):
121
+ raise DoryValidationError(
122
+ f"Field '{field_name}' has wrong type"
123
+ )
124
+
125
+ return True
126
+
127
+
128
class StateVersionChecker:
    """
    Checks state version compatibility.

    Ensures restored state is compatible with current processor version.
    """

    VERSION_FIELD = "_version"

    def __init__(self, current_version: str):
        """
        Initialize version checker.

        Args:
            current_version: Current processor state version
        """
        self._current_version = current_version

    def check_compatible(self, state: dict[str, Any]) -> bool:
        """
        Check if state version is compatible.

        Args:
            state: State dictionary

        Returns:
            True if compatible

        Raises:
            DoryValidationError: If incompatible
        """
        state_version = state.get(self.VERSION_FIELD)

        if state_version is None:
            # No version = assume compatible (v0)
            logger.warning("State has no version field, assuming compatible")
            return True

        if not self._is_compatible(state_version, self._current_version):
            raise DoryValidationError(
                f"State version {state_version} not compatible "
                f"with processor version {self._current_version}"
            )

        return True

    @staticmethod
    def _major(version) -> int:
        """
        Extract the major component (text before the first '.') as an int.

        The value is converted with ``str`` first so that a version stored
        as a number (e.g. ``2`` or ``2.1``) is handled like its string form.

        Raises:
            ValueError: If the major component is not an integer
        """
        return int(str(version).strip().split(".")[0])

    def _is_compatible(self, state_version: str, processor_version: str) -> bool:
        """
        Check version compatibility.

        Default: major version must match.
        Override for custom compatibility logic.

        Returns:
            True when both major versions parse and are equal; False when
            they differ or either version has an invalid format
        """
        try:
            return self._major(state_version) == self._major(processor_version)
        except ValueError:
            # Invalid version format
            return False
@@ -0,0 +1,45 @@
1
+ """
2
+ Resilience patterns for fault-tolerant processing.
3
+
4
+ This module provides production-ready resilience patterns:
5
+ - Retry with exponential backoff
6
+ - Circuit breaker pattern
7
+ - Rate limiting
8
+ - Bulkhead isolation
9
+
10
+ Example usage:
11
+ from dory.resilience import retry_with_backoff, CircuitBreaker
12
+
13
+ @retry_with_backoff(max_attempts=3)
14
+ async def call_api():
15
+ return await api.get()
16
+
17
+ breaker = CircuitBreaker(name="database")
18
+ result = await breaker.call(db.query)
19
+ """
20
+
21
+ from .retry import (
22
+ retry_with_backoff,
23
+ RetryPolicy,
24
+ RetryBudget,
25
+ RetryExhaustedError,
26
+ )
27
+ from .circuit_breaker import (
28
+ CircuitBreaker,
29
+ CircuitState,
30
+ CircuitOpenError,
31
+ CircuitBreakerConfig,
32
+ )
33
+
34
+ __all__ = [
35
+ # Retry
36
+ "retry_with_backoff",
37
+ "RetryPolicy",
38
+ "RetryBudget",
39
+ "RetryExhaustedError",
40
+ # Circuit Breaker
41
+ "CircuitBreaker",
42
+ "CircuitState",
43
+ "CircuitOpenError",
44
+ "CircuitBreakerConfig",
45
+ ]