dory-sdk 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +70 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +290 -0
- dory/cli/templates.py +333 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +50 -0
- dory/config/loader.py +361 -0
- dory/config/presets.py +325 -0
- dory/config/schema.py +152 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +404 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +654 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/errors/__init__.py +117 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +495 -0
- dory/health/__init__.py +10 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +306 -0
- dory/k8s/__init__.py +11 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +175 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +36 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +321 -0
- dory/middleware/request_tracker.py +501 -0
- dory/migration/__init__.py +11 -0
- dory/migration/configmap.py +260 -0
- dory/migration/serialization.py +167 -0
- dory/migration/state_manager.py +301 -0
- dory/monitoring/__init__.py +23 -0
- dory/monitoring/opentelemetry.py +462 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +480 -0
- dory/recovery/golden_snapshot.py +561 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +479 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +187 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +454 -0
- dory/resilience/retry.py +389 -0
- dory/sidecar/__init__.py +6 -0
- dory/sidecar/main.py +75 -0
- dory/sidecar/server.py +329 -0
- dory/simple.py +342 -0
- dory/types.py +75 -0
- dory/utils/__init__.py +25 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_sdk-2.1.0.dist-info/METADATA +663 -0
- dory_sdk-2.1.0.dist-info/RECORD +69 -0
- dory_sdk-2.1.0.dist-info/WHEEL +5 -0
- dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
- dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/errors/codes.py
ADDED
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Structured Error Code System for Dory SDK
|
|
3
|
+
|
|
4
|
+
This module defines a comprehensive error code system for the Dory SDK,
|
|
5
|
+
providing structured, searchable error codes for debugging and monitoring.
|
|
6
|
+
|
|
7
|
+
Error Code Format: E-<DOMAIN>-<NUMBER>
|
|
8
|
+
- DOMAIN: 3-letter code identifying the module/domain
|
|
9
|
+
- NUMBER: 3-digit unique identifier
|
|
10
|
+
|
|
11
|
+
Example: E-RET-001 (Retry domain, error #1)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from typing import Dict, Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ErrorDomain(str, Enum):
    """Three-letter tags that identify which part of the SDK raised an error.

    Each member mixes in ``str``, so a member compares equal to its tag
    (``ErrorDomain.RETRY == "RET"``).
    """

    # --- Core SDK ---------------------------------------------------------
    CORE = "COR"  # core SDK functionality
    STATE = "STA"  # state management
    MIGRATION = "MIG"  # migration operations

    # --- Resilience -------------------------------------------------------
    RETRY = "RET"  # retry logic
    CIRCUIT_BREAKER = "CBR"  # circuit breaker
    ERROR_CLASS = "ECL"  # error classification

    # --- Recovery ---------------------------------------------------------
    GOLDEN_IMAGE = "GLD"  # golden image / snapshots
    RECOVERY = "REC"  # recovery operations
    VALIDATION = "VAL"  # validation

    # --- Processing -------------------------------------------------------
    PROCESSOR = "PRC"  # processor operations
    MODE = "MOD"  # processing modes
    QUEUE = "QUE"  # queue operations

    # --- Monitoring -------------------------------------------------------
    METRICS = "MET"  # metrics collection
    HEALTH = "HLT"  # health checks
    TELEMETRY = "TEL"  # OpenTelemetry

    # --- Infrastructure ---------------------------------------------------
    KUBERNETES = "K8S"  # Kubernetes operations
    STORAGE = "STO"  # storage operations
    NETWORK = "NET"  # network operations

    # --- Middleware -------------------------------------------------------
    REQUEST = "REQ"  # request tracking
    CONNECTION = "CON"  # connection management
    SESSION = "SES"  # session management
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(frozen=True)
class ErrorCode:
    """Immutable definition of one structured error code.

    The formatted identifier (see ``code``) has the shape
    ``E-<DOMAIN>-<NUMBER>``, for example ``E-RET-001``.
    """

    domain: ErrorDomain  # its ``value`` becomes the middle segment of the code
    number: int  # zero-padded to at least three digits in the code
    message: str  # short summary, shown by ``__str__``
    description: str  # longer explanation of the failure
    remediation: str  # suggested corrective action
    severity: str = "ERROR"  # plain string label; defaults to "ERROR"

    @property
    def code(self) -> str:
        """Return the formatted error code (e.g., ``E-RET-001``)."""
        return f"E-{self.domain.value}-{self.number:03d}"

    def __str__(self) -> str:
        """Return ``[<code>] <message>``."""
        return f"[{self.code}] {self.message}"

    def format_full(self) -> str:
        """Return a five-line report: code, severity, message, description, remediation.

        The lines are joined explicitly rather than taken from a triple-quoted
        literal, so the indentation of this source file can never leak into
        the returned text.
        """
        report_lines = (
            f"Error Code: {self.code}",
            f"Severity: {self.severity}",
            f"Message: {self.message}",
            f"Description: {self.description}",
            f"Remediation: {self.remediation}",
        )
        # The original stripped the assembled text; keep that so leading or
        # trailing whitespace coming from field values is still trimmed.
        return "\n".join(report_lines).strip()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ============================================================================
# RETRY ERRORS (E-RET-xxx)
# ============================================================================

# E-RET-001: the retry budget has run out.
E_RET_001 = ErrorCode(
    domain=ErrorDomain.RETRY,
    number=1,
    severity="ERROR",
    message="Retry budget exhausted",
    description="The retry budget has been depleted. No more retries are allowed.",
    remediation="Wait for budget to replenish or increase max_retry_budget.",
)

# E-RET-002: the operation still failed after the maximum number of attempts.
E_RET_002 = ErrorCode(
    domain=ErrorDomain.RETRY,
    number=2,
    severity="ERROR",
    message="Max retry attempts exceeded",
    description="Operation failed after maximum retry attempts.",
    remediation="Check operation logic and increase max_attempts if appropriate.",
)

# E-RET-003: cumulative backoff time went past its limit.
E_RET_003 = ErrorCode(
    domain=ErrorDomain.RETRY,
    number=3,
    severity="ERROR",
    message="Backoff timeout exceeded",
    description="Total backoff time exceeded max_backoff_time.",
    remediation="Increase max_backoff_time or reduce initial_delay/max_delay.",
)
|
|
119
|
+
|
|
120
|
+
# ============================================================================
# CIRCUIT BREAKER ERRORS (E-CBR-xxx)
# ============================================================================

# E-CBR-001: requests are rejected because the breaker is open.
E_CBR_001 = ErrorCode(
    domain=ErrorDomain.CIRCUIT_BREAKER,
    number=1,
    severity="WARNING",
    message="Circuit breaker is OPEN",
    description="Circuit breaker is open due to high failure rate. Requests are being rejected.",
    remediation="Wait for circuit breaker to enter HALF_OPEN state or manually reset.",
)

# E-CBR-002: a breaker state transition did not complete.
E_CBR_002 = ErrorCode(
    domain=ErrorDomain.CIRCUIT_BREAKER,
    number=2,
    severity="ERROR",
    message="Circuit breaker transition failed",
    description="Failed to transition circuit breaker state.",
    remediation="Check circuit breaker configuration and state consistency.",
)

# E-CBR-003: failures went past the breaker threshold.
E_CBR_003 = ErrorCode(
    domain=ErrorDomain.CIRCUIT_BREAKER,
    number=3,
    severity="WARNING",
    message="Failure threshold exceeded",
    description="Operation failures exceeded the circuit breaker threshold.",
    remediation="Investigate underlying failures and adjust failure_threshold if needed.",
)
|
|
150
|
+
|
|
151
|
+
# ============================================================================
# ERROR CLASSIFICATION ERRORS (E-ECL-xxx)
# ============================================================================

# E-ECL-001: the error type could not be determined.
E_ECL_001 = ErrorCode(
    domain=ErrorDomain.ERROR_CLASS,
    number=1,
    severity="WARNING",
    message="Unable to classify error",
    description="Error classification failed - error type could not be determined.",
    remediation="Add error pattern to classifier or handle as UNKNOWN type.",
)

# E-ECL-002: a classification was produced, but with a low confidence score.
E_ECL_002 = ErrorCode(
    domain=ErrorDomain.ERROR_CLASS,
    number=2,
    severity="INFO",
    message="Error classification confidence low",
    description="Error was classified but with low confidence score.",
    remediation="Review error patterns and improve classification rules.",
)
|
|
172
|
+
|
|
173
|
+
# ============================================================================
# GOLDEN IMAGE ERRORS (E-GLD-xxx)
# ============================================================================

# E-GLD-001: capturing a golden snapshot failed.
E_GLD_001 = ErrorCode(
    domain=ErrorDomain.GOLDEN_IMAGE,
    number=1,
    severity="ERROR",
    message="Golden snapshot capture failed",
    description="Failed to capture golden state snapshot.",
    remediation="Check storage permissions and available space.",
)

# E-GLD-002: restoring from a golden snapshot failed.
E_GLD_002 = ErrorCode(
    domain=ErrorDomain.GOLDEN_IMAGE,
    number=2,
    severity="ERROR",
    message="Golden snapshot restore failed",
    description="Failed to restore state from golden snapshot.",
    remediation="Verify snapshot integrity and compatibility with current version.",
)

# E-GLD-003: snapshot checksum verification failed.
E_GLD_003 = ErrorCode(
    domain=ErrorDomain.GOLDEN_IMAGE,
    number=3,
    severity="CRITICAL",
    message="Snapshot checksum mismatch",
    description="Snapshot checksum verification failed - data may be corrupted.",
    remediation="Recapture snapshot or restore from backup.",
)

# E-GLD-004: compressing snapshot data failed.
E_GLD_004 = ErrorCode(
    domain=ErrorDomain.GOLDEN_IMAGE,
    number=4,
    severity="ERROR",
    message="Snapshot compression failed",
    description="Failed to compress snapshot data.",
    remediation="Check available memory and disk space.",
)

# E-GLD-005: every graduated reset level failed.
E_GLD_005 = ErrorCode(
    domain=ErrorDomain.GOLDEN_IMAGE,
    number=5,
    severity="CRITICAL",
    message="Graduated reset failed",
    description="All graduated reset levels failed to restore state.",
    remediation="Manual intervention required - check logs for specific failures.",
)
|
|
221
|
+
|
|
222
|
+
# ============================================================================
# VALIDATION ERRORS (E-VAL-xxx)
# ============================================================================

# E-VAL-001: state validation reported critical issues.
E_VAL_001 = ErrorCode(
    domain=ErrorDomain.VALIDATION,
    number=1,
    severity="ERROR",
    message="State validation failed",
    description="State validation found critical issues.",
    remediation="Review validation errors and fix state data.",
)

# E-VAL-002: state does not match the expected schema.
E_VAL_002 = ErrorCode(
    domain=ErrorDomain.VALIDATION,
    number=2,
    severity="ERROR",
    message="Schema validation failed",
    description="State does not match expected schema.",
    remediation="Update state to match schema or update schema definition.",
)

# E-VAL-003: required dependencies are missing or invalid.
E_VAL_003 = ErrorCode(
    domain=ErrorDomain.VALIDATION,
    number=3,
    severity="ERROR",
    message="Dependency validation failed",
    description="Required dependencies are missing or invalid.",
    remediation="Ensure all required dependencies are present and valid.",
)
|
|
252
|
+
|
|
253
|
+
# ============================================================================
# PROCESSING MODE ERRORS (E-MOD-xxx)
# ============================================================================

# E-MOD-001: switching to the target processing mode failed.
E_MOD_001 = ErrorCode(
    domain=ErrorDomain.MODE,
    number=1,
    severity="ERROR",
    message="Mode transition failed",
    description="Failed to transition to target processing mode.",
    remediation="Check mode transition preconditions and system state.",
)

# E-MOD-002: the operation is unavailable in the current mode.
E_MOD_002 = ErrorCode(
    domain=ErrorDomain.MODE,
    number=2,
    severity="WARNING",
    message="Invalid mode for operation",
    description="Operation not available in current processing mode.",
    remediation="Wait for mode transition or use degraded operation variant.",
)

# E-MOD-003: automatic mode recovery did not succeed.
E_MOD_003 = ErrorCode(
    domain=ErrorDomain.MODE,
    number=3,
    severity="ERROR",
    message="Mode auto-recovery failed",
    description="Automatic mode recovery did not succeed.",
    remediation="Manual intervention required to restore normal mode.",
)
|
|
283
|
+
|
|
284
|
+
# ============================================================================
# REQUEST TRACKING ERRORS (E-REQ-xxx)
# ============================================================================

# E-REQ-001: request tracking could not be initialized.
E_REQ_001 = ErrorCode(
    domain=ErrorDomain.REQUEST,
    number=1,
    severity="WARNING",
    message="Request tracking initialization failed",
    description="Failed to initialize request tracking.",
    remediation="Check RequestTracker configuration and retry.",
)

# E-REQ-002: a request ran past its configured timeout.
E_REQ_002 = ErrorCode(
    domain=ErrorDomain.REQUEST,
    number=2,
    severity="WARNING",
    message="Request timeout exceeded",
    description="Request exceeded configured timeout duration.",
    remediation="Increase timeout or optimize request processing.",
)
|
|
305
|
+
|
|
306
|
+
# ============================================================================
# CONNECTION ERRORS (E-CON-xxx)
# ============================================================================

# E-CON-001: a connection failed its health check.
E_CON_001 = ErrorCode(
    domain=ErrorDomain.CONNECTION,
    number=1,
    severity="WARNING",
    message="Connection health check failed",
    description="Connection failed health check.",
    remediation="Verify connection is alive and responsive.",
)

# E-CON-002: a connection was closed after sitting idle.
E_CON_002 = ErrorCode(
    domain=ErrorDomain.CONNECTION,
    number=2,
    severity="INFO",
    message="Connection idle timeout",
    description="Connection closed due to idle timeout.",
    remediation="Increase idle_timeout or ensure connection is actively used.",
)

# E-CON-003: a connection could not be registered with the tracker.
E_CON_003 = ErrorCode(
    domain=ErrorDomain.CONNECTION,
    number=3,
    severity="ERROR",
    message="Connection registration failed",
    description="Failed to register connection with tracker.",
    remediation="Check connection is valid and tracker is initialized.",
)
|
|
336
|
+
|
|
337
|
+
# ============================================================================
# STATE MANAGEMENT ERRORS (E-STA-xxx)
# ============================================================================

# E-STA-001: state data could not be serialized.
E_STA_001 = ErrorCode(
    domain=ErrorDomain.STATE,
    number=1,
    severity="ERROR",
    message="State serialization failed",
    description="Failed to serialize state data.",
    remediation="Ensure state contains only serializable types.",
)

# E-STA-002: state data could not be deserialized.
E_STA_002 = ErrorCode(
    domain=ErrorDomain.STATE,
    number=2,
    severity="ERROR",
    message="State deserialization failed",
    description="Failed to deserialize state data.",
    remediation="Verify state format and version compatibility.",
)

# E-STA-003: state data appears corrupted.
E_STA_003 = ErrorCode(
    domain=ErrorDomain.STATE,
    number=3,
    severity="CRITICAL",
    message="State corruption detected",
    description="State data appears to be corrupted.",
    remediation="Restore from golden snapshot or recapture state.",
)
|
|
367
|
+
|
|
368
|
+
# ============================================================================
|
|
369
|
+
# ERROR CODE REGISTRY
|
|
370
|
+
# ============================================================================
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
class ErrorCodeRegistry:
    """Class-level lookup table of error codes, keyed by formatted code string.

    All state lives in the ``_codes`` class attribute and every method is a
    classmethod, so the registry is used without creating an instance.
    """

    # Maps a formatted code string (e.g. "E-RET-001") to its definition.
    _codes: Dict[str, ErrorCode] = {}

    @classmethod
    def register(cls, error_code: ErrorCode) -> None:
        """Store ``error_code`` under its ``code`` string.

        Registering a second, different definition under an already-used code
        still replaces the earlier one (as before), but now emits a
        ``RuntimeWarning`` so an accidental number clash does not go
        unnoticed. Re-registering an equal definition is silent.

        Args:
            error_code: The definition to add.
        """
        import warnings  # local import: only this method needs it

        key = error_code.code
        previous = cls._codes.get(key)
        if previous is not None and previous != error_code:
            warnings.warn(
                f"Error code {key} is already registered with a different "
                "definition; the earlier definition is being replaced",
                RuntimeWarning,
                stacklevel=2,
            )
        cls._codes[key] = error_code

    @classmethod
    def get(cls, code: str) -> Optional[ErrorCode]:
        """Return the definition registered under ``code``, or ``None``."""
        return cls._codes.get(code)

    @classmethod
    def search(cls, query: str) -> list[ErrorCode]:
        """Return definitions whose message or description contains ``query``.

        Matching is a case-insensitive substring test.
        """
        needle = query.lower()
        return [
            entry
            for entry in cls._codes.values()
            if needle in entry.message.lower() or needle in entry.description.lower()
        ]

    @classmethod
    def list_by_domain(cls, domain: ErrorDomain) -> list[ErrorCode]:
        """Return every registered definition whose ``domain`` equals ``domain``."""
        return [entry for entry in cls._codes.values() if entry.domain == domain]

    @classmethod
    def all(cls) -> list[ErrorCode]:
        """Return all registered definitions, sorted by code string."""
        return sorted(cls._codes.values(), key=lambda entry: entry.code)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
# Every error code defined in this module, listed by domain in definition order.
_error_codes = [
    # Retry
    E_RET_001, E_RET_002, E_RET_003,
    # Circuit Breaker
    E_CBR_001, E_CBR_002, E_CBR_003,
    # Error Classification
    E_ECL_001, E_ECL_002,
    # Golden Image
    E_GLD_001, E_GLD_002, E_GLD_003, E_GLD_004, E_GLD_005,
    # Validation
    E_VAL_001, E_VAL_002, E_VAL_003,
    # Processing Mode
    E_MOD_001, E_MOD_002, E_MOD_003,
    # Request Tracking
    E_REQ_001, E_REQ_002,
    # Connection
    E_CON_001, E_CON_002, E_CON_003,
    # State Management
    E_STA_001, E_STA_002, E_STA_003,
]

# Fill the registry when the module is imported.
for _code in _error_codes:
    ErrorCodeRegistry.register(_code)
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
# ============================================================================
|
|
455
|
+
# ERROR CODE EXCEPTIONS
|
|
456
|
+
# ============================================================================
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
class DoryError(Exception):
    """Base exception that carries a structured error code.

    The exception text is ``str(error_code)``, followed by optional
    ``Details:`` and ``Cause:`` lines.
    """

    def __init__(
        self,
        error_code: ErrorCode,
        details: Optional[str] = None,
        cause: Optional[Exception] = None,
    ):
        """
        Initialize error with code.

        Args:
            error_code: The error code
            details: Additional context-specific details
            cause: Original exception that caused this error
        """
        self.error_code = error_code
        self.details = details
        self.cause = cause

        message = str(error_code)
        if details:
            message += f"\nDetails: {details}"
        if cause:
            message += f"\nCause: {cause}"

        super().__init__(message)

        # Link the original exception so tracebacks show it as the direct
        # cause even when the raise site does not use ``raise ... from``.
        # Only real exceptions can be assigned to ``__cause__``; anything else
        # stays available through ``self.cause`` alone.
        if isinstance(cause, BaseException):
            self.__cause__ = cause

    def format_full(self) -> str:
        """Return the error code's full report plus any details and cause.

        Starts from ``error_code.format_full()`` and appends an
        ``Additional Details:`` section and a ``Caused by:`` section when the
        corresponding attribute is truthy.
        """
        msg = self.error_code.format_full()
        if self.details:
            msg += f"\n\nAdditional Details:\n{self.details}"
        if self.cause:
            msg += f"\n\nCaused by:\n{self.cause}"
        return msg
|
dory/health/__init__.py
ADDED
dory/health/probes.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health probe implementations.
|
|
3
|
+
|
|
4
|
+
Provides liveness and readiness probes for Kubernetes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import inspect
|
|
9
|
+
import logging
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Callable, Awaitable, Union
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ProbeResult:
    """Outcome of a single health probe evaluation."""

    healthy: bool  # True when the probe passed
    message: str = ""  # human-readable status text
    details: dict = field(default_factory=dict)  # extra data; a fresh dict per instance

    def to_dict(self) -> dict:
        """Return the three fields as a plain dictionary for a JSON response.

        The ``details`` value is the same dict object held by the instance,
        not a copy.
        """
        return dict(
            healthy=self.healthy,
            message=self.message,
            details=self.details,
        )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class HealthProbe(ABC):
    """Common interface implemented by every health probe."""

    @abstractmethod
    async def check(self) -> ProbeResult:
        """
        Run the probe once.

        Returns:
            ProbeResult indicating health status
        """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class LivenessProbe(HealthProbe):
    """
    Liveness probe for Kubernetes.

    Indicates whether the process is alive and should not be killed.
    Failed liveness = Kubernetes restarts the pod.

    Should be lightweight and always pass unless process is deadlocked.
    """

    def __init__(self):
        """Initialize liveness probe with no custom checks."""
        self._custom_checks: list[Callable[[], Union[bool, Awaitable[bool]]]] = []

    def add_check(self, check: Callable[[], Union[bool, Awaitable[bool]]]) -> None:
        """
        Add custom liveness check.

        Args:
            check: Zero-argument callable returning True if healthy, or an
                awaitable that resolves to that value
        """
        self._custom_checks.append(check)

    async def check(self) -> ProbeResult:
        """
        Perform liveness check.

        Runs the custom checks in the order they were added and stops at the
        first one that returns a falsy value or raises. With no custom checks
        the result is always healthy.

        Returns:
            ProbeResult indicating health status
        """
        for i, custom_check in enumerate(self._custom_checks):
            try:
                result = custom_check()
                # Inspect the returned value rather than the callable: a
                # lambda or callable object can hand back a coroutine without
                # being a coroutine function, and an un-awaited coroutine is
                # truthy, which would make the check pass unconditionally.
                if inspect.isawaitable(result):
                    result = await result

                if not result:
                    return ProbeResult(
                        healthy=False,
                        message=f"Custom liveness check {i} failed",
                    )
            except Exception as e:
                # A raising check counts as a failed probe, not a crash.
                logger.error("Liveness check %s error: %s", i, e)
                return ProbeResult(
                    healthy=False,
                    message=f"Custom liveness check {i} error: {e}",
                )

        return ProbeResult(healthy=True, message="Process is alive")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class ReadinessProbe(HealthProbe):
    """
    Readiness probe for Kubernetes.

    Indicates whether the process is ready to receive traffic.
    Failed readiness = Kubernetes removes pod from service endpoints.

    Should check that all dependencies are available.
    """

    def __init__(self):
        """Initialize readiness probe as not ready, with no custom checks."""
        self._ready = False
        self._custom_checks: list[Callable[[], Union[bool, Awaitable[bool]]]] = []

    def mark_ready(self) -> None:
        """Mark the processor as ready to receive traffic."""
        self._ready = True
        logger.info("Processor marked as ready")

    def mark_not_ready(self) -> None:
        """Mark the processor as not ready."""
        self._ready = False
        logger.info("Processor marked as not ready")

    def is_ready(self) -> bool:
        """Return the ready flag; custom checks are not consulted."""
        return self._ready

    def add_check(self, check: Callable[[], Union[bool, Awaitable[bool]]]) -> None:
        """
        Add custom readiness check.

        Args:
            check: Zero-argument callable returning True if ready, or an
                awaitable that resolves to that value
        """
        self._custom_checks.append(check)

    async def check(self) -> ProbeResult:
        """
        Perform readiness check.

        Reports not ready until ``mark_ready`` has been called. After that,
        runs the custom checks in the order they were added and stops at the
        first one that returns a falsy value or raises.

        Returns:
            ProbeResult indicating readiness
        """
        if not self._ready:
            return ProbeResult(
                healthy=False,
                message="Processor not yet ready",
            )

        for i, custom_check in enumerate(self._custom_checks):
            try:
                result = custom_check()
                # Inspect the returned value rather than the callable: a
                # lambda or callable object can hand back a coroutine without
                # being a coroutine function, and an un-awaited coroutine is
                # truthy, which would make the check pass unconditionally.
                if inspect.isawaitable(result):
                    result = await result

                if not result:
                    return ProbeResult(
                        healthy=False,
                        message=f"Custom readiness check {i} failed",
                    )
            except Exception as e:
                # A raising check counts as a failed probe, not a crash.
                logger.error("Readiness check %s error: %s", i, e)
                return ProbeResult(
                    healthy=False,
                    message=f"Custom readiness check {i} error: {e}",
                )

        return ProbeResult(healthy=True, message="Processor is ready")
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class StartupProbe(HealthProbe):
    """
    Startup probe for Kubernetes.

    Indicates whether the application has finished starting up.
    Failed startup = Kubernetes keeps waiting (up to failureThreshold).

    Useful for slow-starting applications.
    """

    def __init__(self, startup_complete_check: Callable[[], bool] | None = None):
        """
        Initialize startup probe.

        Args:
            startup_complete_check: Function returning True when startup is
                complete; it may also return an awaitable resolving to that
                value
        """
        self._startup_complete = False
        self._startup_check = startup_complete_check

    def mark_startup_complete(self) -> None:
        """Mark startup as complete."""
        self._startup_complete = True
        logger.info("Startup marked as complete")

    async def check(self) -> ProbeResult:
        """
        Perform startup check.

        Once startup has been observed as complete (via
        ``mark_startup_complete`` or a truthy check result) the answer is
        remembered and the check function is not called again. Exceptions
        raised by the check function are not caught here.

        Returns:
            ProbeResult indicating whether startup has finished
        """
        if self._startup_complete:
            return ProbeResult(healthy=True, message="Startup complete")

        if self._startup_check:
            finished = self._startup_check()
            # An async check hands back a coroutine, which is always truthy;
            # await it so the decision is based on its actual result.
            if inspect.isawaitable(finished):
                finished = await finished
            if finished:
                self._startup_complete = True
                return ProbeResult(healthy=True, message="Startup complete")

        return ProbeResult(healthy=False, message="Still starting up")
|