dory-sdk 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/errors/codes.py ADDED
@@ -0,0 +1,495 @@
1
+ """
2
+ Structured Error Code System for Dory SDK
3
+
4
+ This module defines a comprehensive error code system for the Dory SDK,
5
+ providing structured, searchable error codes for debugging and monitoring.
6
+
7
+ Error Code Format: E-<DOMAIN>-<NUMBER>
8
+ - DOMAIN: 3-letter code identifying the module/domain
9
+ - NUMBER: 3-digit unique identifier
10
+
11
+ Example: E-RET-001 (Retry domain, error #1)
12
+ """
13
+
14
+ from dataclasses import dataclass
15
+ from enum import Enum
16
+ from typing import Dict, Optional
17
+
18
+
19
class ErrorDomain(str, Enum):
    """Three-letter domain tags used as the middle segment of an error code.

    Each member's value is the tag that appears in ``E-<DOMAIN>-<NUMBER>``.
    Because the enum also derives from ``str``, members compare equal to
    their tag string.
    """

    # --- Core SDK ---------------------------------------------------------
    # Core SDK functionality
    CORE = "COR"
    # State management
    STATE = "STA"
    # Migration operations
    MIGRATION = "MIG"

    # --- Resilience -------------------------------------------------------
    # Retry logic
    RETRY = "RET"
    # Circuit breaker
    CIRCUIT_BREAKER = "CBR"
    # Error classification
    ERROR_CLASS = "ECL"

    # --- Recovery ---------------------------------------------------------
    # Golden image/snapshots
    GOLDEN_IMAGE = "GLD"
    # Recovery operations
    RECOVERY = "REC"
    # Validation
    VALIDATION = "VAL"

    # --- Processing -------------------------------------------------------
    # Processor operations
    PROCESSOR = "PRC"
    # Processing modes
    MODE = "MOD"
    # Queue operations
    QUEUE = "QUE"

    # --- Monitoring -------------------------------------------------------
    # Metrics collection
    METRICS = "MET"
    # Health checks
    HEALTH = "HLT"
    # OpenTelemetry
    TELEMETRY = "TEL"

    # --- Infrastructure ---------------------------------------------------
    # Kubernetes operations
    KUBERNETES = "K8S"
    # Storage operations
    STORAGE = "STO"
    # Network operations
    NETWORK = "NET"

    # --- Middleware -------------------------------------------------------
    # Request tracking
    REQUEST = "REQ"
    # Connection management
    CONNECTION = "CON"
    # Session management
    SESSION = "SES"
56
+
57
+
58
@dataclass(frozen=True)
class ErrorCode:
    """Immutable record describing one structured error code.

    The identifier exposed by ``code`` is built from ``domain`` and
    ``number``; the remaining fields are human-readable text.
    """

    domain: ErrorDomain  # domain tag used in the formatted code
    number: int  # numeric part, zero-padded to three digits when formatted
    message: str  # short summary shown by ``str()``
    description: str  # longer explanation of the condition
    remediation: str  # suggested corrective action
    severity: str = "ERROR"  # free-form severity label

    @property
    def code(self) -> str:
        """Return the formatted identifier, e.g. ``E-RET-001``."""
        return "E-{}-{:03d}".format(self.domain.value, self.number)

    def __str__(self) -> str:
        """Return ``[<code>] <message>``."""
        return "[{}] {}".format(self.code, self.message)

    def format_full(self) -> str:
        """Return a multi-line report with one ``Label: value`` row per field."""
        rows = (
            ("Error Code", self.code),
            ("Severity", self.severity),
            ("Message", self.message),
            ("Description", self.description),
            ("Remediation", self.remediation),
        )
        report = "\n".join(f"{label}: {value}" for label, value in rows)
        # Surrounding whitespace is trimmed from the assembled report.
        return report.strip()
87
+
88
+
89
+ # ============================================================================
90
+ # RETRY ERRORS (E-RET-xxx)
91
+ # ============================================================================
92
+
93
# Retry domain: budget, attempt-count and backoff limits.
E_RET_001 = ErrorCode(
    ErrorDomain.RETRY,
    1,
    severity="ERROR",
    message="Retry budget exhausted",
    description="The retry budget has been depleted. No more retries are allowed.",
    remediation="Wait for budget to replenish or increase max_retry_budget.",
)

E_RET_002 = ErrorCode(
    ErrorDomain.RETRY,
    2,
    severity="ERROR",
    message="Max retry attempts exceeded",
    description="Operation failed after maximum retry attempts.",
    remediation="Check operation logic and increase max_attempts if appropriate.",
)

E_RET_003 = ErrorCode(
    ErrorDomain.RETRY,
    3,
    severity="ERROR",
    message="Backoff timeout exceeded",
    description="Total backoff time exceeded max_backoff_time.",
    remediation="Increase max_backoff_time or reduce initial_delay/max_delay.",
)
119
+
120
+ # ============================================================================
121
+ # CIRCUIT BREAKER ERRORS (E-CBR-xxx)
122
+ # ============================================================================
123
+
124
# Circuit breaker domain: open state, transition failures, threshold breaches.
E_CBR_001 = ErrorCode(
    ErrorDomain.CIRCUIT_BREAKER,
    1,
    severity="WARNING",
    message="Circuit breaker is OPEN",
    description="Circuit breaker is open due to high failure rate. Requests are being rejected.",
    remediation="Wait for circuit breaker to enter HALF_OPEN state or manually reset.",
)

E_CBR_002 = ErrorCode(
    ErrorDomain.CIRCUIT_BREAKER,
    2,
    severity="ERROR",
    message="Circuit breaker transition failed",
    description="Failed to transition circuit breaker state.",
    remediation="Check circuit breaker configuration and state consistency.",
)

E_CBR_003 = ErrorCode(
    ErrorDomain.CIRCUIT_BREAKER,
    3,
    severity="WARNING",
    message="Failure threshold exceeded",
    description="Operation failures exceeded the circuit breaker threshold.",
    remediation="Investigate underlying failures and adjust failure_threshold if needed.",
)
150
+
151
+ # ============================================================================
152
+ # ERROR CLASSIFICATION ERRORS (E-ECL-xxx)
153
+ # ============================================================================
154
+
155
# Error classification domain: unclassifiable and low-confidence results.
E_ECL_001 = ErrorCode(
    ErrorDomain.ERROR_CLASS,
    1,
    severity="WARNING",
    message="Unable to classify error",
    description="Error classification failed - error type could not be determined.",
    remediation="Add error pattern to classifier or handle as UNKNOWN type.",
)

E_ECL_002 = ErrorCode(
    ErrorDomain.ERROR_CLASS,
    2,
    severity="INFO",
    message="Error classification confidence low",
    description="Error was classified but with low confidence score.",
    remediation="Review error patterns and improve classification rules.",
)
172
+
173
+ # ============================================================================
174
+ # GOLDEN IMAGE ERRORS (E-GLD-xxx)
175
+ # ============================================================================
176
+
177
# Golden image domain: snapshot capture, restore, integrity and reset failures.
E_GLD_001 = ErrorCode(
    ErrorDomain.GOLDEN_IMAGE,
    1,
    severity="ERROR",
    message="Golden snapshot capture failed",
    description="Failed to capture golden state snapshot.",
    remediation="Check storage permissions and available space.",
)

E_GLD_002 = ErrorCode(
    ErrorDomain.GOLDEN_IMAGE,
    2,
    severity="ERROR",
    message="Golden snapshot restore failed",
    description="Failed to restore state from golden snapshot.",
    remediation="Verify snapshot integrity and compatibility with current version.",
)

E_GLD_003 = ErrorCode(
    ErrorDomain.GOLDEN_IMAGE,
    3,
    severity="CRITICAL",
    message="Snapshot checksum mismatch",
    description="Snapshot checksum verification failed - data may be corrupted.",
    remediation="Recapture snapshot or restore from backup.",
)

E_GLD_004 = ErrorCode(
    ErrorDomain.GOLDEN_IMAGE,
    4,
    severity="ERROR",
    message="Snapshot compression failed",
    description="Failed to compress snapshot data.",
    remediation="Check available memory and disk space.",
)

E_GLD_005 = ErrorCode(
    ErrorDomain.GOLDEN_IMAGE,
    5,
    severity="CRITICAL",
    message="Graduated reset failed",
    description="All graduated reset levels failed to restore state.",
    remediation="Manual intervention required - check logs for specific failures.",
)
221
+
222
+ # ============================================================================
223
+ # VALIDATION ERRORS (E-VAL-xxx)
224
+ # ============================================================================
225
+
226
# Validation domain: state, schema and dependency validation failures.
E_VAL_001 = ErrorCode(
    ErrorDomain.VALIDATION,
    1,
    severity="ERROR",
    message="State validation failed",
    description="State validation found critical issues.",
    remediation="Review validation errors and fix state data.",
)

E_VAL_002 = ErrorCode(
    ErrorDomain.VALIDATION,
    2,
    severity="ERROR",
    message="Schema validation failed",
    description="State does not match expected schema.",
    remediation="Update state to match schema or update schema definition.",
)

E_VAL_003 = ErrorCode(
    ErrorDomain.VALIDATION,
    3,
    severity="ERROR",
    message="Dependency validation failed",
    description="Required dependencies are missing or invalid.",
    remediation="Ensure all required dependencies are present and valid.",
)
252
+
253
+ # ============================================================================
254
+ # PROCESSING MODE ERRORS (E-MOD-xxx)
255
+ # ============================================================================
256
+
257
# Processing mode domain: transition, availability and auto-recovery failures.
E_MOD_001 = ErrorCode(
    ErrorDomain.MODE,
    1,
    severity="ERROR",
    message="Mode transition failed",
    description="Failed to transition to target processing mode.",
    remediation="Check mode transition preconditions and system state.",
)

E_MOD_002 = ErrorCode(
    ErrorDomain.MODE,
    2,
    severity="WARNING",
    message="Invalid mode for operation",
    description="Operation not available in current processing mode.",
    remediation="Wait for mode transition or use degraded operation variant.",
)

E_MOD_003 = ErrorCode(
    ErrorDomain.MODE,
    3,
    severity="ERROR",
    message="Mode auto-recovery failed",
    description="Automatic mode recovery did not succeed.",
    remediation="Manual intervention required to restore normal mode.",
)
283
+
284
+ # ============================================================================
285
+ # REQUEST TRACKING ERRORS (E-REQ-xxx)
286
+ # ============================================================================
287
+
288
# Request tracking domain: tracker initialization and request timeouts.
E_REQ_001 = ErrorCode(
    ErrorDomain.REQUEST,
    1,
    severity="WARNING",
    message="Request tracking initialization failed",
    description="Failed to initialize request tracking.",
    remediation="Check RequestTracker configuration and retry.",
)

E_REQ_002 = ErrorCode(
    ErrorDomain.REQUEST,
    2,
    severity="WARNING",
    message="Request timeout exceeded",
    description="Request exceeded configured timeout duration.",
    remediation="Increase timeout or optimize request processing.",
)
305
+
306
+ # ============================================================================
307
+ # CONNECTION ERRORS (E-CON-xxx)
308
+ # ============================================================================
309
+
310
# Connection domain: health check, idle timeout and registration failures.
E_CON_001 = ErrorCode(
    ErrorDomain.CONNECTION,
    1,
    severity="WARNING",
    message="Connection health check failed",
    description="Connection failed health check.",
    remediation="Verify connection is alive and responsive.",
)

E_CON_002 = ErrorCode(
    ErrorDomain.CONNECTION,
    2,
    severity="INFO",
    message="Connection idle timeout",
    description="Connection closed due to idle timeout.",
    remediation="Increase idle_timeout or ensure connection is actively used.",
)

E_CON_003 = ErrorCode(
    ErrorDomain.CONNECTION,
    3,
    severity="ERROR",
    message="Connection registration failed",
    description="Failed to register connection with tracker.",
    remediation="Check connection is valid and tracker is initialized.",
)
336
+
337
+ # ============================================================================
338
+ # STATE MANAGEMENT ERRORS (E-STA-xxx)
339
+ # ============================================================================
340
+
341
# State management domain: serialization, deserialization and corruption.
E_STA_001 = ErrorCode(
    ErrorDomain.STATE,
    1,
    severity="ERROR",
    message="State serialization failed",
    description="Failed to serialize state data.",
    remediation="Ensure state contains only serializable types.",
)

E_STA_002 = ErrorCode(
    ErrorDomain.STATE,
    2,
    severity="ERROR",
    message="State deserialization failed",
    description="Failed to deserialize state data.",
    remediation="Verify state format and version compatibility.",
)

E_STA_003 = ErrorCode(
    ErrorDomain.STATE,
    3,
    severity="CRITICAL",
    message="State corruption detected",
    description="State data appears to be corrupted.",
    remediation="Restore from golden snapshot or recapture state.",
)
367
+
368
+ # ============================================================================
369
+ # ERROR CODE REGISTRY
370
+ # ============================================================================
371
+
372
+
373
class ErrorCodeRegistry:
    """Class-level lookup table mapping formatted code strings to error codes.

    All state lives in the ``_codes`` class attribute, so every method is a
    classmethod and no instance is needed.
    """

    # Keyed by the formatted code string (``ErrorCode.code``).
    _codes: Dict[str, ErrorCode] = {}

    @classmethod
    def register(cls, error_code: ErrorCode) -> None:
        """Store ``error_code`` under its formatted code, replacing any existing entry."""
        key = error_code.code
        cls._codes[key] = error_code

    @classmethod
    def get(cls, code: str) -> Optional[ErrorCode]:
        """Return the entry registered under ``code``, or ``None`` if absent."""
        return cls._codes.get(code)

    @classmethod
    def search(cls, query: str) -> list[ErrorCode]:
        """Return entries whose message or description contains ``query``.

        Matching is a case-insensitive substring test.
        """
        needle = query.lower()
        matches = []
        for entry in cls._codes.values():
            haystacks = (entry.message, entry.description)
            if any(needle in text.lower() for text in haystacks):
                matches.append(entry)
        return matches

    @classmethod
    def list_by_domain(cls, domain: ErrorDomain) -> list[ErrorCode]:
        """Return every registered entry whose domain equals ``domain``."""
        return list(filter(lambda entry: entry.domain == domain, cls._codes.values()))

    @classmethod
    def all(cls) -> list[ErrorCode]:
        """Return all registered entries ordered by their formatted code string."""
        entries = list(cls._codes.values())
        entries.sort(key=lambda entry: entry.code)
        return entries
408
+
409
+
410
+ # Auto-register all error codes defined in this module
411
# Every error code constant defined in this module, one row per domain.
_error_codes = [
    E_RET_001, E_RET_002, E_RET_003,  # Retry
    E_CBR_001, E_CBR_002, E_CBR_003,  # Circuit Breaker
    E_ECL_001, E_ECL_002,  # Error Classification
    E_GLD_001, E_GLD_002, E_GLD_003, E_GLD_004, E_GLD_005,  # Golden Image
    E_VAL_001, E_VAL_002, E_VAL_003,  # Validation
    E_MOD_001, E_MOD_002, E_MOD_003,  # Processing Mode
    E_REQ_001, E_REQ_002,  # Request Tracking
    E_CON_001, E_CON_002, E_CON_003,  # Connection
    E_STA_001, E_STA_002, E_STA_003,  # State Management
]

# Populate the registry at import time so lookups work without extra setup.
for _code in _error_codes:
    ErrorCodeRegistry.register(_code)
452
+
453
+
454
+ # ============================================================================
455
+ # ERROR CODE EXCEPTIONS
456
+ # ============================================================================
457
+
458
+
459
class DoryError(Exception):
    """Base exception that carries a structured error code.

    The exception message is ``str(error_code)`` followed by optional
    ``Details:`` and ``Cause:`` lines. The inputs are also kept on the
    instance as ``error_code``, ``details`` and ``cause``.
    """

    def __init__(
        self,
        error_code: "ErrorCode",
        details: Optional[str] = None,
        cause: Optional[Exception] = None,
    ):
        """
        Initialize error with code.

        Args:
            error_code: The error code
            details: Additional context-specific details
            cause: Original exception that caused this error
        """
        self.error_code = error_code
        self.details = details
        self.cause = cause

        lines = [str(error_code)]
        if details:
            lines.append(f"Details: {details}")
        # Compare against None explicitly: an exception object may define
        # __bool__/__len__ and evaluate as falsy while still being a cause.
        if cause is not None:
            lines.append(f"Cause: {cause}")

        super().__init__("\n".join(lines))

        # Attach the cause to the exception chain so tracebacks show it even
        # when the caller raises without "from". Only real exceptions can be
        # assigned to __cause__; anything else is left on self.cause only.
        if isinstance(cause, BaseException):
            self.__cause__ = cause

    def format_full(self) -> str:
        """Return the error code's full report plus details and cause sections."""
        msg = self.error_code.format_full()
        if self.details:
            msg += f"\n\nAdditional Details:\n{self.details}"
        if self.cause is not None:
            msg += f"\n\nCaused by:\n{self.cause}"
        return msg
dory/health/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Health check and metrics HTTP server."""
2
+
3
from dory.health.server import HealthServer
from dory.health.probes import LivenessProbe, ReadinessProbe, StartupProbe

# Public API of the health package. StartupProbe is defined in
# dory.health.probes next to the other probes, so it is re-exported with them.
__all__ = [
    "HealthServer",
    "LivenessProbe",
    "ReadinessProbe",
    "StartupProbe",
]
dory/health/probes.py ADDED
@@ -0,0 +1,210 @@
1
+ """
2
+ Health probe implementations.
3
+
4
+ Provides liveness, readiness, and startup probes for Kubernetes.
5
+ """
6
+
7
+ import asyncio
8
+ import inspect
9
+ import logging
10
+ from abc import ABC, abstractmethod
11
+ from dataclasses import dataclass, field
12
+ from typing import Callable, Awaitable, Union
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ @dataclass
18
+ class ProbeResult:
19
+ """Result of a health probe check."""
20
+ healthy: bool
21
+ message: str = ""
22
+ details: dict = field(default_factory=dict)
23
+
24
+ def to_dict(self) -> dict:
25
+ """Convert to dictionary for JSON response."""
26
+ return {
27
+ "healthy": self.healthy,
28
+ "message": self.message,
29
+ "details": self.details,
30
+ }
31
+
32
+
33
+ class HealthProbe(ABC):
34
+ """Abstract base class for health probes."""
35
+
36
+ @abstractmethod
37
+ async def check(self) -> ProbeResult:
38
+ """
39
+ Perform health check.
40
+
41
+ Returns:
42
+ ProbeResult indicating health status
43
+ """
44
+ pass
45
+
46
+
47
+ class LivenessProbe(HealthProbe):
48
+ """
49
+ Liveness probe for Kubernetes.
50
+
51
+ Indicates whether the process is alive and should not be killed.
52
+ Failed liveness = Kubernetes restarts the pod.
53
+
54
+ Should be lightweight and always pass unless process is deadlocked.
55
+ """
56
+
57
+ def __init__(self):
58
+ """Initialize liveness probe."""
59
+ self._custom_checks: list[Callable[[], Union[bool, Awaitable[bool]]]] = []
60
+
61
+ def add_check(self, check: Callable[[], Union[bool, Awaitable[bool]]]) -> None:
62
+ """
63
+ Add custom liveness check.
64
+
65
+ Args:
66
+ check: Sync or async function returning True if healthy
67
+ """
68
+ self._custom_checks.append(check)
69
+
70
+ async def check(self) -> ProbeResult:
71
+ """
72
+ Perform liveness check.
73
+
74
+ Default implementation always returns healthy.
75
+ Override or add custom checks for specific requirements.
76
+ """
77
+ # Run custom checks
78
+ for i, custom_check in enumerate(self._custom_checks):
79
+ try:
80
+ # Handle both sync and async functions
81
+ if asyncio.iscoroutinefunction(custom_check):
82
+ result = await custom_check()
83
+ else:
84
+ result = custom_check()
85
+
86
+ if not result:
87
+ return ProbeResult(
88
+ healthy=False,
89
+ message=f"Custom liveness check {i} failed",
90
+ )
91
+ except Exception as e:
92
+ logger.error(f"Liveness check {i} error: {e}")
93
+ return ProbeResult(
94
+ healthy=False,
95
+ message=f"Custom liveness check {i} error: {e}",
96
+ )
97
+
98
+ return ProbeResult(healthy=True, message="Process is alive")
99
+
100
+
101
+ class ReadinessProbe(HealthProbe):
102
+ """
103
+ Readiness probe for Kubernetes.
104
+
105
+ Indicates whether the process is ready to receive traffic.
106
+ Failed readiness = Kubernetes removes pod from service endpoints.
107
+
108
+ Should check that all dependencies are available.
109
+ """
110
+
111
+ def __init__(self):
112
+ """Initialize readiness probe."""
113
+ self._ready = False
114
+ self._custom_checks: list[Callable[[], Union[bool, Awaitable[bool]]]] = []
115
+
116
+ def mark_ready(self) -> None:
117
+ """Mark the processor as ready to receive traffic."""
118
+ self._ready = True
119
+ logger.info("Processor marked as ready")
120
+
121
+ def mark_not_ready(self) -> None:
122
+ """Mark the processor as not ready."""
123
+ self._ready = False
124
+ logger.info("Processor marked as not ready")
125
+
126
+ def is_ready(self) -> bool:
127
+ """Check if currently marked as ready."""
128
+ return self._ready
129
+
130
+ def add_check(self, check: Callable[[], Union[bool, Awaitable[bool]]]) -> None:
131
+ """
132
+ Add custom readiness check.
133
+
134
+ Args:
135
+ check: Sync or async function returning True if ready
136
+ """
137
+ self._custom_checks.append(check)
138
+
139
+ async def check(self) -> ProbeResult:
140
+ """
141
+ Perform readiness check.
142
+
143
+ Returns not ready until explicitly marked ready.
144
+ Also runs any custom checks.
145
+ """
146
+ if not self._ready:
147
+ return ProbeResult(
148
+ healthy=False,
149
+ message="Processor not yet ready",
150
+ )
151
+
152
+ # Run custom checks
153
+ for i, custom_check in enumerate(self._custom_checks):
154
+ try:
155
+ # Handle both sync and async functions
156
+ if asyncio.iscoroutinefunction(custom_check):
157
+ result = await custom_check()
158
+ else:
159
+ result = custom_check()
160
+
161
+ if not result:
162
+ return ProbeResult(
163
+ healthy=False,
164
+ message=f"Custom readiness check {i} failed",
165
+ )
166
+ except Exception as e:
167
+ logger.error(f"Readiness check {i} error: {e}")
168
+ return ProbeResult(
169
+ healthy=False,
170
+ message=f"Custom readiness check {i} error: {e}",
171
+ )
172
+
173
+ return ProbeResult(healthy=True, message="Processor is ready")
174
+
175
+
176
+ class StartupProbe(HealthProbe):
177
+ """
178
+ Startup probe for Kubernetes.
179
+
180
+ Indicates whether the application has finished starting up.
181
+ Failed startup = Kubernetes keeps waiting (up to failureThreshold).
182
+
183
+ Useful for slow-starting applications.
184
+ """
185
+
186
+ def __init__(self, startup_complete_check: Callable[[], bool] | None = None):
187
+ """
188
+ Initialize startup probe.
189
+
190
+ Args:
191
+ startup_complete_check: Function returning True when startup is complete
192
+ """
193
+ self._startup_complete = False
194
+ self._startup_check = startup_complete_check
195
+
196
+ def mark_startup_complete(self) -> None:
197
+ """Mark startup as complete."""
198
+ self._startup_complete = True
199
+ logger.info("Startup marked as complete")
200
+
201
+ async def check(self) -> ProbeResult:
202
+ """Perform startup check."""
203
+ if self._startup_complete:
204
+ return ProbeResult(healthy=True, message="Startup complete")
205
+
206
+ if self._startup_check and self._startup_check():
207
+ self._startup_complete = True
208
+ return ProbeResult(healthy=True, message="Startup complete")
209
+
210
+ return ProbeResult(healthy=False, message="Still starting up")