dory-sdk 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,480 @@
1
+ """
2
+ Golden image reset manager.
3
+
4
+ Handles state cleanup for fresh-start recovery after
5
+ repeated failures.
6
+
7
+ Implements graduated reset levels:
8
+ - SOFT: Clear caches only
9
+ - MODERATE: Clear session state, keep persistent data
10
+ - FULL: Delete all state
11
+ - FACTORY: Full reset + clear all metadata
12
+ """
13
+
14
+ import logging
15
+ from enum import Enum
16
+ from typing import TYPE_CHECKING, Optional, Dict, Any, List, Callable
17
+ from dataclasses import dataclass
18
+
19
+ from dory.utils.errors import DoryStateError
20
+
21
+ if TYPE_CHECKING:
22
+ from dory.migration.state_manager import StateManager
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class ResetLevel(Enum):
    """Reset severities, listed from least to most destructive."""

    # Clear caches only; all state is preserved.
    SOFT = "soft"
    # Clear session state while keeping persistent data.
    MODERATE = "moderate"
    # Delete all persisted state.
    FULL = "full"
    # Full reset plus clearing of metadata and restart counts.
    FACTORY = "factory"
35
+
36
+
37
@dataclass
class ResetResult:
    """
    Result of a reset operation.

    Attributes:
        success: True when the reset finished without raising an error.
        level: Reset level that was performed (or attempted).
        processor_id: ID of the processor the reset targeted.
        items_cleared: Number of items the reset reported as cleared.
        errors: Error messages collected during the reset. ``None`` is
            accepted at construction and replaced by a new empty list.
    """
    success: bool
    level: ResetLevel
    processor_id: str
    items_cleared: int = 0
    # None stands in for "not supplied"; __post_init__ swaps in a fresh
    # list so every instance owns its own error list.
    errors: Optional[List[str]] = None

    def __post_init__(self):
        """Normalize a missing ``errors`` value to an empty list."""
        if self.errors is None:
            self.errors = []
51
+
52
+
53
class GoldenImageManager:
    """
    Manages golden image reset operations with graduated reset levels.

    Reset Levels (in order of severity):
    1. SOFT: Clear caches only, preserve all state
    2. MODERATE: Clear session state, keep persistent data
    3. FULL: Delete all persisted state
    4. FACTORY: Full reset + clear metadata

    Golden image reset = delete all persisted state and restart fresh.
    Used when:
    1. State corruption is detected
    2. Restart count exceeds threshold
    3. Manual reset requested
    """

    def __init__(
        self,
        state_manager: "StateManager",
        reset_threshold: int = 3,
        soft_threshold: int = 1,
        moderate_threshold: int = 2,
        cache_manager: Optional["CacheResetManager"] = None,
        on_reset: Optional[Callable] = None,
    ):
        """
        Initialize golden image manager.

        Args:
            state_manager: State manager for state deletion
            reset_threshold: Restart count that triggers FULL reset
            soft_threshold: Restart count that triggers SOFT reset
            moderate_threshold: Restart count that triggers MODERATE reset
            cache_manager: Optional cache manager for SOFT resets; a new
                CacheResetManager is created when omitted
            on_reset: Optional callback (sync or async) invoked with the
                ResetResult after a successful reset
        """
        self._state_manager = state_manager
        self._reset_threshold = reset_threshold
        self._soft_threshold = soft_threshold
        self._moderate_threshold = moderate_threshold
        self._cache_manager = cache_manager or CacheResetManager()
        self._on_reset = on_reset

        # Metrics: number of successful resets performed per level
        self._reset_counts = {
            ResetLevel.SOFT: 0,
            ResetLevel.MODERATE: 0,
            ResetLevel.FULL: 0,
            ResetLevel.FACTORY: 0,
        }

    @staticmethod
    async def _invoke_callback(callback: Callable, *args: Any) -> None:
        """Call ``callback(*args)`` and await the result when it is awaitable."""
        # Imported locally so this class does not rely on a new
        # module-level import.
        import inspect

        outcome = callback(*args)
        # Checking the returned value (rather than the callable) also covers
        # lambdas and callable objects that hand back a coroutine.
        if inspect.isawaitable(outcome):
            await outcome

    def should_reset(self, restart_count: int) -> bool:
        """
        Check if golden image reset should be triggered.

        Args:
            restart_count: Current restart count

        Returns:
            True if restart_count has reached the soft threshold
        """
        if restart_count >= self._soft_threshold:
            logger.warning(
                f"Restart count {restart_count} >= threshold {self._soft_threshold}, "
                "recommending reset"
            )
            return True
        return False

    def determine_reset_level(
        self,
        restart_count: int,
        state_corrupted: bool = False,
        manual_factory: bool = False,
    ) -> ResetLevel:
        """
        Determine appropriate reset level based on conditions.

        Args:
            restart_count: Current restart count
            state_corrupted: Whether state corruption is detected
            manual_factory: Whether factory reset is manually requested

        Returns:
            Recommended reset level

        Logic:
            - Factory reset if manually requested
            - Full reset if state corrupted
            - Graduated by restart count: SOFT -> MODERATE -> FULL
            - SOFT is also returned when restart_count is below the soft
              threshold; use should_reset() to decide whether to reset at all
        """
        # Manual factory reset takes precedence over everything else
        if manual_factory:
            logger.info("Factory reset manually requested")
            return ResetLevel.FACTORY

        # State corruption detected -> FULL reset
        if state_corrupted:
            logger.warning("State corruption detected, recommending FULL reset")
            return ResetLevel.FULL

        # Graduated by restart count, checked from most to least severe
        if restart_count >= self._reset_threshold:
            return ResetLevel.FULL
        elif restart_count >= self._moderate_threshold:
            return ResetLevel.MODERATE
        elif restart_count >= self._soft_threshold:
            return ResetLevel.SOFT
        else:
            # No reset needed
            return ResetLevel.SOFT  # Default to softest

    async def reset(
        self,
        processor_id: str,
        level: Optional[ResetLevel] = None,
        restart_count: int = 0,
    ) -> ResetResult:
        """
        Perform golden image reset with specified or auto-determined level.

        On success the per-level counter is incremented and the ``on_reset``
        callback (if any) is invoked with the result. A failing callback is
        logged as a warning and does not change the returned result.

        Args:
            processor_id: Processor ID to reset
            level: Reset level (auto-determined if None)
            restart_count: Current restart count (for auto-determination)

        Returns:
            ResetResult with success status and details
        """
        # Determine level if not specified
        if level is None:
            level = self.determine_reset_level(restart_count)

        logger.warning(
            f"Performing {level.value.upper()} reset for processor {processor_id}"
        )

        # Perform reset based on level
        if level == ResetLevel.SOFT:
            result = await self._soft_reset(processor_id)
        elif level == ResetLevel.MODERATE:
            result = await self._moderate_reset(processor_id)
        elif level == ResetLevel.FULL:
            result = await self._full_reset(processor_id)
        elif level == ResetLevel.FACTORY:
            result = await self._factory_reset(processor_id)
        else:
            logger.error(f"Unknown reset level: {level}")
            return ResetResult(
                success=False,
                level=level,
                processor_id=processor_id,
                errors=[f"Unknown reset level: {level}"],
            )

        # Update metrics
        if result.success:
            self._reset_counts[level] += 1

        # Call reset callback (sync or async); its failure must not mask
        # the outcome of the reset itself
        if self._on_reset and result.success:
            try:
                await self._invoke_callback(self._on_reset, result)
            except Exception as e:
                logger.warning(f"Reset callback failed: {e}")

        return result

    async def _soft_reset(self, processor_id: str) -> ResetResult:
        """
        SOFT reset: Clear caches only, preserve all state.

        Args:
            processor_id: Processor ID

        Returns:
            ResetResult whose items_cleared is the number of caches cleared;
            any exception is reported as a failed result instead of raised
        """
        logger.info(f"Performing SOFT reset for {processor_id} (cache clear only)")

        try:
            cleared_count = await self._cache_manager.clear_all_caches()

            return ResetResult(
                success=True,
                level=ResetLevel.SOFT,
                processor_id=processor_id,
                items_cleared=cleared_count,
            )

        except Exception as e:
            logger.error(f"SOFT reset failed: {e}")
            return ResetResult(
                success=False,
                level=ResetLevel.SOFT,
                processor_id=processor_id,
                errors=[str(e)],
            )

    async def _moderate_reset(self, processor_id: str) -> ResetResult:
        """
        MODERATE reset: Clear session state, keep persistent data.

        Currently this only clears caches; session-level state clearing is
        not implemented yet (see TODO below).

        Args:
            processor_id: Processor ID

        Returns:
            ResetResult; any exception is reported as a failed result
        """
        logger.info(f"Performing MODERATE reset for {processor_id}")

        errors = []
        items_cleared = 0

        try:
            # Clear caches
            cleared_count = await self._cache_manager.clear_all_caches()
            items_cleared += cleared_count

            # Clear session-level state (if state manager supports it)
            # For now, this is a placeholder - implement based on state_manager API
            # TODO: Add session-level state clearing when available

            return ResetResult(
                success=True,
                level=ResetLevel.MODERATE,
                processor_id=processor_id,
                items_cleared=items_cleared,
            )

        except Exception as e:
            logger.error(f"MODERATE reset failed: {e}")
            errors.append(str(e))
            return ResetResult(
                success=False,
                level=ResetLevel.MODERATE,
                processor_id=processor_id,
                items_cleared=items_cleared,
                errors=errors,
            )

    async def _full_reset(self, processor_id: str) -> ResetResult:
        """
        FULL reset: Delete all persisted state.

        Only DoryStateError is converted into a failed result; any other
        exception propagates to the caller. items_cleared reflects the state
        deletion only (1 or 0), not the number of caches cleared.

        Args:
            processor_id: Processor ID

        Returns:
            ResetResult
        """
        logger.info(f"Performing FULL reset for {processor_id}")

        try:
            # Clear caches first
            await self._cache_manager.clear_all_caches()

            # Delete all state
            deleted = await self._state_manager.delete_state(processor_id)

            if deleted:
                logger.info(f"FULL reset complete for {processor_id}")
            else:
                logger.info(f"No state to delete for {processor_id}")

            return ResetResult(
                success=True,
                level=ResetLevel.FULL,
                processor_id=processor_id,
                items_cleared=1 if deleted else 0,
            )

        except DoryStateError as e:
            logger.error(f"FULL reset failed: {e}")
            return ResetResult(
                success=False,
                level=ResetLevel.FULL,
                processor_id=processor_id,
                errors=[str(e)],
            )

    async def _factory_reset(self, processor_id: str) -> ResetResult:
        """
        FACTORY reset: Full reset + clear all metadata and counters.

        Currently this clears caches and deletes state; clearing restart
        counts, metrics and snapshots is not implemented yet (see TODO below).

        Args:
            processor_id: Processor ID

        Returns:
            ResetResult; any exception is reported as a failed result
        """
        logger.warning(f"Performing FACTORY reset for {processor_id}")

        errors = []
        items_cleared = 0

        try:
            # Clear caches
            cache_cleared = await self._cache_manager.clear_all_caches()
            items_cleared += cache_cleared

            # Delete all state
            deleted = await self._state_manager.delete_state(processor_id)
            if deleted:
                items_cleared += 1

            # TODO: Clear restart counts, metrics, snapshots
            # This would require additional manager references

            logger.info(f"FACTORY reset complete for {processor_id}")

            return ResetResult(
                success=True,
                level=ResetLevel.FACTORY,
                processor_id=processor_id,
                items_cleared=items_cleared,
            )

        except Exception as e:
            logger.error(f"FACTORY reset failed: {e}")
            errors.append(str(e))
            return ResetResult(
                success=False,
                level=ResetLevel.FACTORY,
                processor_id=processor_id,
                items_cleared=items_cleared,
                errors=errors,
            )

    def get_reset_stats(self) -> Dict[str, int]:
        """
        Get reset statistics.

        Returns:
            Dictionary mapping each level's string value to the number of
            successful resets performed at that level
        """
        return {
            level.value: count
            for level, count in self._reset_counts.items()
        }

    async def reset_with_callback(
        self,
        processor_id: str,
        pre_reset_callback=None,
        post_reset_callback=None,
    ) -> bool:
        """
        Perform golden image reset with callbacks.

        The reset level is auto-determined by reset(). Callback failures are
        logged and never abort the reset or change the return value.

        Args:
            processor_id: Processor ID to reset
            pre_reset_callback: Zero-argument callback (sync or async) run
                before the reset
            post_reset_callback: Zero-argument callback (sync or async) run
                after the reset, only when the reset succeeded

        Returns:
            True if reset was successful
        """
        # Pre-reset callback
        if pre_reset_callback:
            try:
                await self._invoke_callback(pre_reset_callback)
            except Exception as e:
                logger.error(f"Pre-reset callback failed: {e}")

        # Perform reset. reset() returns a ResetResult, which is always
        # truthy, so the outcome must be read from its ``success`` flag.
        result = await self.reset(processor_id)
        success = result.success

        # Post-reset callback
        if post_reset_callback and success:
            try:
                await self._invoke_callback(post_reset_callback)
            except Exception as e:
                logger.error(f"Post-reset callback failed: {e}")

        return success
433
+
434
+
435
class CacheResetManager:
    """
    Manages cache clearing during recovery.

    Holds a list of registered clear callbacks and runs them on demand.
    Used for lighter recovery than full golden image reset.
    """

    def __init__(self):
        """Initialize cache reset manager with no registered callbacks."""
        self._cache_clear_callbacks: list = []

    def register_cache(self, clear_callback) -> None:
        """
        Register a cache clear callback.

        Args:
            clear_callback: Zero-argument function (sync or async) to call
                to clear a cache
        """
        self._cache_clear_callbacks.append(clear_callback)

    async def clear_all_caches(self) -> int:
        """
        Clear all registered caches.

        Callbacks run in registration order. A callback that raises is
        logged and skipped; the remaining callbacks still run.

        Returns:
            Number of callbacks that completed without raising
        """
        # Imported locally so this class does not rely on a new
        # module-level import.
        import inspect

        cleared = 0

        for callback in self._cache_clear_callbacks:
            try:
                outcome = callback()
                # Await whatever awaitable the callback hands back. Testing
                # the callable with iscoroutinefunction would miss lambdas
                # and callable objects that return a coroutine, leaving the
                # coroutine un-run while still counting the cache as cleared.
                if inspect.isawaitable(outcome):
                    await outcome
                cleared += 1
            except Exception as e:
                logger.error(f"Cache clear failed: {e}")

        logger.info(f"Cleared {cleared} caches")
        return cleared
477
+
478
+
479
+ # Regular runtime import (not guarded by TYPE_CHECKING), placed after the class definitions
480
+ import asyncio