dory_sdk-2.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/recovery/golden_snapshot.py
@@ -0,0 +1,561 @@
+ """
+ Golden Snapshot Manager
+
+ Captures and manages golden snapshots of processor state to prevent
+ 100% data loss during resets. Implements:
+ - Snapshot capture with checksums
+ - Versioned snapshot storage
+ - Snapshot validation
+ - Restoration from snapshots
+ """
+
+ import asyncio
+ import hashlib
+ import json
+ import logging
+ import time
+ from dataclasses import dataclass, field, asdict
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, Optional, List, Callable
+ import gzip
+
+ logger = logging.getLogger(__name__)
+
+
+ class SnapshotStorageError(Exception):
+     """Raised when snapshot storage operations fail."""
+     pass
+
+
+ class SnapshotValidationError(Exception):
+     """Raised when snapshot validation fails."""
+     pass
+
+
+ class SnapshotFormat(Enum):
+     """Snapshot storage format."""
+     JSON = "json"
+     JSON_GZ = "json.gz"  # Compressed JSON
+     BINARY = "binary"
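+     # Note: only JSON and JSON_GZ are actually written by
+     # GoldenSnapshotManager below; BINARY is defined but unused here.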
+
+
+ @dataclass
+ class SnapshotMetadata:
+     """
+     Metadata about a golden snapshot.
+
+     Includes version, timestamps, checksums, and size information.
+     """
+     snapshot_id: str
+     processor_id: str
+     created_at: float
+     state_version: str
+     checksum: str
+     size_bytes: int
+     compressed: bool = False
+     format: str = "json"
+     validation_passed: bool = True
+     restore_count: int = 0
+     last_restored_at: Optional[float] = None
+     tags: Dict[str, str] = field(default_factory=dict)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "SnapshotMetadata":
+         """Create from dictionary."""
+         return cls(**data)
+
+     def age_seconds(self) -> float:
+         """Get age of snapshot in seconds."""
+         return time.time() - self.created_at
+
+
+ @dataclass
+ class Snapshot:
+     """
+     Complete snapshot including metadata and state data.
+     """
+     metadata: SnapshotMetadata
+     state_data: Dict[str, Any]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "metadata": self.metadata.to_dict(),
+             "state_data": self.state_data
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "Snapshot":
+         """Create from dictionary."""
+         return cls(
+             metadata=SnapshotMetadata.from_dict(data["metadata"]),
+             state_data=data["state_data"]
+         )
+
+
+ class GoldenSnapshotManager:
+     """
+     Manages golden snapshots of processor state.
+
+     Features:
+     - Automatic snapshot capture at key points
+     - Checksum validation
+     - Multiple snapshot versions
+     - Compression support
+     - Restore with validation
+     - Snapshot lifecycle management
+
+     Usage:
+         manager = GoldenSnapshotManager(storage_path="./snapshots")
+
+         # Capture snapshot
+         snapshot = await manager.capture_snapshot(
+             processor_id="my-processor",
+             state_data={"key": "value"},
+             tags={"version": "1.0"}
+         )
+
+         # List snapshots
+         snapshots = await manager.list_snapshots(processor_id="my-processor")
+
+         # Restore from snapshot
+         state = await manager.restore_snapshot(snapshot.metadata.snapshot_id)
+     """
+
+     def __init__(
+         self,
+         storage_path: str = "./golden_snapshots",
+         max_snapshots_per_processor: int = 5,
+         compression_enabled: bool = True,
+         checksum_algorithm: str = "sha256",
+         auto_cleanup: bool = True,
+         on_capture: Optional[Callable] = None,
+         on_restore: Optional[Callable] = None,
+     ):
+         """
+         Initialize golden snapshot manager.
+
+         Args:
+             storage_path: Directory to store snapshots
+             max_snapshots_per_processor: Maximum snapshots per processor
+             compression_enabled: Whether to compress snapshots
+             checksum_algorithm: Algorithm for checksums (sha256, md5)
+             auto_cleanup: Automatically cleanup old snapshots
+             on_capture: Callback when snapshot is captured
+             on_restore: Callback when snapshot is restored
+         """
+         self.storage_path = Path(storage_path)
+         self.max_snapshots_per_processor = max_snapshots_per_processor
+         self.compression_enabled = compression_enabled
+         self.checksum_algorithm = checksum_algorithm
+         self.auto_cleanup = auto_cleanup
+         self.on_capture = on_capture
+         self.on_restore = on_restore
+
+         # Create storage directory
+         self.storage_path.mkdir(parents=True, exist_ok=True)
+
+         # Metrics
+         self._capture_count = 0
+         self._restore_count = 0
+         self._validation_failures = 0
+
+         logger.info(
+             f"GoldenSnapshotManager initialized: storage={storage_path}, "
+             f"compression={compression_enabled}, max_per_processor={max_snapshots_per_processor}"
+         )
+
+     async def capture_snapshot(
+         self,
+         processor_id: str,
+         state_data: Dict[str, Any],
+         state_version: str = "1.0",
+         tags: Optional[Dict[str, str]] = None,
+         validate_before_save: bool = True,
+     ) -> Snapshot:
+         """
+         Capture a golden snapshot of processor state.
+
+         Args:
+             processor_id: ID of processor
+             state_data: State data to snapshot
+             state_version: Version of state schema
+             tags: Optional tags for the snapshot
+             validate_before_save: Validate state before saving
+
+         Returns:
+             Captured snapshot with metadata
+
+         Raises:
+             SnapshotValidationError: If validation fails
+             SnapshotStorageError: If storage fails
+         """
+         logger.info(f"Capturing snapshot for processor {processor_id}")
+
+         # Validate state data
+         if validate_before_save:
+             if not self._validate_state_data(state_data):
+                 raise SnapshotValidationError("State data validation failed")
+
+         # Generate snapshot ID
+         snapshot_id = self._generate_snapshot_id(processor_id)
+
+         # Serialize state data
+         state_json = json.dumps(state_data, sort_keys=True)
+
+         # Compress if enabled
+         if self.compression_enabled:
+             state_bytes = gzip.compress(state_json.encode())
+             compressed = True
+             format_type = SnapshotFormat.JSON_GZ.value
+         else:
+             state_bytes = state_json.encode()
+             compressed = False
+             format_type = SnapshotFormat.JSON.value
+
+         # Calculate checksum
+         checksum = self._calculate_checksum(state_bytes)
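+         # The checksum covers the stored bytes (the gzip payload when
+         # compression is enabled), so restore_snapshot() verifies integrity
+         # before decompressing.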
+
+         # Create metadata
+         metadata = SnapshotMetadata(
+             snapshot_id=snapshot_id,
+             processor_id=processor_id,
+             created_at=time.time(),
+             state_version=state_version,
+             checksum=checksum,
+             size_bytes=len(state_bytes),
+             compressed=compressed,
+             format=format_type,
+             tags=tags or {},
+         )
+
+         # Create snapshot
+         snapshot = Snapshot(metadata=metadata, state_data=state_data)
+
+         # Save to storage
+         try:
+             await self._save_snapshot(snapshot, state_bytes)
+         except Exception as e:
+             logger.error(f"Failed to save snapshot: {e}")
+             raise SnapshotStorageError(f"Failed to save snapshot: {e}")
+
+         # Update metrics
+         self._capture_count += 1
+
+         # Cleanup old snapshots if auto_cleanup enabled
+         if self.auto_cleanup:
+             await self._cleanup_old_snapshots(processor_id)
+
+         # Call capture callback
+         if self.on_capture:
+             try:
+                 if asyncio.iscoroutinefunction(self.on_capture):
+                     await self.on_capture(snapshot)
+                 else:
+                     self.on_capture(snapshot)
+             except Exception as e:
+                 logger.warning(f"Capture callback failed: {e}")
+
+         logger.info(
+             f"Snapshot captured: id={snapshot_id}, size={len(state_bytes)} bytes, "
+             f"compressed={compressed}, checksum={checksum[:8]}..."
+         )
+
+         return snapshot
+
+     async def restore_snapshot(
+         self,
+         snapshot_id: str,
+         validate_checksum: bool = True,
+         update_metadata: bool = True,
+     ) -> Dict[str, Any]:
+         """
+         Restore state from a snapshot.
+
+         Args:
+             snapshot_id: ID of snapshot to restore
+             validate_checksum: Validate checksum before restoring
+             update_metadata: Update metadata (restore count, timestamp)
+
+         Returns:
+             Restored state data
+
+         Raises:
+             SnapshotValidationError: If validation fails
+             SnapshotStorageError: If snapshot not found or load fails
+         """
+         logger.info(f"Restoring snapshot {snapshot_id}")
+
+         # Load snapshot
+         try:
+             snapshot, state_bytes = await self._load_snapshot(snapshot_id)
+         except Exception as e:
+             logger.error(f"Failed to load snapshot: {e}")
+             raise SnapshotStorageError(f"Failed to load snapshot: {e}")
+
+         # Validate checksum
+         if validate_checksum:
+             calculated_checksum = self._calculate_checksum(state_bytes)
+             if calculated_checksum != snapshot.metadata.checksum:
+                 self._validation_failures += 1
+                 raise SnapshotValidationError(
+                     f"Checksum mismatch: expected {snapshot.metadata.checksum}, "
+                     f"got {calculated_checksum}"
+                 )
+
+         # Update metadata
+         if update_metadata:
+             snapshot.metadata.restore_count += 1
+             snapshot.metadata.last_restored_at = time.time()
+             await self._update_metadata(snapshot.metadata)
+
+         # Update metrics
+         self._restore_count += 1
+
+         # Call restore callback
+         if self.on_restore:
+             try:
+                 if asyncio.iscoroutinefunction(self.on_restore):
+                     await self.on_restore(snapshot)
+                 else:
+                     self.on_restore(snapshot)
+             except Exception as e:
+                 logger.warning(f"Restore callback failed: {e}")
+
+         logger.info(
+             f"Snapshot restored: id={snapshot_id}, "
+             f"restore_count={snapshot.metadata.restore_count}"
+         )
+
+         return snapshot.state_data
+
+     async def list_snapshots(
+         self,
+         processor_id: Optional[str] = None,
+         limit: Optional[int] = None,
+     ) -> List[SnapshotMetadata]:
+         """
+         List available snapshots.
+
+         Args:
+             processor_id: Filter by processor ID (None for all)
+             limit: Maximum number of snapshots to return
+
+         Returns:
+             List of snapshot metadata, sorted by created_at (newest first)
+         """
+         snapshots = []
+
+         # Iterate through storage directory
+         for meta_file in self.storage_path.glob("*.meta.json"):
+             try:
+                 with open(meta_file, "r") as f:
+                     meta_data = json.load(f)
+                 metadata = SnapshotMetadata.from_dict(meta_data)
+
+                 # Filter by processor_id if specified
+                 if processor_id is None or metadata.processor_id == processor_id:
+                     snapshots.append(metadata)
+             except Exception as e:
+                 logger.warning(f"Failed to load metadata from {meta_file}: {e}")
+
+         # Sort by created_at (newest first)
+         snapshots.sort(key=lambda x: x.created_at, reverse=True)
+
+         # Apply limit
+         if limit is not None:
+             snapshots = snapshots[:limit]
+
+         return snapshots
+
+     async def delete_snapshot(self, snapshot_id: str) -> bool:
+         """
+         Delete a snapshot.
+
+         Args:
+             snapshot_id: ID of snapshot to delete
+
+         Returns:
+             True if deleted successfully
+         """
+         logger.info(f"Deleting snapshot {snapshot_id}")
+
+         try:
+             # Delete data file
+             data_file = self.storage_path / f"{snapshot_id}.data"
+             if data_file.exists():
+                 data_file.unlink()
+
+             # Delete metadata file
+             meta_file = self.storage_path / f"{snapshot_id}.meta.json"
+             if meta_file.exists():
+                 meta_file.unlink()
+
+             logger.info(f"Snapshot deleted: {snapshot_id}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to delete snapshot {snapshot_id}: {e}")
+             return False
+
+     async def get_latest_snapshot(
+         self,
+         processor_id: str
+     ) -> Optional[SnapshotMetadata]:
+         """
+         Get the latest snapshot for a processor.
+
+         Args:
+             processor_id: Processor ID
+
+         Returns:
+             Latest snapshot metadata, or None if no snapshots exist
+         """
+         snapshots = await self.list_snapshots(processor_id=processor_id, limit=1)
+         return snapshots[0] if snapshots else None
+
+     def get_stats(self) -> Dict[str, Any]:
+         """
+         Get snapshot manager statistics.
+
+         Returns:
+             Dictionary of statistics
+         """
+         return {
+             "storage_path": str(self.storage_path),
+             "capture_count": self._capture_count,
+             "restore_count": self._restore_count,
+             "validation_failures": self._validation_failures,
+             "compression_enabled": self.compression_enabled,
+             "max_snapshots_per_processor": self.max_snapshots_per_processor,
+         }
+
+     # Private methods
+
+     def _generate_snapshot_id(self, processor_id: str) -> str:
+         """Generate unique snapshot ID."""
+         timestamp = int(time.time() * 1000)
+         return f"{processor_id}_{timestamp}"
+
+     def _calculate_checksum(self, data: bytes) -> str:
+         """Calculate checksum for data."""
+         if self.checksum_algorithm == "sha256":
+             return hashlib.sha256(data).hexdigest()
+         elif self.checksum_algorithm == "md5":
+             return hashlib.md5(data).hexdigest()
+         else:
+             raise ValueError(f"Unsupported checksum algorithm: {self.checksum_algorithm}")
+
+     def _validate_state_data(self, state_data: Dict[str, Any]) -> bool:
+         """
+         Validate state data before capture.
+
+         Args:
+             state_data: State data to validate
+
+         Returns:
+             True if valid
+         """
+         # Basic validation
+         if not isinstance(state_data, dict):
+             logger.error("State data must be a dictionary")
+             return False
+
+         # Check if serializable
+         try:
+             json.dumps(state_data)
+         except (TypeError, ValueError) as e:
+             logger.error(f"State data is not JSON serializable: {e}")
+             return False
+
+         return True
+
+     async def _save_snapshot(self, snapshot: Snapshot, state_bytes: bytes) -> None:
+         """Save snapshot to storage."""
+         snapshot_id = snapshot.metadata.snapshot_id
+
+         # Save data file
+         data_file = self.storage_path / f"{snapshot_id}.data"
+         with open(data_file, "wb") as f:
+             f.write(state_bytes)
+
+         # Save metadata file
+         meta_file = self.storage_path / f"{snapshot_id}.meta.json"
+         with open(meta_file, "w") as f:
+             json.dump(snapshot.metadata.to_dict(), f, indent=2)
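+
+     # Note: the storage helpers use synchronous file I/O inside async
+     # methods; callers on a busy event loop may want to offload these
+     # calls (for example via asyncio.to_thread).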
+
+     async def _load_snapshot(self, snapshot_id: str) -> tuple[Snapshot, bytes]:
+         """Load snapshot from storage."""
+         # Load metadata
+         meta_file = self.storage_path / f"{snapshot_id}.meta.json"
+         if not meta_file.exists():
+             raise SnapshotStorageError(f"Snapshot metadata not found: {snapshot_id}")
+
+         with open(meta_file, "r") as f:
+             meta_data = json.load(f)
+         metadata = SnapshotMetadata.from_dict(meta_data)
+
+         # Load data
+         data_file = self.storage_path / f"{snapshot_id}.data"
+         if not data_file.exists():
+             raise SnapshotStorageError(f"Snapshot data not found: {snapshot_id}")
+
+         with open(data_file, "rb") as f:
+             state_bytes = f.read()
+
+         # Decompress if needed
+         if metadata.compressed:
+             state_json = gzip.decompress(state_bytes).decode()
+         else:
+             state_json = state_bytes.decode()
+
+         # Parse JSON
+         state_data = json.loads(state_json)
+
+         # Create snapshot object
+         snapshot = Snapshot(metadata=metadata, state_data=state_data)
+
+         return snapshot, state_bytes
+
+     async def _update_metadata(self, metadata: SnapshotMetadata) -> None:
+         """Update snapshot metadata."""
+         meta_file = self.storage_path / f"{metadata.snapshot_id}.meta.json"
+         with open(meta_file, "w") as f:
+             json.dump(metadata.to_dict(), f, indent=2)
+
+     async def _cleanup_old_snapshots(self, processor_id: str) -> int:
+         """
+         Cleanup old snapshots exceeding max limit.
+
+         Args:
+             processor_id: Processor ID
+
+         Returns:
+             Number of snapshots deleted
+         """
+         snapshots = await self.list_snapshots(processor_id=processor_id)
+
+         # Keep only max_snapshots_per_processor newest snapshots
+         if len(snapshots) <= self.max_snapshots_per_processor:
+             return 0
+
+         # Delete excess snapshots
+         to_delete = snapshots[self.max_snapshots_per_processor:]
+         deleted_count = 0
+
+         for metadata in to_delete:
+             if await self.delete_snapshot(metadata.snapshot_id):
+                 deleted_count += 1
+
+         if deleted_count > 0:
+             logger.info(
+                 f"Cleaned up {deleted_count} old snapshots for processor {processor_id}"
+             )
+
+         return deleted_count
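
For orientation, here is a minimal end-to-end sketch of the lifecycle this module implements (capture, latest, restore, stats). It assumes the import path mirrors the file layout above (dory.recovery.golden_snapshot); the processor ID, state payload, and tags are illustrative, not part of the package:

import asyncio

from dory.recovery.golden_snapshot import GoldenSnapshotManager

async def main() -> None:
    manager = GoldenSnapshotManager(storage_path="./golden_snapshots")

    # Capture: validates, serializes, compresses (by default), checksums,
    # and writes <id>.data plus <id>.meta.json under storage_path.
    snapshot = await manager.capture_snapshot(
        processor_id="demo-processor",
        state_data={"offset": 42, "pending": []},
        tags={"reason": "pre-upgrade"},
    )

    # Newest first; get_latest_snapshot() wraps list_snapshots(limit=1).
    latest = await manager.get_latest_snapshot("demo-processor")
    assert latest is not None
    assert latest.snapshot_id == snapshot.metadata.snapshot_id

    # Restore verifies the checksum of the stored bytes before returning
    # the deserialized state dict.
    state = await manager.restore_snapshot(latest.snapshot_id)
    print(state["offset"], manager.get_stats()["restore_count"])

asyncio.run(main())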