dory-processor-sdk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +101 -0
- dory/auth/__init__.py +10 -0
- dory/auth/oauth2.py +153 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +137 -0
- dory/cli/templates.py +123 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +24 -0
- dory/config/loader.py +430 -0
- dory/config/presets.py +73 -0
- dory/config/schema.py +84 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +434 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +564 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +644 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +419 -0
- dory/errors/__init__.py +139 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +498 -0
- dory/geo/__init__.py +40 -0
- dory/geo/geolocalizer.py +1034 -0
- dory/health/__init__.py +12 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +635 -0
- dory/k8s/__init__.py +80 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/labels.py +505 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +148 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +46 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +325 -0
- dory/middleware/request_tracker.py +511 -0
- dory/migration/__init__.py +33 -0
- dory/migration/configmap.py +232 -0
- dory/migration/s3_store.py +594 -0
- dory/migration/serialization.py +135 -0
- dory/migration/state_manager.py +286 -0
- dory/migration/transfer.py +382 -0
- dory/monitoring/__init__.py +29 -0
- dory/monitoring/opentelemetry.py +489 -0
- dory/output/__init__.py +31 -0
- dory/output/envelope.py +137 -0
- dory/output/formatter.py +113 -0
- dory/output/rabbitmq.py +632 -0
- dory/output/routing.py +318 -0
- dory/output/validator.py +199 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +487 -0
- dory/recovery/golden_snapshot.py +713 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +482 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +183 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +457 -0
- dory/resilience/retry.py +389 -0
- dory/simple.py +342 -0
- dory/types.py +68 -0
- dory/utils/__init__.py +31 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_processor_sdk-0.0.1.dist-info/METADATA +424 -0
- dory_processor_sdk-0.0.1.dist-info/RECORD +86 -0
- dory_processor_sdk-0.0.1.dist-info/WHEEL +5 -0
- dory_processor_sdk-0.0.1.dist-info/entry_points.txt +2 -0
- dory_processor_sdk-0.0.1.dist-info/licenses/LICENSE +201 -0
- dory_processor_sdk-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,713 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Golden Snapshot Manager
|
|
3
|
+
|
|
4
|
+
Captures and manages golden snapshots of processor state to prevent
|
|
5
|
+
100% data loss during resets. Implements:
|
|
6
|
+
- Snapshot capture with checksums
|
|
7
|
+
- Versioned snapshot storage using StateManager
|
|
8
|
+
- Snapshot validation
|
|
9
|
+
- Restoration from snapshots
|
|
10
|
+
|
|
11
|
+
Storage: Uses StateManager to persist snapshots to the same backend
|
|
12
|
+
as processor state (ConfigMap in production, local file in development).
|
|
13
|
+
This ensures snapshots survive pod restarts and node failures.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
import time
|
|
21
|
+
from dataclasses import dataclass, field, asdict
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from typing import Any, Dict, Optional, List, Callable, TYPE_CHECKING
|
|
24
|
+
import gzip
|
|
25
|
+
import base64
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from dory.migration.state_manager import StateManager
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class SnapshotStorageError(Exception):
    """Error raised when a snapshot cannot be saved to or loaded from storage."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SnapshotValidationError(Exception):
    """Error raised when state data or a stored snapshot does not pass validation."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SnapshotFormat(Enum):
    """Names of the storage formats a snapshot payload can be tagged with."""

    # Plain UTF-8 JSON text.
    JSON = "json"
    # JSON text compressed with gzip.
    JSON_GZ = "json.gz"
    # Declared for completeness; not referenced elsewhere in the visible code.
    BINARY = "binary"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class SnapshotMetadata:
    """
    Metadata about a golden snapshot.

    Includes version, timestamps, checksums, and size information.
    """

    # Identifier of the snapshot.
    snapshot_id: str
    # Identifier of the processor the snapshot belongs to.
    processor_id: str
    # Creation time as a time.time() value (see age_seconds()).
    created_at: float
    # Version label of the state schema.
    state_version: str
    # Hex digest of the stored payload bytes.
    checksum: str
    # Length of the stored payload in bytes.
    size_bytes: int
    # True when the stored payload is gzip-compressed.
    compressed: bool = False
    # Storage format name.
    format: str = "json"
    validation_passed: bool = True
    # Number of times this snapshot has been restored.
    restore_count: int = 0
    # time.time() value of the most recent restore, or None if never restored.
    last_restored_at: Optional[float] = None
    # Free-form labels attached at capture time.
    tags: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the metadata as a plain dictionary (deep-copied by ``asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SnapshotMetadata":
        """
        Create metadata from a dictionary.

        Keys that are not declared fields are ignored, so metadata that
        carries additional keys (for example written by a different SDK
        version) can still be read instead of failing with ``TypeError``.

        Args:
            data: Mapping of field names to values.

        Returns:
            The reconstructed metadata object.

        Raises:
            TypeError: If a required field is missing from ``data``.
        """
        # Function-scope import: the module header only imports
        # dataclass/field/asdict.
        from dataclasses import fields as dataclass_fields

        known = {f.name for f in dataclass_fields(cls)}
        return cls(**{key: value for key, value in data.items() if key in known})

    def age_seconds(self) -> float:
        """Return the number of seconds elapsed since ``created_at``."""
        return time.time() - self.created_at
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class Snapshot:
    """
    A snapshot's metadata paired with the state data it captured.
    """

    metadata: SnapshotMetadata
    state_data: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        """Return a dictionary with ``metadata`` and ``state_data`` keys."""
        serialized_metadata = self.metadata.to_dict()
        return {"metadata": serialized_metadata, "state_data": self.state_data}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Snapshot":
        """Build a snapshot from a dictionary shaped like ``to_dict()`` output."""
        restored_metadata = SnapshotMetadata.from_dict(data["metadata"])
        payload = data["state_data"]
        return cls(metadata=restored_metadata, state_data=payload)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class GoldenSnapshotManager:
    """
    Manages golden snapshots of processor state.

    Features:
    - Automatic snapshot capture at key points
    - Checksum validation
    - Multiple snapshot versions
    - Compression support
    - Restore with validation
    - Snapshot lifecycle management
    - Persistent storage via StateManager (ConfigMap/S3/PVC)

    All snapshots of one processor are stored together under a single
    StateManager key as a list of entries, newest first. Each entry holds
    the snapshot metadata and the base64-encoded payload bytes.

    Usage:
        from dory.migration.state_manager import StateManager

        state_manager = StateManager(backend="configmap")
        manager = GoldenSnapshotManager(state_manager=state_manager)

        # Capture snapshot
        snapshot = await manager.capture_snapshot(
            processor_id="my-processor",
            state_data={"key": "value"},
            tags={"version": "1.0"}
        )

        # List snapshots
        snapshots = await manager.list_snapshots(processor_id="my-processor")

        # Restore from snapshot
        state = await manager.restore_snapshot(snapshot.metadata.snapshot_id)
    """

    # Key prefix for snapshot storage
    SNAPSHOT_KEY_PREFIX = "snapshot"

    def __init__(
        self,
        state_manager: "StateManager",
        max_snapshots_per_processor: int = 5,
        compression_enabled: bool = True,
        checksum_algorithm: str = "sha256",
        auto_cleanup: bool = True,
        on_capture: Optional[Callable] = None,
        on_restore: Optional[Callable] = None,
    ):
        """
        Initialize golden snapshot manager.

        Args:
            state_manager: StateManager instance for persistent storage
            max_snapshots_per_processor: Maximum snapshots per processor
            compression_enabled: Whether to compress snapshots
            checksum_algorithm: hashlib algorithm name for checksums
                (e.g. sha256, md5)
            auto_cleanup: Automatically cleanup old snapshots
            on_capture: Callback when snapshot is captured; may be sync or async
            on_restore: Callback when snapshot is restored; may be sync or async
        """
        self._state_manager = state_manager
        self.max_snapshots_per_processor = max_snapshots_per_processor
        self.compression_enabled = compression_enabled
        self.checksum_algorithm = checksum_algorithm
        self.auto_cleanup = auto_cleanup
        self.on_capture = on_capture
        self.on_restore = on_restore

        # In-memory cache of snapshot entries per processor (loaded on first access)
        self._snapshot_cache: Dict[str, List[Dict[str, Any]]] = {}
        self._cache_lock = asyncio.Lock()

        # Last millisecond timestamp handed out as part of a snapshot ID;
        # used to keep IDs unique when captures happen in the same millisecond.
        self._last_snapshot_ts = 0

        # Metrics
        self._capture_count = 0
        self._restore_count = 0
        self._validation_failures = 0

        logger.info(
            "GoldenSnapshotManager initialized: compression=%s, max_per_processor=%s",
            compression_enabled,
            max_snapshots_per_processor,
        )

    async def capture_snapshot(
        self,
        processor_id: str,
        state_data: Dict[str, Any],
        state_version: str = "1.0",
        tags: Optional[Dict[str, str]] = None,
        validate_before_save: bool = True,
    ) -> "Snapshot":
        """
        Capture a golden snapshot of processor state.

        Args:
            processor_id: ID of processor
            state_data: State data to snapshot
            state_version: Version of state schema
            tags: Optional tags for the snapshot
            validate_before_save: Validate state before saving

        Returns:
            Captured snapshot with metadata

        Raises:
            SnapshotValidationError: If validation fails
            SnapshotStorageError: If storage fails
        """
        logger.info("Capturing snapshot for processor %s", processor_id)

        # Validate state data
        if validate_before_save and not self._validate_state_data(state_data):
            raise SnapshotValidationError("State data validation failed")

        # Generate snapshot ID
        snapshot_id = self._generate_snapshot_id(processor_id)

        # Serialize state data (sorted keys give a stable JSON text)
        state_json = json.dumps(state_data, sort_keys=True)

        # Compress if enabled
        if self.compression_enabled:
            state_bytes = gzip.compress(state_json.encode())
            compressed = True
            format_type = SnapshotFormat.JSON_GZ.value
        else:
            state_bytes = state_json.encode()
            compressed = False
            format_type = SnapshotFormat.JSON.value

        # The checksum covers the bytes exactly as they are stored
        # (i.e. after compression), matching what restore verifies.
        checksum = self._calculate_checksum(state_bytes)

        # Create metadata; copy the tags so later caller-side mutation of
        # the passed dict does not leak into the snapshot.
        metadata = SnapshotMetadata(
            snapshot_id=snapshot_id,
            processor_id=processor_id,
            created_at=time.time(),
            state_version=state_version,
            checksum=checksum,
            size_bytes=len(state_bytes),
            compressed=compressed,
            format=format_type,
            tags=dict(tags) if tags else {},
        )

        # Create snapshot
        snapshot = Snapshot(metadata=metadata, state_data=state_data)

        # Save to storage
        try:
            await self._save_snapshot(snapshot, state_bytes)
        except Exception as e:
            logger.error("Failed to save snapshot: %s", e)
            raise SnapshotStorageError(f"Failed to save snapshot: {e}") from e

        # Update metrics
        self._capture_count += 1

        # Cleanup old snapshots if auto_cleanup enabled
        if self.auto_cleanup:
            await self._cleanup_old_snapshots(processor_id)

        # Call capture callback (failures are logged, never raised)
        if self.on_capture:
            await self._invoke_callback(self.on_capture, snapshot, "Capture")

        logger.info(
            "Snapshot captured: id=%s, size=%s bytes, compressed=%s, checksum=%s...",
            snapshot_id,
            len(state_bytes),
            compressed,
            checksum[:8],
        )

        return snapshot

    async def restore_snapshot(
        self,
        snapshot_id: str,
        validate_checksum: bool = True,
        update_metadata: bool = True,
    ) -> Dict[str, Any]:
        """
        Restore state from a snapshot.

        Args:
            snapshot_id: ID of snapshot to restore
            validate_checksum: Validate checksum before restoring
            update_metadata: Update metadata (restore count, timestamp)

        Returns:
            Restored state data

        Raises:
            SnapshotValidationError: If validation fails
            SnapshotStorageError: If snapshot not found or load fails
        """
        logger.info("Restoring snapshot %s", snapshot_id)

        # Load snapshot; any failure (not found, decode error, backend
        # error) is reported as a storage error.
        try:
            snapshot, state_bytes = await self._load_snapshot(snapshot_id)
        except Exception as e:
            logger.error("Failed to load snapshot: %s", e)
            raise SnapshotStorageError(f"Failed to load snapshot: {e}") from e

        # Validate checksum of the stored bytes against the recorded digest
        if validate_checksum:
            calculated_checksum = self._calculate_checksum(state_bytes)
            if calculated_checksum != snapshot.metadata.checksum:
                self._validation_failures += 1
                raise SnapshotValidationError(
                    f"Checksum mismatch: expected {snapshot.metadata.checksum}, "
                    f"got {calculated_checksum}"
                )

        # Update metadata
        if update_metadata:
            snapshot.metadata.restore_count += 1
            snapshot.metadata.last_restored_at = time.time()
            await self._update_metadata(snapshot.metadata)

        # Update metrics
        self._restore_count += 1

        # Call restore callback (failures are logged, never raised)
        if self.on_restore:
            await self._invoke_callback(self.on_restore, snapshot, "Restore")

        logger.info(
            "Snapshot restored: id=%s, restore_count=%s",
            snapshot_id,
            snapshot.metadata.restore_count,
        )

        return snapshot.state_data

    async def list_snapshots(
        self,
        processor_id: Optional[str] = None,
        limit: Optional[int] = None,
    ) -> "List[SnapshotMetadata]":
        """
        List available snapshots.

        Args:
            processor_id: Filter by processor ID (required - StateManager requires processor ID)
            limit: Maximum number of snapshots to return

        Returns:
            List of snapshot metadata in storage order (newest first).
            Entries whose metadata cannot be parsed are skipped with a warning.

        Raises:
            ValueError: If processor_id is not provided
        """
        if processor_id is None:
            raise ValueError("processor_id is required for listing snapshots")

        # Load snapshots from StateManager
        snapshot_entries = await self._load_all_snapshots(processor_id)

        # Extract metadata
        snapshots = []
        for entry in snapshot_entries:
            try:
                metadata = SnapshotMetadata.from_dict(entry["metadata"])
                snapshots.append(metadata)
            except Exception as e:
                logger.warning("Failed to parse snapshot metadata: %s", e)

        # No sort is needed here: _save_snapshot always inserts the new
        # entry at index 0, so the stored list is already newest first.
        if limit is not None:
            snapshots = snapshots[:limit]

        return snapshots

    async def delete_snapshot(self, snapshot_id: str) -> bool:
        """
        Delete a snapshot.

        Errors are logged and reported through the return value rather
        than raised.

        Args:
            snapshot_id: ID of snapshot to delete

        Returns:
            True if deleted successfully, False if the ID is malformed,
            the snapshot does not exist, or storage fails
        """
        logger.info("Deleting snapshot %s", snapshot_id)

        try:
            processor_id = self._processor_id_from_snapshot_id(snapshot_id)
            if processor_id is None:
                logger.error("Invalid snapshot ID format: %s", snapshot_id)
                return False

            # Load all snapshots
            snapshots = await self._load_all_snapshots(processor_id)

            # Build a new list without the target (the cached list is left
            # untouched until the write succeeds).
            remaining = [
                s for s in snapshots
                if s["metadata"]["snapshot_id"] != snapshot_id
            ]

            if len(remaining) == len(snapshots):
                logger.warning("Snapshot not found: %s", snapshot_id)
                return False

            # Save back (or delete if no snapshots left)
            if remaining:
                await self._save_all_snapshots(processor_id, remaining)
            else:
                # No snapshots left, delete the entire state entry
                storage_key = self._get_storage_key(processor_id)
                await self._state_manager.delete_state(storage_key)
                # Clear cache
                self._snapshot_cache.pop(processor_id, None)

            logger.info("Snapshot deleted: %s", snapshot_id)
            return True

        except Exception as e:
            logger.error("Failed to delete snapshot %s: %s", snapshot_id, e)
            return False

    async def get_latest_snapshot(
        self,
        processor_id: str
    ) -> "Optional[SnapshotMetadata]":
        """
        Get the latest snapshot for a processor.

        Args:
            processor_id: Processor ID

        Returns:
            Latest snapshot metadata, or None if no snapshots exist
        """
        snapshots = await self.list_snapshots(processor_id=processor_id, limit=1)
        return snapshots[0] if snapshots else None

    async def delete_all_snapshots(self, processor_id: str) -> int:
        """
        Delete all snapshots for a processor.

        Args:
            processor_id: Processor ID

        Returns:
            Number of snapshots deleted
        """
        logger.info("Deleting all snapshots for processor %s", processor_id)

        # Get count before deletion
        snapshots = await self._load_all_snapshots(processor_id)
        count = len(snapshots)

        if count == 0:
            return 0

        # Delete the entire state entry
        storage_key = self._get_storage_key(processor_id)
        await self._state_manager.delete_state(storage_key)

        # Clear cache
        self._snapshot_cache.pop(processor_id, None)

        logger.info("Deleted %s snapshots for processor %s", count, processor_id)
        return count

    def clear_cache(self, processor_id: Optional[str] = None) -> None:
        """
        Clear the in-memory snapshot cache.

        Args:
            processor_id: Clear cache for specific processor, or all if
                None (any falsy value, including "", clears everything)
        """
        if processor_id:
            self._snapshot_cache.pop(processor_id, None)
        else:
            self._snapshot_cache.clear()

    def get_stats(self) -> Dict[str, Any]:
        """
        Get snapshot manager statistics.

        Returns:
            Dictionary of statistics
        """
        # _backend is a private attribute of the state manager; fall back
        # to a placeholder instead of raising if it is not present.
        backend = getattr(self._state_manager, "_backend", "unknown")
        return {
            "storage_backend": str(backend),
            "capture_count": self._capture_count,
            "restore_count": self._restore_count,
            "validation_failures": self._validation_failures,
            "compression_enabled": self.compression_enabled,
            "max_snapshots_per_processor": self.max_snapshots_per_processor,
        }

    # Private methods

    async def _invoke_callback(
        self, callback: Callable, snapshot: "Snapshot", label: str
    ) -> None:
        """
        Run a user callback with the snapshot, awaiting the result if needed.

        The callback is called first and its return value awaited when it
        is awaitable, so plain functions, coroutine functions, partials of
        coroutine functions and async callable objects are all handled.
        Any exception is logged as a warning and suppressed.

        Args:
            callback: Callable taking the snapshot as its only argument
            snapshot: Snapshot passed to the callback
            label: Word used at the start of the warning message
                ("Capture" or "Restore")
        """
        # Function-scope import: inspect is not imported at module level.
        import inspect

        try:
            result = callback(snapshot)
            if inspect.isawaitable(result):
                await result
        except Exception as e:
            logger.warning("%s callback failed: %s", label, e)

    @staticmethod
    def _processor_id_from_snapshot_id(snapshot_id: str) -> Optional[str]:
        """
        Extract the processor ID from a snapshot ID.

        Snapshot IDs have the form ``{processor_id}_{timestamp}``; the
        split is done on the last underscore so processor IDs may contain
        underscores themselves.

        Returns:
            The processor ID, or None if the ID contains no underscore
        """
        parts = snapshot_id.rsplit("_", 1)
        if len(parts) != 2:
            return None
        return parts[0]

    def _get_storage_key(self, processor_id: str) -> str:
        """Get the StateManager key for storing snapshots."""
        return f"{processor_id}-{self.SNAPSHOT_KEY_PREFIX}"

    def _generate_snapshot_id(self, processor_id: str) -> str:
        """
        Generate unique snapshot ID of the form ``{processor_id}_{ms}``.

        The numeric part is the current time in milliseconds, bumped past
        the previously issued value when necessary so that two IDs issued
        by this manager in the same millisecond never collide.
        """
        timestamp = int(time.time() * 1000)
        last_issued = getattr(self, "_last_snapshot_ts", 0)
        if timestamp <= last_issued:
            timestamp = last_issued + 1
        self._last_snapshot_ts = timestamp
        return f"{processor_id}_{timestamp}"

    def _calculate_checksum(self, data: bytes) -> str:
        """
        Calculate the hex digest of ``data`` with the configured algorithm.

        Any fixed-length algorithm known to ``hashlib`` is accepted.

        Raises:
            ValueError: If the algorithm is unknown to hashlib or needs an
                explicit digest length (the SHAKE family)
        """
        algorithm = self.checksum_algorithm
        try:
            hasher = hashlib.new(algorithm)
            hasher.update(data)
            # Variable-length digests raise TypeError here because no
            # length is supplied.
            return hasher.hexdigest()
        except (ValueError, TypeError) as exc:
            raise ValueError(
                f"Unsupported checksum algorithm: {algorithm}"
            ) from exc

    def _validate_state_data(self, state_data: Dict[str, Any]) -> bool:
        """
        Validate state data before capture.

        Args:
            state_data: State data to validate

        Returns:
            True if the data is a dict and JSON serializable
        """
        # Basic validation
        if not isinstance(state_data, dict):
            logger.error("State data must be a dictionary")
            return False

        # Check if serializable
        try:
            json.dumps(state_data)
        except (TypeError, ValueError) as e:
            logger.error("State data is not JSON serializable: %s", e)
            return False

        return True

    async def _load_all_snapshots(self, processor_id: str) -> List[Dict[str, Any]]:
        """
        Load all snapshots for a processor from StateManager.

        The result is served from the in-memory cache when present;
        otherwise it is read from the StateManager and cached. The
        returned list is the cached object itself, not a copy.

        Returns:
            List of snapshot entries (each with 'metadata' and 'data_b64' keys)
        """
        # The lock is held across the backend read so concurrent callers
        # do not trigger duplicate loads for the same processor.
        async with self._cache_lock:
            # Check cache first
            if processor_id in self._snapshot_cache:
                return self._snapshot_cache[processor_id]

            storage_key = self._get_storage_key(processor_id)
            state = await self._state_manager.load_state(storage_key)

            if state is None:
                self._snapshot_cache[processor_id] = []
                return []

            snapshots = state.get("snapshots", [])
            self._snapshot_cache[processor_id] = snapshots
            return snapshots

    async def _save_all_snapshots(
        self, processor_id: str, snapshots: List[Dict[str, Any]]
    ) -> None:
        """
        Save all snapshots for a processor to StateManager.

        On success the cache is updated to the saved list. On failure the
        cache entry for the processor is dropped, because callers may have
        modified the cached list in place before calling this method; the
        next read then reloads the persisted state from the backend.

        Args:
            processor_id: Processor ID
            snapshots: List of snapshot entries
        """
        storage_key = self._get_storage_key(processor_id)

        state = {
            "processor_id": processor_id,
            "snapshot_count": len(snapshots),
            "snapshots": snapshots,
        }

        try:
            await self._state_manager.save_state(storage_key, state)
        except BaseException:
            # BaseException so task cancellation also invalidates the
            # cache; the exception is always re-raised unchanged.
            self._snapshot_cache.pop(processor_id, None)
            raise

        # Update cache
        async with self._cache_lock:
            self._snapshot_cache[processor_id] = snapshots

    async def _save_snapshot(self, snapshot: "Snapshot", state_bytes: bytes) -> None:
        """
        Save snapshot to StateManager.

        Args:
            snapshot: Snapshot whose metadata is stored
            state_bytes: Payload bytes to store (base64-encoded in the entry)
        """
        processor_id = snapshot.metadata.processor_id

        # Load existing snapshots
        snapshots = await self._load_all_snapshots(processor_id)

        # Create snapshot entry with base64-encoded data
        snapshot_entry = {
            "metadata": snapshot.metadata.to_dict(),
            "data_b64": base64.b64encode(state_bytes).decode("ascii"),
        }

        # Add new snapshot at the beginning (newest first)
        snapshots.insert(0, snapshot_entry)

        # Save back to StateManager
        await self._save_all_snapshots(processor_id, snapshots)

    async def _load_snapshot(self, snapshot_id: str) -> "tuple[Snapshot, bytes]":
        """
        Load snapshot from StateManager.

        Returns:
            Tuple of the reconstructed snapshot and the stored payload
            bytes (still compressed when the snapshot is compressed)

        Raises:
            SnapshotStorageError: If the ID is malformed or no entry with
                that ID exists
        """
        processor_id = self._processor_id_from_snapshot_id(snapshot_id)
        if processor_id is None:
            raise SnapshotStorageError(f"Invalid snapshot ID format: {snapshot_id}")

        # Load all snapshots for this processor
        snapshots = await self._load_all_snapshots(processor_id)

        # Find the specific snapshot
        snapshot_entry = next(
            (
                entry for entry in snapshots
                if entry["metadata"]["snapshot_id"] == snapshot_id
            ),
            None,
        )

        if snapshot_entry is None:
            raise SnapshotStorageError(f"Snapshot not found: {snapshot_id}")

        # Decode data
        state_bytes = base64.b64decode(snapshot_entry["data_b64"])
        metadata = SnapshotMetadata.from_dict(snapshot_entry["metadata"])

        # Decompress if needed
        if metadata.compressed:
            state_json = gzip.decompress(state_bytes).decode()
        else:
            state_json = state_bytes.decode()

        # Parse JSON
        state_data = json.loads(state_json)

        # Create snapshot object
        snapshot = Snapshot(metadata=metadata, state_data=state_data)

        return snapshot, state_bytes

    async def _update_metadata(self, metadata: "SnapshotMetadata") -> None:
        """
        Update snapshot metadata in StateManager.

        The entry whose snapshot_id matches is given the new metadata and
        the whole list is written back.
        """
        processor_id = metadata.processor_id

        # Load all snapshots
        snapshots = await self._load_all_snapshots(processor_id)

        # Find and update the specific snapshot
        for entry in snapshots:
            if entry["metadata"]["snapshot_id"] == metadata.snapshot_id:
                entry["metadata"] = metadata.to_dict()
                break

        # Save back
        await self._save_all_snapshots(processor_id, snapshots)

    async def _cleanup_old_snapshots(self, processor_id: str) -> int:
        """
        Cleanup old snapshots exceeding max limit.

        Args:
            processor_id: Processor ID

        Returns:
            Number of snapshots deleted
        """
        snapshots = await self._load_all_snapshots(processor_id)

        # A negative limit would turn the slice below into "drop from the
        # end", so treat it as zero.
        keep = max(self.max_snapshots_per_processor, 0)

        # Keep only max_snapshots_per_processor newest snapshots
        if len(snapshots) <= keep:
            return 0

        # Count snapshots to delete
        deleted_count = len(snapshots) - keep

        # The list is newest first, so the head of the list is what stays
        snapshots = snapshots[:keep]

        # Save back
        await self._save_all_snapshots(processor_id, snapshots)

        logger.info(
            "Cleaned up %s old snapshots for processor %s",
            deleted_count,
            processor_id,
        )

        return deleted_count
|