dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dory/edge/fencing.py ADDED
@@ -0,0 +1,488 @@
1
+ """Fencing mechanism for split-brain prevention.
2
+
3
+ Provides distributed locking and fencing tokens to prevent dual-write
4
+ scenarios during edge-to-cloud failover.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import os
10
+ import secrets
11
+ import time
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from typing import Any, Protocol
15
+
16
+ from dory.utils.errors import DoryError
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class FencingError(DoryError):
    """Base class for all fencing-related failures."""
24
+
25
+
26
class FenceViolation(FencingError):
    """Signals that an operation broke the fencing protocol."""
29
+
30
+
31
class StaleEpochError(FencingError):
    """Signals an attempt to act under an epoch that has been superseded."""
34
+
35
+
36
class FencingBackend(Protocol):
    """Structural interface every fencing backend must satisfy."""

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Atomically bump and return the processor's epoch, taking the lock."""

    async def get_current_epoch(self, processor_id: str) -> int:
        """Return the most recently issued epoch for the processor."""

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Return True while ``epoch`` has not been superseded."""

    async def release(self, processor_id: str, node_id: str) -> None:
        """Give up the fencing lock held by ``node_id``."""
54
+
55
+
56
@dataclass
class FencingToken:
    """Epoch-stamped token that guards state mutations against split-brain.

    The epoch is a monotonically increasing counter issued by the backend;
    a holder may only mutate state while its epoch is at least the
    backend's current epoch.
    """

    processor_id: str
    node_id: str
    epoch: int
    acquired_at: float = field(default_factory=time.time)
    token_id: str = field(default_factory=lambda: secrets.token_hex(8))

    def is_valid(self, current_epoch: int) -> bool:
        """Return True while this token has not been superseded."""
        return not (self.epoch < current_epoch)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the token to a plain dictionary (JSON-friendly)."""
        payload: dict[str, Any] = {
            "processor_id": self.processor_id,
            "node_id": self.node_id,
            "epoch": self.epoch,
            "acquired_at": self.acquired_at,
            "token_id": self.token_id,
        }
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FencingToken":
        """Rebuild a token previously produced by :meth:`to_dict`.

        Missing ``acquired_at`` / ``token_id`` fields are regenerated so
        tokens serialized by older versions still deserialize.
        """
        acquired = data.get("acquired_at", time.time())
        ident = data.get("token_id", secrets.token_hex(8))
        return cls(
            processor_id=data["processor_id"],
            node_id=data["node_id"],
            epoch=data["epoch"],
            acquired_at=acquired,
            token_id=ident,
        )
94
+
95
+
96
+ @dataclass
97
+ class FencingConfig:
98
+ """Configuration for fencing behavior."""
99
+
100
+ # Lock acquisition timeout
101
+ acquire_timeout_sec: float = 10.0
102
+
103
+ # Lock TTL (auto-release after this time)
104
+ lock_ttl_sec: float = 60.0
105
+
106
+ # How often to refresh the lock
107
+ refresh_interval_sec: float = 15.0
108
+
109
+ # Backend type: "redis" or "memory" (for testing)
110
+ backend: str = "redis"
111
+
112
+ # Redis connection URL (if using redis backend)
113
+ redis_url: str | None = None
114
+
115
+ # Key prefix for Redis keys
116
+ key_prefix: str = "dory:fencing"
117
+
118
+ def __post_init__(self):
119
+ """Validate configuration."""
120
+ if self.refresh_interval_sec >= self.lock_ttl_sec:
121
+ raise ValueError(
122
+ f"refresh_interval_sec ({self.refresh_interval_sec}) must be "
123
+ f"less than lock_ttl_sec ({self.lock_ttl_sec})"
124
+ )
125
+
126
+ # Try to get Redis URL from environment if not provided
127
+ if self.backend == "redis" and not self.redis_url:
128
+ self.redis_url = os.environ.get("DORY_REDIS_URL", "redis://localhost:6379")
129
+
130
+
131
class InMemoryFencingBackend:
    """In-memory fencing backend for single-process use and tests.

    Implements the same interface as the Redis backend but keeps all
    state in local dictionaries guarded by an asyncio lock.

    Args:
        lock_ttl_sec: Nominal lifetime recorded for acquired locks.
            Previously a hard-coded 60 seconds inside acquire_epoch;
            now configurable while keeping the same default.
    """

    def __init__(self, lock_ttl_sec: float = 60.0):
        self._lock_ttl_sec = lock_ttl_sec
        self._epochs: dict[str, int] = {}
        # processor_id -> (owning node_id, expiry timestamp)
        self._locks: dict[str, tuple[str, float]] = {}
        self._lock = asyncio.Lock()

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Atomically bump the processor's epoch and record lock ownership.

        Returns:
            The new (just-incremented) epoch number, starting at 1.
        """
        async with self._lock:
            new_epoch = self._epochs.get(processor_id, 0) + 1
            self._epochs[processor_id] = new_epoch
            self._locks[processor_id] = (node_id, time.time() + self._lock_ttl_sec)
            return new_epoch

    async def get_current_epoch(self, processor_id: str) -> int:
        """Return the latest epoch issued for the processor (0 if none)."""
        return self._epochs.get(processor_id, 0)

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Return True while ``epoch`` has not been superseded."""
        return epoch >= self._epochs.get(processor_id, 0)

    async def release(self, processor_id: str, node_id: str) -> None:
        """Drop the fencing lock, but only if ``node_id`` still owns it."""
        async with self._lock:
            owner = self._locks.get(processor_id)
            if owner is not None and owner[0] == node_id:
                del self._locks[processor_id]
164
+
165
+
166
class RedisFencingBackend:
    """Redis-based fencing backend for distributed deployments.

    Keeps a per-processor epoch counter and an ownership lock in Redis.
    Epoch increment + lock acquisition happen atomically in a Lua script,
    so two nodes cannot both believe they own the same processor.
    """

    def __init__(self, config: FencingConfig):
        # The Redis connection is created lazily (see _ensure_initialized),
        # so constructing the backend never touches the network.
        self._config = config
        self._redis: Any = None  # redis.asyncio client once initialized
        self._initialized = False

    async def _ensure_initialized(self) -> None:
        """Lazily initialize Redis connection."""
        # Idempotent: safe to call before every operation.
        if self._initialized:
            return

        try:
            import redis.asyncio as redis
        except ImportError:
            raise ImportError(
                "redis package required for Redis fencing backend. "
                "Install with: pip install redis"
            )

        # decode_responses=True makes GET/EVAL return str rather than bytes.
        self._redis = redis.from_url(
            self._config.redis_url,
            decode_responses=True,
        )
        self._initialized = True

    def _epoch_key(self, processor_id: str) -> str:
        """Get Redis key for epoch counter."""
        return f"{self._config.key_prefix}:epoch:{processor_id}"

    def _lock_key(self, processor_id: str) -> str:
        """Get Redis key for lock."""
        return f"{self._config.key_prefix}:lock:{processor_id}"

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Acquire and increment epoch atomically using Lua script.

        Returns:
            The new (just-incremented) epoch number.
        """
        await self._ensure_initialized()

        # Lua script for atomic epoch increment + lock acquisition.
        # Unconditional SET means a new acquirer always steals the lock;
        # staleness of the previous holder is detected via the epoch.
        lua_script = """
        local epoch_key = KEYS[1]
        local lock_key = KEYS[2]
        local node_id = ARGV[1]
        local ttl = tonumber(ARGV[2])

        -- Increment epoch
        local new_epoch = redis.call('INCR', epoch_key)

        -- Set lock with TTL
        redis.call('SET', lock_key, node_id, 'EX', ttl)

        return new_epoch
        """

        epoch_key = self._epoch_key(processor_id)
        lock_key = self._lock_key(processor_id)

        result = await self._redis.eval(
            lua_script,
            2,  # number of keys
            epoch_key,
            lock_key,
            node_id,
            int(self._config.lock_ttl_sec),
        )

        return int(result)

    async def get_current_epoch(self, processor_id: str) -> int:
        """Get current epoch.

        Returns:
            Latest epoch for the processor, or 0 if never acquired.
        """
        await self._ensure_initialized()

        epoch_key = self._epoch_key(processor_id)
        result = await self._redis.get(epoch_key)
        return int(result) if result else 0

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Check if epoch is current (i.e. not superseded by a newer one)."""
        current = await self.get_current_epoch(processor_id)
        return epoch >= current

    async def release(self, processor_id: str, node_id: str) -> None:
        """Release fencing lock if owned by this node."""
        await self._ensure_initialized()

        # Lua script for conditional delete: only the current owner may
        # delete the lock, so a stale node cannot release a new owner's lock.
        lua_script = """
        local lock_key = KEYS[1]
        local node_id = ARGV[1]

        if redis.call('GET', lock_key) == node_id then
            return redis.call('DEL', lock_key)
        end
        return 0
        """

        lock_key = self._lock_key(processor_id)
        await self._redis.eval(lua_script, 1, lock_key, node_id)

    async def close(self) -> None:
        """Close Redis connection.

        NOTE(review): self._redis is left set after close; a later call
        will recreate the client because _initialized is reset. Also,
        redis-py >= 5 prefers aclose() over close() — confirm against the
        pinned redis version.
        """
        if self._redis:
            await self._redis.close()
        self._initialized = False
271
+
272
+
273
class FencingManager:
    """Manager for acquiring and validating fencing tokens.

    Prevents split-brain scenarios by ensuring only one processor
    instance can perform state-modifying operations at a time.

    Usage:
        config = FencingConfig(backend="redis")
        manager = FencingManager(config)

        # Acquire fencing token before processing
        token = await manager.acquire("my-processor", "edge-node-1")

        # Validate token before state operations
        if await manager.validate(token):
            # Safe to modify state
            await save_state(...)
        else:
            # Token is stale, another instance has taken over
            raise FenceViolation("Stale epoch")

        # Release on shutdown
        await manager.release(token)
    """

    def __init__(self, config: FencingConfig | None = None):
        """Initialize fencing manager.

        Args:
            config: Fencing configuration (defaults to FencingConfig()).
        """
        self._config = config or FencingConfig()
        self._backend = self._create_backend()
        # processor_id -> currently held token
        self._active_tokens: dict[str, FencingToken] = {}
        # processor_id -> background lock-refresh task
        self._refresh_tasks: dict[str, asyncio.Task] = {}

    def _create_backend(self) -> FencingBackend:
        """Create fencing backend based on configuration.

        Raises:
            ValueError: If the configured backend name is unknown.
        """
        if self._config.backend == "memory":
            return InMemoryFencingBackend()
        elif self._config.backend == "redis":
            return RedisFencingBackend(self._config)
        else:
            raise ValueError(f"Unknown fencing backend: {self._config.backend}")

    async def acquire(
        self,
        processor_id: str,
        node_id: str,
        timeout_sec: float | None = None,
    ) -> FencingToken:
        """Acquire fencing token for processor.

        Args:
            processor_id: Unique processor identifier
            node_id: Current node identifier
            timeout_sec: Acquisition timeout (default from config)

        Returns:
            FencingToken with new epoch

        Raises:
            FencingError: If acquisition fails
        """
        # BUG FIX: `timeout_sec or default` silently ignored an explicit 0;
        # compare against None so 0 is honored as "don't wait".
        timeout = (
            timeout_sec if timeout_sec is not None else self._config.acquire_timeout_sec
        )

        try:
            # Acquire epoch with timeout; only this await can time out.
            epoch = await asyncio.wait_for(
                self._backend.acquire_epoch(processor_id, node_id),
                timeout=timeout,
            )
        except asyncio.TimeoutError as exc:
            # Chain the cause so the underlying timeout shows in tracebacks.
            raise FencingError(
                f"Failed to acquire fencing token for {processor_id} "
                f"within {timeout}s"
            ) from exc

        token = FencingToken(
            processor_id=processor_id,
            node_id=node_id,
            epoch=epoch,
        )
        self._active_tokens[processor_id] = token

        # Keep the backend lock's TTL refreshed while we hold the token.
        self._start_refresh_task(processor_id, node_id)

        logger.info(
            f"Acquired fencing token for {processor_id}: "
            f"epoch={epoch}, node={node_id}"
        )

        return token

    async def validate(self, token: FencingToken) -> bool:
        """Validate that fencing token is still current.

        Args:
            token: Fencing token to validate

        Returns:
            True if token is valid (not stale)
        """
        return await self._backend.validate_epoch(
            token.processor_id,
            token.epoch,
        )

    async def validate_or_raise(self, token: FencingToken) -> None:
        """Validate token and raise if stale.

        Args:
            token: Fencing token to validate

        Raises:
            StaleEpochError: If token is stale
        """
        if not await self.validate(token):
            current_epoch = await self._backend.get_current_epoch(token.processor_id)
            raise StaleEpochError(
                f"Fencing token for {token.processor_id} is stale: "
                f"token_epoch={token.epoch}, current_epoch={current_epoch}"
            )

    async def release(self, token: FencingToken) -> None:
        """Release fencing token.

        Args:
            token: Token to release
        """
        processor_id = token.processor_id

        # Stop the background refresh task first so it cannot re-extend
        # the lock after we release it.
        task = self._refresh_tasks.pop(processor_id, None)
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        # Release backend lock (no-op if another node already owns it).
        await self._backend.release(processor_id, token.node_id)

        # Remove from active tokens.
        self._active_tokens.pop(processor_id, None)

        logger.info(f"Released fencing token for {processor_id}")

    async def get_current_epoch(self, processor_id: str) -> int:
        """Get current epoch for processor.

        Args:
            processor_id: Processor identifier

        Returns:
            Current epoch number
        """
        return await self._backend.get_current_epoch(processor_id)

    def _start_refresh_task(self, processor_id: str, node_id: str) -> None:
        """Start background task to refresh fencing lock.

        Any existing refresh task for the processor is cancelled first.
        """
        existing = self._refresh_tasks.get(processor_id)
        if existing is not None:
            existing.cancel()

        async def refresh_loop():
            while True:
                try:
                    await asyncio.sleep(self._config.refresh_interval_sec)

                    # Only the Redis backend has a real TTL to extend;
                    # refreshing does not increment the epoch.
                    if isinstance(self._backend, RedisFencingBackend):
                        lock_key = self._backend._lock_key(processor_id)
                        await self._backend._redis.expire(
                            lock_key,
                            int(self._config.lock_ttl_sec),
                        )
                        logger.debug(f"Refreshed fencing lock for {processor_id}")

                except asyncio.CancelledError:
                    break
                except Exception as e:
                    # Best-effort: a missed refresh only risks early expiry,
                    # so log and keep trying rather than crash the task.
                    logger.warning(f"Failed to refresh fencing lock: {e}")

        self._refresh_tasks[processor_id] = asyncio.create_task(refresh_loop())

    async def close(self) -> None:
        """Close manager and release all tokens."""
        # Cancel all refresh tasks before releasing locks.
        for task in self._refresh_tasks.values():
            task.cancel()

        # Wait for tasks to finish; swallow their cancellation errors.
        if self._refresh_tasks:
            await asyncio.gather(
                *self._refresh_tasks.values(),
                return_exceptions=True,
            )
        self._refresh_tasks.clear()

        # Release all active tokens (copy: release() mutates the dict).
        for token in list(self._active_tokens.values()):
            await self.release(token)

        # Close backend connection if it owns one.
        if isinstance(self._backend, RedisFencingBackend):
            await self._backend.close()

    def get_active_token(self, processor_id: str) -> FencingToken | None:
        """Get active token for processor if exists."""
        return self._active_tokens.get(processor_id)