dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dory/edge/fencing.py ADDED
@@ -0,0 +1,488 @@
1
+ """Fencing mechanism for split-brain prevention.
2
+
3
+ Provides distributed locking and fencing tokens to prevent dual-write
4
+ scenarios during edge-to-cloud failover.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import os
10
+ import secrets
11
+ import time
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from typing import Any, Protocol
15
+
16
+ from dory.utils.errors import DoryError
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class FencingError(DoryError):
    """Base class for all fencing-related failures."""
24
+
25
+
26
class FenceViolation(FencingError):
    """Signals that an operation broke the fencing protocol."""
29
+
30
+
31
class StaleEpochError(FencingError):
    """Signals an attempt to act under an epoch that has been superseded."""
34
+
35
+
36
class FencingBackend(Protocol):
    """Structural interface every fencing backend must satisfy."""

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Atomically bump and return the processor's epoch, taking the lock."""

    async def get_current_epoch(self, processor_id: str) -> int:
        """Return the most recently issued epoch for the processor."""

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Return True while ``epoch`` has not been superseded."""

    async def release(self, processor_id: str, node_id: str) -> None:
        """Give up the fencing lock held by ``node_id``."""
54
+
55
+
56
@dataclass
class FencingToken:
    """Epoch-stamped token that guards state mutations against split-brain.

    The epoch is a monotonically increasing counter issued by the backend;
    a holder may only mutate state while its epoch is at least the
    backend's current epoch.
    """

    processor_id: str
    node_id: str
    epoch: int
    acquired_at: float = field(default_factory=time.time)
    token_id: str = field(default_factory=lambda: secrets.token_hex(8))

    def is_valid(self, current_epoch: int) -> bool:
        """Return True while this token has not been superseded."""
        return not (self.epoch < current_epoch)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the token to a plain dictionary (JSON-friendly)."""
        payload: dict[str, Any] = {
            "processor_id": self.processor_id,
            "node_id": self.node_id,
            "epoch": self.epoch,
            "acquired_at": self.acquired_at,
            "token_id": self.token_id,
        }
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FencingToken":
        """Rebuild a token previously produced by :meth:`to_dict`.

        Missing ``acquired_at`` / ``token_id`` fields are regenerated so
        tokens serialized by older versions still deserialize.
        """
        acquired = data.get("acquired_at", time.time())
        ident = data.get("token_id", secrets.token_hex(8))
        return cls(
            processor_id=data["processor_id"],
            node_id=data["node_id"],
            epoch=data["epoch"],
            acquired_at=acquired,
            token_id=ident,
        )
94
+
95
+
96
+ @dataclass
97
+ class FencingConfig:
98
+ """Configuration for fencing behavior."""
99
+
100
+ # Lock acquisition timeout
101
+ acquire_timeout_sec: float = 10.0
102
+
103
+ # Lock TTL (auto-release after this time)
104
+ lock_ttl_sec: float = 60.0
105
+
106
+ # How often to refresh the lock
107
+ refresh_interval_sec: float = 15.0
108
+
109
+ # Backend type: "redis" or "memory" (for testing)
110
+ backend: str = "redis"
111
+
112
+ # Redis connection URL (if using redis backend)
113
+ redis_url: str | None = None
114
+
115
+ # Key prefix for Redis keys
116
+ key_prefix: str = "dory:fencing"
117
+
118
+ def __post_init__(self):
119
+ """Validate configuration."""
120
+ if self.refresh_interval_sec >= self.lock_ttl_sec:
121
+ raise ValueError(
122
+ f"refresh_interval_sec ({self.refresh_interval_sec}) must be "
123
+ f"less than lock_ttl_sec ({self.lock_ttl_sec})"
124
+ )
125
+
126
+ # Try to get Redis URL from environment if not provided
127
+ if self.backend == "redis" and not self.redis_url:
128
+ self.redis_url = os.environ.get("DORY_REDIS_URL", "redis://localhost:6379")
129
+
130
+
131
class InMemoryFencingBackend:
    """In-memory fencing backend for single-process use and tests.

    Implements the same interface as the Redis backend but keeps all
    state in local dictionaries guarded by an asyncio lock.

    Args:
        lock_ttl_sec: Nominal lifetime recorded for acquired locks.
            Previously a hard-coded 60 seconds inside acquire_epoch;
            now configurable while keeping the same default.
    """

    def __init__(self, lock_ttl_sec: float = 60.0):
        self._lock_ttl_sec = lock_ttl_sec
        self._epochs: dict[str, int] = {}
        # processor_id -> (owning node_id, expiry timestamp)
        self._locks: dict[str, tuple[str, float]] = {}
        self._lock = asyncio.Lock()

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Atomically bump the processor's epoch and record lock ownership.

        Returns:
            The new (just-incremented) epoch number, starting at 1.
        """
        async with self._lock:
            new_epoch = self._epochs.get(processor_id, 0) + 1
            self._epochs[processor_id] = new_epoch
            self._locks[processor_id] = (node_id, time.time() + self._lock_ttl_sec)
            return new_epoch

    async def get_current_epoch(self, processor_id: str) -> int:
        """Return the latest epoch issued for the processor (0 if none)."""
        return self._epochs.get(processor_id, 0)

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Return True while ``epoch`` has not been superseded."""
        return epoch >= self._epochs.get(processor_id, 0)

    async def release(self, processor_id: str, node_id: str) -> None:
        """Drop the fencing lock, but only if ``node_id`` still owns it."""
        async with self._lock:
            owner = self._locks.get(processor_id)
            if owner is not None and owner[0] == node_id:
                del self._locks[processor_id]
164
+
165
+
166
class RedisFencingBackend:
    """Redis-based fencing backend for distributed deployments.

    Keeps a per-processor epoch counter and an ownership lock in Redis.
    Epoch increment + lock acquisition happen atomically in a Lua script,
    so two nodes cannot both believe they own the same processor.
    """

    def __init__(self, config: FencingConfig):
        # The Redis connection is created lazily (see _ensure_initialized),
        # so constructing the backend never touches the network.
        self._config = config
        self._redis: Any = None  # redis.asyncio client once initialized
        self._initialized = False

    async def _ensure_initialized(self) -> None:
        """Lazily initialize Redis connection."""
        # Idempotent: safe to call before every operation.
        if self._initialized:
            return

        try:
            import redis.asyncio as redis
        except ImportError:
            raise ImportError(
                "redis package required for Redis fencing backend. "
                "Install with: pip install redis"
            )

        # decode_responses=True makes GET/EVAL return str rather than bytes.
        self._redis = redis.from_url(
            self._config.redis_url,
            decode_responses=True,
        )
        self._initialized = True

    def _epoch_key(self, processor_id: str) -> str:
        """Get Redis key for epoch counter."""
        return f"{self._config.key_prefix}:epoch:{processor_id}"

    def _lock_key(self, processor_id: str) -> str:
        """Get Redis key for lock."""
        return f"{self._config.key_prefix}:lock:{processor_id}"

    async def acquire_epoch(self, processor_id: str, node_id: str) -> int:
        """Acquire and increment epoch atomically using Lua script.

        Returns:
            The new (just-incremented) epoch number.
        """
        await self._ensure_initialized()

        # Lua script for atomic epoch increment + lock acquisition.
        # Unconditional SET means a new acquirer always steals the lock;
        # staleness of the previous holder is detected via the epoch.
        lua_script = """
        local epoch_key = KEYS[1]
        local lock_key = KEYS[2]
        local node_id = ARGV[1]
        local ttl = tonumber(ARGV[2])

        -- Increment epoch
        local new_epoch = redis.call('INCR', epoch_key)

        -- Set lock with TTL
        redis.call('SET', lock_key, node_id, 'EX', ttl)

        return new_epoch
        """

        epoch_key = self._epoch_key(processor_id)
        lock_key = self._lock_key(processor_id)

        result = await self._redis.eval(
            lua_script,
            2,  # number of keys
            epoch_key,
            lock_key,
            node_id,
            int(self._config.lock_ttl_sec),
        )

        return int(result)

    async def get_current_epoch(self, processor_id: str) -> int:
        """Get current epoch.

        Returns:
            Latest epoch for the processor, or 0 if never acquired.
        """
        await self._ensure_initialized()

        epoch_key = self._epoch_key(processor_id)
        result = await self._redis.get(epoch_key)
        return int(result) if result else 0

    async def validate_epoch(self, processor_id: str, epoch: int) -> bool:
        """Check if epoch is current (i.e. not superseded by a newer one)."""
        current = await self.get_current_epoch(processor_id)
        return epoch >= current

    async def release(self, processor_id: str, node_id: str) -> None:
        """Release fencing lock if owned by this node."""
        await self._ensure_initialized()

        # Lua script for conditional delete: only the current owner may
        # delete the lock, so a stale node cannot release a new owner's lock.
        lua_script = """
        local lock_key = KEYS[1]
        local node_id = ARGV[1]

        if redis.call('GET', lock_key) == node_id then
            return redis.call('DEL', lock_key)
        end
        return 0
        """

        lock_key = self._lock_key(processor_id)
        await self._redis.eval(lua_script, 1, lock_key, node_id)

    async def close(self) -> None:
        """Close Redis connection.

        NOTE(review): self._redis is left set after close; a later call
        will recreate the client because _initialized is reset. Also,
        redis-py >= 5 prefers aclose() over close() — confirm against the
        pinned redis version.
        """
        if self._redis:
            await self._redis.close()
        self._initialized = False
271
+
272
+
273
class FencingManager:
    """Manager for acquiring and validating fencing tokens.

    Prevents split-brain scenarios by ensuring only one processor
    instance can perform state-modifying operations at a time.

    Usage:
        config = FencingConfig(backend="redis")
        manager = FencingManager(config)

        # Acquire fencing token before processing
        token = await manager.acquire("my-processor", "edge-node-1")

        # Validate token before state operations
        if await manager.validate(token):
            # Safe to modify state
            await save_state(...)
        else:
            # Token is stale, another instance has taken over
            raise FenceViolation("Stale epoch")

        # Release on shutdown
        await manager.release(token)
    """

    def __init__(self, config: FencingConfig | None = None):
        """Initialize fencing manager.

        Args:
            config: Fencing configuration (defaults to FencingConfig()).
        """
        self._config = config or FencingConfig()
        self._backend = self._create_backend()
        # processor_id -> currently held token
        self._active_tokens: dict[str, FencingToken] = {}
        # processor_id -> background lock-refresh task
        self._refresh_tasks: dict[str, asyncio.Task] = {}

    def _create_backend(self) -> FencingBackend:
        """Create fencing backend based on configuration.

        Raises:
            ValueError: If the configured backend name is unknown.
        """
        if self._config.backend == "memory":
            return InMemoryFencingBackend()
        elif self._config.backend == "redis":
            return RedisFencingBackend(self._config)
        else:
            raise ValueError(f"Unknown fencing backend: {self._config.backend}")

    async def acquire(
        self,
        processor_id: str,
        node_id: str,
        timeout_sec: float | None = None,
    ) -> FencingToken:
        """Acquire fencing token for processor.

        Args:
            processor_id: Unique processor identifier
            node_id: Current node identifier
            timeout_sec: Acquisition timeout (default from config)

        Returns:
            FencingToken with new epoch

        Raises:
            FencingError: If acquisition fails
        """
        # BUG FIX: `timeout_sec or default` silently ignored an explicit 0;
        # compare against None so 0 is honored as "don't wait".
        timeout = (
            timeout_sec if timeout_sec is not None else self._config.acquire_timeout_sec
        )

        try:
            # Acquire epoch with timeout; only this await can time out.
            epoch = await asyncio.wait_for(
                self._backend.acquire_epoch(processor_id, node_id),
                timeout=timeout,
            )
        except asyncio.TimeoutError as exc:
            # Chain the cause so the underlying timeout shows in tracebacks.
            raise FencingError(
                f"Failed to acquire fencing token for {processor_id} "
                f"within {timeout}s"
            ) from exc

        token = FencingToken(
            processor_id=processor_id,
            node_id=node_id,
            epoch=epoch,
        )
        self._active_tokens[processor_id] = token

        # Keep the backend lock's TTL refreshed while we hold the token.
        self._start_refresh_task(processor_id, node_id)

        logger.info(
            f"Acquired fencing token for {processor_id}: "
            f"epoch={epoch}, node={node_id}"
        )

        return token

    async def validate(self, token: FencingToken) -> bool:
        """Validate that fencing token is still current.

        Args:
            token: Fencing token to validate

        Returns:
            True if token is valid (not stale)
        """
        return await self._backend.validate_epoch(
            token.processor_id,
            token.epoch,
        )

    async def validate_or_raise(self, token: FencingToken) -> None:
        """Validate token and raise if stale.

        Args:
            token: Fencing token to validate

        Raises:
            StaleEpochError: If token is stale
        """
        if not await self.validate(token):
            current_epoch = await self._backend.get_current_epoch(token.processor_id)
            raise StaleEpochError(
                f"Fencing token for {token.processor_id} is stale: "
                f"token_epoch={token.epoch}, current_epoch={current_epoch}"
            )

    async def release(self, token: FencingToken) -> None:
        """Release fencing token.

        Args:
            token: Token to release
        """
        processor_id = token.processor_id

        # Stop the background refresh task first so it cannot re-extend
        # the lock after we release it.
        task = self._refresh_tasks.pop(processor_id, None)
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        # Release backend lock (no-op if another node already owns it).
        await self._backend.release(processor_id, token.node_id)

        # Remove from active tokens.
        self._active_tokens.pop(processor_id, None)

        logger.info(f"Released fencing token for {processor_id}")

    async def get_current_epoch(self, processor_id: str) -> int:
        """Get current epoch for processor.

        Args:
            processor_id: Processor identifier

        Returns:
            Current epoch number
        """
        return await self._backend.get_current_epoch(processor_id)

    def _start_refresh_task(self, processor_id: str, node_id: str) -> None:
        """Start background task to refresh fencing lock.

        Any existing refresh task for the processor is cancelled first.
        """
        existing = self._refresh_tasks.get(processor_id)
        if existing is not None:
            existing.cancel()

        async def refresh_loop():
            while True:
                try:
                    await asyncio.sleep(self._config.refresh_interval_sec)

                    # Only the Redis backend has a real TTL to extend;
                    # refreshing does not increment the epoch.
                    if isinstance(self._backend, RedisFencingBackend):
                        lock_key = self._backend._lock_key(processor_id)
                        await self._backend._redis.expire(
                            lock_key,
                            int(self._config.lock_ttl_sec),
                        )
                        logger.debug(f"Refreshed fencing lock for {processor_id}")

                except asyncio.CancelledError:
                    break
                except Exception as e:
                    # Best-effort: a missed refresh only risks early expiry,
                    # so log and keep trying rather than crash the task.
                    logger.warning(f"Failed to refresh fencing lock: {e}")

        self._refresh_tasks[processor_id] = asyncio.create_task(refresh_loop())

    async def close(self) -> None:
        """Close manager and release all tokens."""
        # Cancel all refresh tasks before releasing locks.
        for task in self._refresh_tasks.values():
            task.cancel()

        # Wait for tasks to finish; swallow their cancellation errors.
        if self._refresh_tasks:
            await asyncio.gather(
                *self._refresh_tasks.values(),
                return_exceptions=True,
            )
        self._refresh_tasks.clear()

        # Release all active tokens (copy: release() mutates the dict).
        for token in list(self._active_tokens.values()):
            await self.release(token)

        # Close backend connection if it owns one.
        if isinstance(self._backend, RedisFencingBackend):
            await self._backend.close()

    def get_active_token(self, processor_id: str) -> FencingToken | None:
        """Get active token for processor if exists."""
        return self._active_tokens.get(processor_id)