dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dory/edge/role.py ADDED
@@ -0,0 +1,416 @@
1
+ """Role management for edge processor failover.
2
+
3
+ Provides role-based coordination between edge and cloud instances
4
+ during failover scenarios.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import time
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+ from typing import Any, Callable, Awaitable
13
+
14
+ from dory.edge.fencing import (
15
+ FencingManager,
16
+ FencingToken,
17
+ FencingConfig,
18
+ FenceViolation,
19
+ StaleEpochError,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ProcessorRole(Enum):
    """Lifecycle role of a processor instance.

    Documented transitions:
        INITIALIZING -> PRIMARY   (normal startup)
        INITIALIZING -> STANDBY   (cloud replica waiting for failover)
        PRIMARY      -> DRAINING  (graceful shutdown or failover)
        PRIMARY      -> FENCED    (lost fencing token)
        DRAINING     -> STOPPED
        STANDBY      -> PRIMARY   (failover promoted)
        FENCED       -> STOPPED
    """

    INITIALIZING = "initializing"
    PRIMARY = "primary"
    STANDBY = "standby"
    DRAINING = "draining"
    FENCED = "fenced"
    STOPPED = "stopped"

    def can_process(self) -> bool:
        """Return True only for PRIMARY, the one role allowed to process messages."""
        return self is ProcessorRole.PRIMARY

    def can_write_state(self) -> bool:
        """Return True for the roles allowed to modify state: PRIMARY and DRAINING."""
        if self is ProcessorRole.PRIMARY:
            return True
        return self is ProcessorRole.DRAINING

    def is_terminal(self) -> bool:
        """Return True for the roles treated as final: STOPPED and FENCED."""
        if self is ProcessorRole.STOPPED:
            return True
        return self is ProcessorRole.FENCED
56
+
57
+
58
@dataclass
class RoleTransition:
    """One entry in a processor's role history: which role changed to which, and why."""

    # Role held before the change.
    from_role: ProcessorRole
    # Role held after the change.
    to_role: ProcessorRole
    # Wall-clock time of the change; defaults to time.time() at construction.
    timestamp: float = field(default_factory=time.time)
    # Free-form explanation of the change.
    reason: str = ""
    # Fencing epoch associated with the change, when there is one.
    epoch: int | None = None
    # Identifier of the node on which the change happened.
    node_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the transition as a plain dict, with both roles as their string values."""
        payload: dict[str, Any] = dict(
            from_role=self.from_role.value,
            to_role=self.to_role.value,
        )
        # The remaining fields are copied through unchanged, in declaration order.
        for attr in ("timestamp", "reason", "epoch", "node_id"):
            payload[attr] = getattr(self, attr)
        return payload
79
+
80
+
81
# Type alias for role change callbacks: an async callable that receives the
# RoleTransition describing the change and returns nothing.
RoleChangeCallback = Callable[[RoleTransition], Awaitable[None]]
83
+
84
+
85
class RoleManager:
    """Manager for processor role transitions and fencing coordination.

    Coordinates role-based behavior for edge-to-cloud failover:
    - Edge processor starts as PRIMARY with fencing token
    - Cloud replica starts as STANDBY
    - On edge failure, cloud acquires fencing token and becomes PRIMARY
    - Edge instance becomes FENCED when it loses the token

    Usage:
        fencing_config = FencingConfig(backend="redis")
        manager = RoleManager(
            processor_id="my-processor",
            node_id="edge-node-1",
            fencing_config=fencing_config,
        )

        # Register callbacks before starting so the first transition is seen
        manager.add_role_change_callback(handle_role_change)

        # Start and try to acquire the PRIMARY role
        role = await manager.start()

        # Check role before processing
        if manager.can_process():
            process_message(msg)

        # Graceful shutdown
        await manager.stop()
    """

    def __init__(
        self,
        processor_id: str,
        node_id: str,
        fencing_config: FencingConfig | None = None,
        start_as_standby: bool = False,
        monitor_interval: float = 5.0,
    ):
        """Initialize role manager.

        Args:
            processor_id: Unique processor identifier
            node_id: Current node identifier (e.g., pod name)
            fencing_config: Fencing configuration
            start_as_standby: If True, start in STANDBY role
            monitor_interval: Seconds between background fencing checks

        Raises:
            ValueError: If monitor_interval is not positive
        """
        if monitor_interval <= 0:
            raise ValueError("monitor_interval must be positive")

        self._processor_id = processor_id
        self._node_id = node_id
        self._start_as_standby = start_as_standby
        self._monitor_interval = monitor_interval

        self._fencing_manager = FencingManager(fencing_config)
        self._fencing_token: FencingToken | None = None

        self._role = ProcessorRole.INITIALIZING
        self._role_history: list[RoleTransition] = []
        self._callbacks: list[RoleChangeCallback] = []

        self._monitor_task: asyncio.Task | None = None
        self._running = False

    @property
    def role(self) -> ProcessorRole:
        """Current processor role."""
        return self._role

    @property
    def processor_id(self) -> str:
        """Processor identifier."""
        return self._processor_id

    @property
    def node_id(self) -> str:
        """Node identifier."""
        return self._node_id

    @property
    def fencing_token(self) -> FencingToken | None:
        """Fencing token currently held by this manager, or None."""
        return self._fencing_token

    def can_process(self) -> bool:
        """Check if processor can handle messages."""
        return self._role.can_process()

    def can_write_state(self) -> bool:
        """Check if processor can modify state."""
        return self._role.can_write_state()

    def add_role_change_callback(self, callback: RoleChangeCallback) -> None:
        """Register callback for role changes.

        Args:
            callback: Async function called on role transitions
        """
        self._callbacks.append(callback)

    def remove_role_change_callback(self, callback: RoleChangeCallback) -> None:
        """Remove role change callback; unknown callbacks are ignored."""
        if callback in self._callbacks:
            self._callbacks.remove(callback)

    async def start(self) -> ProcessorRole:
        """Start role management and acquire initial role.

        If acquiring the fencing token fails for any reason, the manager
        falls back to STANDBY instead of raising.

        Returns:
            Initial role (PRIMARY or STANDBY)
        """
        self._running = True

        if self._start_as_standby:
            await self._transition_to(
                ProcessorRole.STANDBY,
                reason="Started as standby replica",
            )
        else:
            # Try to acquire PRIMARY role with fencing token
            try:
                self._fencing_token = await self._fencing_manager.acquire(
                    self._processor_id,
                    self._node_id,
                )
                await self._transition_to(
                    ProcessorRole.PRIMARY,
                    reason="Acquired fencing token",
                    epoch=self._fencing_token.epoch,
                )
            except Exception as e:
                logger.error("Failed to acquire fencing token: %s", e)
                await self._transition_to(
                    ProcessorRole.STANDBY,
                    reason=f"Failed to acquire fencing token: {e}",
                )

        # Start background monitoring. A repeated start() must not spawn a
        # second monitor and orphan the first one.
        if self._monitor_task is None or self._monitor_task.done():
            self._monitor_task = asyncio.create_task(self._monitor_fencing())

        return self._role

    async def stop(self, reason: str = "Shutdown requested") -> None:
        """Stop role management and release resources.

        The transition to STOPPED and the closing of the fencing manager
        happen even when draining or releasing the token raises; the
        original exception still propagates to the caller.

        Args:
            reason: Reason for stopping
        """
        self._running = False

        # Stop monitoring. When stop() is invoked from a role-change callback
        # that is itself running on the monitor task, cancelling/awaiting that
        # task would cancel or await ourselves; clearing _running is enough
        # there because the monitor loop re-checks it on each iteration.
        monitor, self._monitor_task = self._monitor_task, None
        if monitor is not None and monitor is not asyncio.current_task():
            monitor.cancel()
            try:
                await monitor
            except asyncio.CancelledError:
                pass

        try:
            # Drain if PRIMARY
            if self._role is ProcessorRole.PRIMARY:
                await self._transition_to(
                    ProcessorRole.DRAINING,
                    reason=reason,
                )

            # Release fencing token
            if self._fencing_token:
                await self._fencing_manager.release(self._fencing_token)
                self._fencing_token = None
        finally:
            try:
                await self._transition_to(
                    ProcessorRole.STOPPED,
                    reason=reason,
                )
            finally:
                # Close fencing manager
                await self._fencing_manager.close()

    async def promote_to_primary(self) -> bool:
        """Attempt to promote from STANDBY to PRIMARY.

        Returns:
            True if promotion succeeded; False when the current role is not
            STANDBY or when acquiring the fencing token failed
        """
        if self._role is not ProcessorRole.STANDBY:
            logger.warning(
                "Cannot promote from %s, must be STANDBY",
                self._role.value,
            )
            return False

        try:
            # Acquire fencing token
            self._fencing_token = await self._fencing_manager.acquire(
                self._processor_id,
                self._node_id,
            )

            await self._transition_to(
                ProcessorRole.PRIMARY,
                reason="Promoted from standby",
                epoch=self._fencing_token.epoch,
            )

            return True

        except Exception as e:
            logger.error("Failed to promote to PRIMARY: %s", e)
            return False

    async def demote_to_standby(self, reason: str = "Demoted") -> None:
        """Demote from PRIMARY (or DRAINING) to STANDBY, releasing fencing token.

        Args:
            reason: Reason for demotion
        """
        if self._role not in (ProcessorRole.PRIMARY, ProcessorRole.DRAINING):
            logger.warning("Cannot demote from %s", self._role.value)
            return

        # Release fencing token
        if self._fencing_token:
            await self._fencing_manager.release(self._fencing_token)
            self._fencing_token = None

        await self._transition_to(
            ProcessorRole.STANDBY,
            reason=reason,
        )

    async def validate_fencing(self) -> bool:
        """Validate current fencing token is still valid.

        Returns:
            True if fencing token is valid (or not PRIMARY); False when
            PRIMARY without a token or when the fencing manager rejects it
        """
        if self._role is not ProcessorRole.PRIMARY:
            return True

        if not self._fencing_token:
            return False

        return await self._fencing_manager.validate(self._fencing_token)

    async def validate_fencing_or_fence(self) -> bool:
        """Validate fencing and transition to FENCED if stale.

        Returns:
            False if validation failed and the manager moved to FENCED.
            True otherwise, which includes every non-PRIMARY role because
            validate_fencing() only checks the token while PRIMARY.
        """
        if not await self.validate_fencing():
            await self._transition_to(
                ProcessorRole.FENCED,
                reason="Fencing token became stale",
            )
            return False
        return True

    async def _transition_to(
        self,
        new_role: ProcessorRole,
        reason: str = "",
        epoch: int | None = None,
    ) -> None:
        """Transition to a new role, record it and notify callbacks.

        A transition to the current role is a no-op. Exceptions raised by
        callbacks are logged and do not interrupt the remaining callbacks.

        Args:
            new_role: Target role
            reason: Reason for transition
            epoch: Fencing epoch if applicable; defaults to the epoch of the
                currently held token, if any
        """
        if new_role == self._role:
            return

        old_role = self._role
        self._role = new_role

        # Compare against None explicitly so an epoch of 0 is kept as given.
        if epoch is None and self._fencing_token is not None:
            epoch = self._fencing_token.epoch

        transition = RoleTransition(
            from_role=old_role,
            to_role=new_role,
            reason=reason,
            epoch=epoch,
            node_id=self._node_id,
        )
        self._role_history.append(transition)

        logger.info(
            "Role transition: %s -> %s (reason=%s, epoch=%s)",
            old_role.value,
            new_role.value,
            reason,
            transition.epoch,
        )

        # Notify callbacks. Iterate over a snapshot: callbacks are awaited, so
        # the list may be mutated (add/remove) while notification is underway.
        for callback in tuple(self._callbacks):
            try:
                await callback(transition)
            except Exception as e:
                logger.error("Role change callback failed: %s", e)

    async def _monitor_fencing(self) -> None:
        """Background task to monitor fencing token validity.

        Every monitor interval, while PRIMARY, re-validates the token and
        moves to FENCED when validation reports it invalid.
        """
        while self._running:
            try:
                await asyncio.sleep(self._monitor_interval)

                if self._role is ProcessorRole.PRIMARY:
                    is_valid = await self.validate_fencing()
                    if not is_valid:
                        logger.warning("Fencing token validation failed")
                        await self._transition_to(
                            ProcessorRole.FENCED,
                            reason="Fencing token invalidated by another instance",
                        )

            except asyncio.CancelledError:
                break
            except Exception as e:
                # NOTE(review): an exception from validation is only logged and
                # the role stays PRIMARY. If the fencing backend signals a lost
                # token by raising rather than returning False, this should
                # fence instead - confirm against FencingManager.validate.
                logger.error("Fencing monitor error: %s", e)

    def get_role_history(self) -> list[RoleTransition]:
        """Get a copy of the list of role transitions, oldest first."""
        return self._role_history.copy()

    def get_status(self) -> dict[str, Any]:
        """Get current role status as dictionary."""
        return {
            "processor_id": self._processor_id,
            "node_id": self._node_id,
            "role": self._role.value,
            "can_process": self.can_process(),
            "can_write_state": self.can_write_state(),
            "fencing_epoch": self._fencing_token.epoch if self._fencing_token else None,
            "is_running": self._running,
            "transition_count": len(self._role_history),
        }