dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +32 -1
- dory/config/defaults.py +6 -0
- dory/config/schema.py +26 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +648 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +416 -0
- dory/health/server.py +283 -9
- dory/k8s/__init__.py +69 -0
- dory/k8s/labels.py +505 -0
- dory/migration/__init__.py +49 -0
- dory/migration/s3_store.py +656 -0
- dory/migration/state_manager.py +64 -6
- dory/migration/transfer.py +382 -0
- dory/migration/versioning.py +749 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/METADATA +37 -32
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/RECORD +22 -15
- dory_sdk-2.1.4.dist-info/entry_points.txt +2 -0
- dory/sidecar/__init__.py +0 -6
- dory/sidecar/main.py +0 -75
- dory/sidecar/server.py +0 -329
- dory_sdk-2.1.0.dist-info/entry_points.txt +0 -3
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/WHEEL +0 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/top_level.txt +0 -0
dory/edge/role.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
"""Role management for edge processor failover.
|
|
2
|
+
|
|
3
|
+
Provides role-based coordination between edge and cloud instances
|
|
4
|
+
during failover scenarios.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
import time
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Any, Callable, Awaitable
|
|
13
|
+
|
|
14
|
+
from dory.edge.fencing import (
|
|
15
|
+
FencingManager,
|
|
16
|
+
FencingToken,
|
|
17
|
+
FencingConfig,
|
|
18
|
+
FenceViolation,
|
|
19
|
+
StaleEpochError,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Module-level logger, named after this module's import path via __name__.
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ProcessorRole(Enum):
    """Roles a processor instance can hold during its lifetime.

    Documented transitions:
        INITIALIZING -> PRIMARY   (normal startup)
        INITIALIZING -> STANDBY   (cloud replica waiting for failover)
        PRIMARY      -> DRAINING  (graceful shutdown or failover)
        PRIMARY      -> FENCED    (lost fencing token)
        DRAINING     -> STOPPED
        STANDBY      -> PRIMARY   (failover promoted)
        FENCED       -> STOPPED
    """

    INITIALIZING = "initializing"
    PRIMARY = "primary"
    STANDBY = "standby"
    DRAINING = "draining"
    FENCED = "fenced"
    STOPPED = "stopped"

    def can_process(self) -> bool:
        """Return True only for PRIMARY, the single role that handles messages."""
        return self is ProcessorRole.PRIMARY

    def can_write_state(self) -> bool:
        """Return True for PRIMARY and DRAINING, the roles allowed to modify state."""
        return self is ProcessorRole.PRIMARY or self is ProcessorRole.DRAINING

    def is_terminal(self) -> bool:
        """Return True for STOPPED and FENCED, which this enum treats as final."""
        return self is ProcessorRole.STOPPED or self is ProcessorRole.FENCED
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class RoleTransition:
    """One recorded change of role.

    `timestamp` defaults to `time.time()` at construction; `reason`,
    `epoch` and `node_id` are optional context for the change.
    """

    from_role: ProcessorRole
    to_role: ProcessorRole
    timestamp: float = field(default_factory=time.time)
    reason: str = ""
    epoch: int | None = None
    node_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict of this record, with roles as their string values."""
        return dict(
            from_role=self.from_role.value,
            to_role=self.to_role.value,
            timestamp=self.timestamp,
            reason=self.reason,
            epoch=self.epoch,
            node_id=self.node_id,
        )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Type alias for role change callbacks
|
|
82
|
+
# An async callable that receives the RoleTransition record and returns None.
# RoleManager awaits every registered callback on each role change.
RoleChangeCallback = Callable[[RoleTransition], Awaitable[None]]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class RoleManager:
    """Manager for processor role transitions and fencing coordination.

    Coordinates role-based behavior for edge-to-cloud failover:
    - Edge processor starts as PRIMARY with fencing token
    - Cloud replica starts as STANDBY
    - On edge failure, cloud acquires fencing token and becomes PRIMARY
    - Edge instance becomes FENCED when it loses the token

    Every role change is appended to an in-memory history and reported to
    the registered callbacks, which are awaited one after another.

    Usage:
        fencing_config = FencingConfig(backend="redis")
        manager = RoleManager(
            processor_id="my-processor",
            node_id="edge-node-1",
            fencing_config=fencing_config,
        )

        # Start and acquire PRIMARY role
        await manager.start()
        assert manager.role == ProcessorRole.PRIMARY

        # Check role before processing
        if manager.can_process():
            process_message(msg)

        # Handle failover (e.g., when losing connectivity)
        manager.add_role_change_callback(handle_role_change)

        # Graceful shutdown
        await manager.stop()
    """

    def __init__(
        self,
        processor_id: str,
        node_id: str,
        fencing_config: FencingConfig | None = None,
        start_as_standby: bool = False,
        *,
        monitor_interval: float = 5.0,
    ):
        """Initialize role manager.

        Args:
            processor_id: Unique processor identifier
            node_id: Current node identifier (e.g., pod name)
            fencing_config: Fencing configuration
            start_as_standby: If True, start in STANDBY role
            monitor_interval: Seconds between background fencing-token
                checks while PRIMARY (default 5.0)

        Raises:
            ValueError: If monitor_interval is not positive.
        """
        if monitor_interval <= 0:
            raise ValueError("monitor_interval must be positive")

        self._processor_id = processor_id
        self._node_id = node_id
        self._start_as_standby = start_as_standby
        self._monitor_interval = monitor_interval

        self._fencing_manager = FencingManager(fencing_config)
        self._fencing_token: FencingToken | None = None

        self._role = ProcessorRole.INITIALIZING
        self._role_history: list[RoleTransition] = []
        self._callbacks: list[RoleChangeCallback] = []

        self._monitor_task: asyncio.Task | None = None
        self._running = False

    @property
    def role(self) -> ProcessorRole:
        """Current processor role."""
        return self._role

    @property
    def processor_id(self) -> str:
        """Processor identifier."""
        return self._processor_id

    @property
    def node_id(self) -> str:
        """Node identifier."""
        return self._node_id

    @property
    def fencing_token(self) -> FencingToken | None:
        """Fencing token currently held by this manager, or None.

        The token is set when PRIMARY is acquired and cleared when it is
        released by stop() or demote_to_standby().
        """
        return self._fencing_token

    def can_process(self) -> bool:
        """Check if processor can handle messages."""
        return self._role.can_process()

    def can_write_state(self) -> bool:
        """Check if processor can modify state."""
        return self._role.can_write_state()

    def add_role_change_callback(self, callback: RoleChangeCallback) -> None:
        """Register callback for role changes.

        Args:
            callback: Async function called on role transitions
        """
        self._callbacks.append(callback)

    def remove_role_change_callback(self, callback: RoleChangeCallback) -> None:
        """Remove role change callback; unknown callbacks are ignored."""
        if callback in self._callbacks:
            self._callbacks.remove(callback)

    async def start(self) -> ProcessorRole:
        """Start role management and acquire initial role.

        Calling start() again while the monitor task is still alive is a
        no-op that returns the current role, so no second monitor task is
        spawned.

        Returns:
            Initial role (PRIMARY or STANDBY)
        """
        if self._monitor_task is not None and not self._monitor_task.done():
            logger.warning("RoleManager already started; ignoring start()")
            return self._role

        self._running = True

        if self._start_as_standby:
            await self._transition_to(
                ProcessorRole.STANDBY,
                reason="Started as standby replica",
            )
        else:
            # Try to acquire PRIMARY role with fencing token
            try:
                self._fencing_token = await self._fencing_manager.acquire(
                    self._processor_id,
                    self._node_id,
                )
                await self._transition_to(
                    ProcessorRole.PRIMARY,
                    reason="Acquired fencing token",
                    epoch=self._fencing_token.epoch,
                )
            except Exception as e:
                # Any acquisition failure falls back to STANDBY.
                logger.error(f"Failed to acquire fencing token: {e}")
                await self._transition_to(
                    ProcessorRole.STANDBY,
                    reason=f"Failed to acquire fencing token: {e}",
                )

        # Start background monitoring
        self._monitor_task = asyncio.create_task(self._monitor_fencing())

        return self._role

    async def stop(self, reason: str = "Shutdown requested") -> None:
        """Stop role management and release resources.

        The STOPPED transition and the fencing manager close() run even if
        releasing the token raises; the release error still propagates.

        Args:
            reason: Reason for stopping
        """
        self._running = False

        # Stop monitoring
        monitor = self._monitor_task
        self._monitor_task = None
        # stop() may run inside a callback fired by the monitor task itself.
        # A task cannot cancel-and-await itself, so in that case rely on
        # _running=False to end the loop.
        if monitor is not None and monitor is not asyncio.current_task():
            monitor.cancel()
            try:
                await monitor
            except asyncio.CancelledError:
                pass

        # Drain if PRIMARY
        if self._role == ProcessorRole.PRIMARY:
            await self._transition_to(
                ProcessorRole.DRAINING,
                reason=reason,
            )

        try:
            # Release fencing token
            if self._fencing_token:
                await self._fencing_manager.release(self._fencing_token)
                self._fencing_token = None
        finally:
            try:
                await self._transition_to(
                    ProcessorRole.STOPPED,
                    reason=reason,
                )
            finally:
                # Close fencing manager
                await self._fencing_manager.close()

    async def promote_to_primary(self) -> bool:
        """Attempt to promote from STANDBY to PRIMARY.

        Returns:
            True if promotion succeeded; False if the current role is not
            STANDBY or acquiring the fencing token failed
        """
        if self._role != ProcessorRole.STANDBY:
            logger.warning(
                f"Cannot promote from {self._role.value}, "
                "must be STANDBY"
            )
            return False

        try:
            # Acquire fencing token
            self._fencing_token = await self._fencing_manager.acquire(
                self._processor_id,
                self._node_id,
            )

            await self._transition_to(
                ProcessorRole.PRIMARY,
                reason="Promoted from standby",
                epoch=self._fencing_token.epoch,
            )

            return True

        except Exception as e:
            logger.error(f"Failed to promote to PRIMARY: {e}")
            return False

    async def demote_to_standby(self, reason: str = "Demoted") -> None:
        """Demote from PRIMARY (or DRAINING) to STANDBY, releasing fencing token.

        Does nothing except log a warning when called from any other role.

        Args:
            reason: Reason for demotion
        """
        if self._role not in (ProcessorRole.PRIMARY, ProcessorRole.DRAINING):
            logger.warning(f"Cannot demote from {self._role.value}")
            return

        # Release fencing token
        if self._fencing_token:
            await self._fencing_manager.release(self._fencing_token)
            self._fencing_token = None

        await self._transition_to(
            ProcessorRole.STANDBY,
            reason=reason,
        )

    async def validate_fencing(self) -> bool:
        """Validate current fencing token is still valid.

        Returns:
            True if fencing token is valid (or not PRIMARY); False if
            PRIMARY without a token or the fencing manager rejects it
        """
        if self._role != ProcessorRole.PRIMARY:
            return True

        if not self._fencing_token:
            return False

        return await self._fencing_manager.validate(self._fencing_token)

    async def validate_fencing_or_fence(self) -> bool:
        """Validate fencing and transition to FENCED if stale.

        Returns:
            True if validation passed (see validate_fencing), False if it
            failed
        """
        if await self.validate_fencing():
            return True

        # The role may have changed while validation was awaited (for
        # example a concurrent demote or stop); only a PRIMARY is fenced.
        if self._role == ProcessorRole.PRIMARY:
            await self._transition_to(
                ProcessorRole.FENCED,
                reason="Fencing token became stale",
            )
        return False

    async def _transition_to(
        self,
        new_role: ProcessorRole,
        reason: str = "",
        epoch: int | None = None,
    ) -> None:
        """Transition to a new role.

        A transition to the current role is a no-op. Otherwise the change
        is recorded in the history, logged, and passed to every callback;
        a callback that raises is logged and does not stop the others.

        Args:
            new_role: Target role
            reason: Reason for transition
            epoch: Fencing epoch if applicable; when None, the epoch of the
                currently held token (if any) is recorded
        """
        if new_role == self._role:
            return

        old_role = self._role
        self._role = new_role

        # Compare against None explicitly so an epoch of 0 is preserved.
        if epoch is None and self._fencing_token:
            epoch = self._fencing_token.epoch

        transition = RoleTransition(
            from_role=old_role,
            to_role=new_role,
            reason=reason,
            epoch=epoch,
            node_id=self._node_id,
        )
        self._role_history.append(transition)

        logger.info(
            f"Role transition: {old_role.value} -> {new_role.value} "
            f"(reason={reason}, epoch={transition.epoch})"
        )

        # Notify callbacks. Iterate over a snapshot so a callback may add
        # or remove callbacks without disturbing this loop.
        for callback in tuple(self._callbacks):
            try:
                await callback(transition)
            except Exception as e:
                logger.error(f"Role change callback failed: {e}")

    async def _monitor_fencing(self) -> None:
        """Background task to monitor fencing token validity.

        While running, sleeps for the configured interval and, if the role
        is PRIMARY and the token fails validation, transitions to FENCED.
        """
        while self._running:
            try:
                await asyncio.sleep(self._monitor_interval)

                if self._role != ProcessorRole.PRIMARY:
                    continue
                if await self.validate_fencing():
                    continue
                # Re-check: the role may have changed during validation.
                if self._role != ProcessorRole.PRIMARY:
                    continue

                logger.warning("Fencing token validation failed")
                await self._transition_to(
                    ProcessorRole.FENCED,
                    reason="Fencing token invalidated by another instance",
                )

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the monitor alive across transient errors.
                logger.error(f"Fencing monitor error: {e}")

    def get_role_history(self) -> list[RoleTransition]:
        """Get a copy of the list of role transitions, oldest first."""
        return list(self._role_history)

    def get_status(self) -> dict[str, Any]:
        """Get current role status as dictionary."""
        token = self._fencing_token
        return {
            "processor_id": self._processor_id,
            "node_id": self._node_id,
            "role": self._role.value,
            "can_process": self.can_process(),
            "can_write_state": self.can_write_state(),
            "fencing_epoch": token.epoch if token else None,
            "is_running": self._running,
            "transition_count": len(self._role_history),
        }
|