dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- dory/__init__.py +32 -1
- dory/config/defaults.py +6 -0
- dory/config/schema.py +26 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +648 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +416 -0
- dory/health/server.py +283 -9
- dory/k8s/__init__.py +69 -0
- dory/k8s/labels.py +505 -0
- dory/migration/__init__.py +49 -0
- dory/migration/s3_store.py +656 -0
- dory/migration/state_manager.py +64 -6
- dory/migration/transfer.py +382 -0
- dory/migration/versioning.py +749 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/METADATA +37 -32
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/RECORD +22 -15
- dory_sdk-2.1.4.dist-info/entry_points.txt +2 -0
- dory/sidecar/__init__.py +0 -6
- dory/sidecar/main.py +0 -75
- dory/sidecar/server.py +0 -329
- dory_sdk-2.1.0.dist-info/entry_points.txt +0 -3
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/WHEEL +0 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/top_level.txt +0 -0
dory/edge/heartbeat.py
ADDED
@@ -0,0 +1,598 @@
"""Edge heartbeat service for Orchestrator integration.

Provides connectivity monitoring and health reporting for edge nodes
with intermittent connectivity to the cloud/Orchestrator.
"""

import asyncio
import logging
import os
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable, Awaitable

import aiohttp

from dory.edge.fencing import FencingToken
from dory.edge.role import ProcessorRole

logger = logging.getLogger(__name__)


class ConnectivityStatus(Enum):
    """Connectivity status to Orchestrator/cloud."""

    CONNECTED = "connected"
    DEGRADED = "degraded"  # High latency or packet loss
    DISCONNECTED = "disconnected"
    UNKNOWN = "unknown"


@dataclass
class HeartbeatConfig:
    """Configuration for edge heartbeat service."""

    # Orchestrator endpoint for heartbeat reporting
    orchestrator_url: str | None = None

    # Heartbeat interval in seconds
    interval_sec: float = 10.0

    # Timeout for heartbeat requests
    timeout_sec: float = 5.0

    # Number of missed heartbeats before considered disconnected
    missed_threshold: int = 3

    # Number of consecutive successes to transition from degraded to connected
    recovery_threshold: int = 2

    # Latency threshold for degraded status (ms)
    latency_threshold_ms: float = 500.0

    # Enable automatic role demotion on disconnect
    auto_demote_on_disconnect: bool = True

    # Grace period before demoting (seconds)
    demote_grace_period_sec: float = 30.0

    def __post_init__(self):
        """Load from environment if not provided."""
        if not self.orchestrator_url:
            self.orchestrator_url = os.environ.get(
                "DORY_ORCHESTRATOR_URL",
                "http://dory-orchestrator:8080",
            )


@dataclass
class HeartbeatPayload:
    """Payload sent to Orchestrator in heartbeat."""

    processor_id: str
    node_id: str
    role: str
    epoch: int | None
    timestamp: float = field(default_factory=time.time)
    status: str = "healthy"
    last_state_sync: float | None = None
    state_size_bytes: int | None = None
    uptime_sec: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "processor_id": self.processor_id,
            "node_id": self.node_id,
            "role": self.role,
            "epoch": self.epoch,
            "timestamp": self.timestamp,
            "status": self.status,
            "last_state_sync": self.last_state_sync,
            "state_size_bytes": self.state_size_bytes,
            "uptime_sec": self.uptime_sec,
        }


@dataclass
class HeartbeatResponse:
    """Response from Orchestrator heartbeat endpoint."""

    acknowledged: bool
    orchestrator_time: float
    directive: str | None = None  # "continue", "demote", "promote", "shutdown"
    message: str | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HeartbeatResponse":
        """Create from dictionary."""
        return cls(
            acknowledged=data.get("acknowledged", False),
            orchestrator_time=data.get("orchestrator_time", time.time()),
            directive=data.get("directive"),
            message=data.get("message"),
        )


@dataclass
class ConnectivityMetrics:
    """Metrics tracking connectivity health."""

    total_heartbeats: int = 0
    successful_heartbeats: int = 0
    failed_heartbeats: int = 0
    consecutive_failures: int = 0
    consecutive_successes: int = 0
    last_success_time: float | None = None
    last_failure_time: float | None = None
    last_latency_ms: float | None = None
    avg_latency_ms: float = 0.0
    max_latency_ms: float = 0.0

    def record_success(self, latency_ms: float) -> None:
        """Record a successful heartbeat."""
        self.total_heartbeats += 1
        self.successful_heartbeats += 1
        self.consecutive_successes += 1
        self.consecutive_failures = 0
        self.last_success_time = time.time()
        self.last_latency_ms = latency_ms

        # Update latency stats
        if self.successful_heartbeats == 1:
            self.avg_latency_ms = latency_ms
        else:
            # Exponential moving average
            self.avg_latency_ms = 0.9 * self.avg_latency_ms + 0.1 * latency_ms

        self.max_latency_ms = max(self.max_latency_ms, latency_ms)

    def record_failure(self) -> None:
        """Record a failed heartbeat."""
        self.total_heartbeats += 1
        self.failed_heartbeats += 1
        self.consecutive_failures += 1
        self.consecutive_successes = 0
        self.last_failure_time = time.time()

    def get_success_rate(self) -> float:
        """Get heartbeat success rate."""
        if self.total_heartbeats == 0:
            return 0.0
        return self.successful_heartbeats / self.total_heartbeats


# Type alias for connectivity change callback
ConnectivityChangeCallback = Callable[[ConnectivityStatus, ConnectivityStatus], Awaitable[None]]


class HeartbeatManager:
    """Manager for edge heartbeat reporting and connectivity monitoring.

    Periodically sends heartbeats to the Orchestrator and tracks
    connectivity status. Can automatically trigger role transitions
    when connectivity is lost.

    Usage:
        config = HeartbeatConfig(
            orchestrator_url="http://orchestrator:8080",
            interval_sec=10.0,
        )
        manager = HeartbeatManager(config)

        # Set processor info
        manager.set_processor_info(
            processor_id="my-processor",
            node_id="edge-node-1",
        )

        # Start heartbeat loop
        await manager.start()

        # Check connectivity
        if manager.is_connected():
            # Safe to sync state
            pass

        # Stop on shutdown
        await manager.stop()
    """

    def __init__(self, config: HeartbeatConfig | None = None):
        """Initialize heartbeat manager.

        Args:
            config: Heartbeat configuration
        """
        self._config = config or HeartbeatConfig()
        self._processor_id: str | None = None
        self._node_id: str | None = None
        self._role: ProcessorRole = ProcessorRole.INITIALIZING
        self._fencing_token: FencingToken | None = None

        self._status = ConnectivityStatus.UNKNOWN
        self._metrics = ConnectivityMetrics()
        self._callbacks: list[ConnectivityChangeCallback] = []

        self._session: aiohttp.ClientSession | None = None
        self._heartbeat_task: asyncio.Task | None = None
        self._running = False
        self._start_time: float = 0
        self._last_state_sync: float | None = None
        self._state_size_bytes: int | None = None

        # Track disconnect time for grace period
        self._disconnect_time: float | None = None

    @property
    def status(self) -> ConnectivityStatus:
        """Current connectivity status."""
        return self._status

    @property
    def metrics(self) -> ConnectivityMetrics:
        """Connectivity metrics."""
        return self._metrics

    def is_connected(self) -> bool:
        """Check if connected to Orchestrator."""
        return self._status == ConnectivityStatus.CONNECTED

    def is_disconnected(self) -> bool:
        """Check if disconnected from Orchestrator."""
        return self._status == ConnectivityStatus.DISCONNECTED

    def set_processor_info(
        self,
        processor_id: str,
        node_id: str,
        role: ProcessorRole | None = None,
        fencing_token: FencingToken | None = None,
    ) -> None:
        """Set processor information for heartbeat payload.

        Args:
            processor_id: Processor identifier
            node_id: Edge node identifier
            role: Current processor role
            fencing_token: Current fencing token
        """
        self._processor_id = processor_id
        self._node_id = node_id
        if role:
            self._role = role
        if fencing_token:
            self._fencing_token = fencing_token

    def update_role(self, role: ProcessorRole) -> None:
        """Update current processor role."""
        self._role = role

    def update_fencing_token(self, token: FencingToken | None) -> None:
        """Update current fencing token."""
        self._fencing_token = token

    def record_state_sync(self, size_bytes: int | None = None) -> None:
        """Record a successful state sync."""
        self._last_state_sync = time.time()
        if size_bytes is not None:
            self._state_size_bytes = size_bytes

    def add_connectivity_callback(self, callback: ConnectivityChangeCallback) -> None:
        """Register callback for connectivity changes.

        Args:
            callback: Async function called on status changes
        """
        self._callbacks.append(callback)

    def remove_connectivity_callback(self, callback: ConnectivityChangeCallback) -> None:
        """Remove connectivity callback."""
        if callback in self._callbacks:
            self._callbacks.remove(callback)

    async def start(self) -> None:
        """Start heartbeat service."""
        if self._running:
            return

        self._running = True
        self._start_time = time.time()

        # Create HTTP session
        timeout = aiohttp.ClientTimeout(total=self._config.timeout_sec)
        self._session = aiohttp.ClientSession(timeout=timeout)

        # Start heartbeat loop
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

        logger.info(
            f"Started heartbeat service: interval={self._config.interval_sec}s, "
            f"orchestrator={self._config.orchestrator_url}"
        )

    async def stop(self) -> None:
        """Stop heartbeat service."""
        self._running = False

        if self._heartbeat_task:
            self._heartbeat_task.cancel()
            try:
                await self._heartbeat_task
            except asyncio.CancelledError:
                pass
            self._heartbeat_task = None

        if self._session:
            await self._session.close()
            self._session = None

        logger.info("Stopped heartbeat service")

    async def send_heartbeat(self) -> HeartbeatResponse | None:
        """Send a single heartbeat to the Orchestrator.

        Returns:
            HeartbeatResponse if successful, None if failed
        """
        if not self._session or not self._processor_id or not self._node_id:
            return None

        payload = HeartbeatPayload(
            processor_id=self._processor_id,
            node_id=self._node_id,
            role=self._role.value,
            epoch=self._fencing_token.epoch if self._fencing_token else None,
            last_state_sync=self._last_state_sync,
            state_size_bytes=self._state_size_bytes,
            uptime_sec=time.time() - self._start_time,
        )

        url = f"{self._config.orchestrator_url}/api/v1/edge/heartbeat"

        start_time = time.monotonic()
        try:
            async with self._session.post(
                url,
                json=payload.to_dict(),
                headers={"Content-Type": "application/json"},
            ) as resp:
                latency_ms = (time.monotonic() - start_time) * 1000

                if resp.status == 200:
                    data = await resp.json()
                    response = HeartbeatResponse.from_dict(data)

                    self._metrics.record_success(latency_ms)
                    await self._update_status(latency_ms)

                    logger.debug(
                        f"Heartbeat succeeded: latency={latency_ms:.1f}ms, "
                        f"directive={response.directive}"
                    )

                    return response
                else:
                    logger.warning(
                        f"Heartbeat returned status {resp.status}"
                    )
                    self._metrics.record_failure()
                    await self._update_status(None)
                    return None

        except asyncio.TimeoutError:
            logger.warning(
                f"Heartbeat timeout after {self._config.timeout_sec}s"
            )
            self._metrics.record_failure()
            await self._update_status(None)
            return None

        except aiohttp.ClientError as e:
            logger.warning(f"Heartbeat failed: {e}")
            self._metrics.record_failure()
            await self._update_status(None)
            return None

    async def _heartbeat_loop(self) -> None:
        """Background heartbeat loop."""
        while self._running:
            try:
                response = await self.send_heartbeat()

                # Handle directives from Orchestrator
                if response and response.directive:
                    await self._handle_directive(response.directive)

                await asyncio.sleep(self._config.interval_sec)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Heartbeat loop error: {e}")
                await asyncio.sleep(self._config.interval_sec)

    async def _update_status(self, latency_ms: float | None) -> None:
        """Update connectivity status based on metrics."""
        old_status = self._status

        if latency_ms is not None:
            # Successful heartbeat
            self._disconnect_time = None

            if latency_ms > self._config.latency_threshold_ms:
                # High latency - degraded
                new_status = ConnectivityStatus.DEGRADED
            elif (
                self._status == ConnectivityStatus.DEGRADED
                and self._metrics.consecutive_successes < self._config.recovery_threshold
            ):
                # Still recovering from degraded
                new_status = ConnectivityStatus.DEGRADED
            else:
                new_status = ConnectivityStatus.CONNECTED
        else:
            # Failed heartbeat
            if self._metrics.consecutive_failures >= self._config.missed_threshold:
                new_status = ConnectivityStatus.DISCONNECTED

                # Track disconnect time for grace period
                if self._disconnect_time is None:
                    self._disconnect_time = time.time()
            elif self._metrics.consecutive_failures > 0:
                new_status = ConnectivityStatus.DEGRADED
            else:
                new_status = self._status

        if new_status != old_status:
            self._status = new_status
            logger.info(f"Connectivity status changed: {old_status.value} -> {new_status.value}")

            # Notify callbacks
            for callback in self._callbacks:
                try:
                    await callback(old_status, new_status)
                except Exception as e:
                    logger.error(f"Connectivity callback failed: {e}")

    async def _handle_directive(self, directive: str) -> None:
        """Handle directive from Orchestrator.

        Args:
            directive: Directive string ("continue", "demote", "promote", "shutdown")
        """
        logger.info(f"Received directive from Orchestrator: {directive}")

        if directive == "demote":
            # Orchestrator is requesting we demote to STANDBY
            # This could happen if another instance is being promoted
            logger.warning("Orchestrator requested demotion")
            # Note: actual demotion should be handled by caller via callback

        elif directive == "promote":
            # Orchestrator is requesting we promote to PRIMARY
            logger.info("Orchestrator requested promotion")

        elif directive == "shutdown":
            # Orchestrator is requesting graceful shutdown
            logger.warning("Orchestrator requested shutdown")

        # "continue" means keep doing what you're doing

    def get_status_dict(self) -> dict[str, Any]:
        """Get current status as dictionary."""
        return {
            "processor_id": self._processor_id,
            "node_id": self._node_id,
            "role": self._role.value,
            "connectivity": self._status.value,
            "is_connected": self.is_connected(),
            "fencing_epoch": self._fencing_token.epoch if self._fencing_token else None,
            "metrics": {
                "total_heartbeats": self._metrics.total_heartbeats,
                "success_rate": self._metrics.get_success_rate(),
                "consecutive_failures": self._metrics.consecutive_failures,
                "avg_latency_ms": self._metrics.avg_latency_ms,
                "last_latency_ms": self._metrics.last_latency_ms,
            },
            "last_state_sync": self._last_state_sync,
            "uptime_sec": time.time() - self._start_time if self._start_time else 0,
        }

    def should_demote(self) -> bool:
        """Check if should demote due to prolonged disconnect.

        Returns:
            True if disconnect grace period exceeded
        """
        if not self._config.auto_demote_on_disconnect:
            return False

        if self._status != ConnectivityStatus.DISCONNECTED:
            return False

        if self._disconnect_time is None:
            return False

        elapsed = time.time() - self._disconnect_time
        return elapsed >= self._config.demote_grace_period_sec


class EdgeHealthReporter:
    """Reports edge-specific health information to the HealthServer.

    Integrates with the existing HealthServer to provide edge-specific
    status information in health check responses.

    Usage:
        reporter = EdgeHealthReporter(
            heartbeat_manager=heartbeat_mgr,
            role_manager=role_mgr,
        )

        # Add to health server
        health_server.add_health_component("edge", reporter.get_health_status)
    """

    def __init__(
        self,
        heartbeat_manager: HeartbeatManager | None = None,
        role_manager: Any = None,  # RoleManager, avoiding circular import
    ):
        """Initialize health reporter.

        Args:
            heartbeat_manager: Heartbeat manager instance
            role_manager: Role manager instance
        """
        self._heartbeat = heartbeat_manager
        self._role_manager = role_manager

    def get_health_status(self) -> dict[str, Any]:
        """Get edge health status for health endpoint.

        Returns:
            Dictionary with edge health information
        """
        status: dict[str, Any] = {
            "is_edge": True,
        }

        if self._heartbeat:
            status["connectivity"] = self._heartbeat.status.value
            status["is_connected"] = self._heartbeat.is_connected()
            metrics = self._heartbeat.metrics
            status["heartbeat"] = {
                "success_rate": metrics.get_success_rate(),
                "avg_latency_ms": metrics.avg_latency_ms,
                "consecutive_failures": metrics.consecutive_failures,
            }

        if self._role_manager:
            status["role"] = self._role_manager.role.value
            status["can_process"] = self._role_manager.can_process()
            if self._role_manager.fencing_token:
                status["fencing_epoch"] = self._role_manager.fencing_token.epoch

        return status

    async def check_health(self) -> tuple[str, str]:
        """Async health check for HealthServer integration.

        Returns:
            Tuple of (status, message) where status is "healthy", "degraded", or "unhealthy"
        """
        if not self._heartbeat:
            return "healthy", "Edge health reporter not configured"

        connectivity = self._heartbeat.status

        if connectivity == ConnectivityStatus.CONNECTED:
            return "healthy", "Connected to Orchestrator"
        elif connectivity == ConnectivityStatus.DEGRADED:
            return "degraded", "Degraded connectivity to Orchestrator"
        elif connectivity == ConnectivityStatus.DISCONNECTED:
            return "unhealthy", "Disconnected from Orchestrator"
        else:
            return "degraded", "Connectivity status unknown"
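The module leaves actual role transitions to the caller: _handle_directive only logs the Orchestrator's directive, and should_demote() merely reports that the disconnect grace period has elapsed. The sketch below shows one plausible way to wire these pieces together using only the APIs added in this file; the role_manager object with its demote() method, and the processor/node identifiers, are illustrative assumptions rather than part of the package.

    import asyncio
    import logging

    from dory.edge.heartbeat import (
        ConnectivityStatus,
        HeartbeatConfig,
        HeartbeatManager,
    )

    log = logging.getLogger(__name__)


    async def run_edge_heartbeat(role_manager) -> None:
        # role_manager is a hypothetical caller-provided object exposing
        # .role and .demote(); this module intentionally does not act on it.
        manager = HeartbeatManager(HeartbeatConfig(interval_sec=10.0))
        manager.set_processor_info(processor_id="my-processor", node_id="edge-node-1")

        async def on_change(old: ConnectivityStatus, new: ConnectivityStatus) -> None:
            # Invoked by _update_status() whenever the connectivity status transitions.
            if new == ConnectivityStatus.DISCONNECTED:
                log.warning("Lost connectivity to Orchestrator; grace period started")

        manager.add_connectivity_callback(on_change)
        await manager.start()

        try:
            while True:
                # True only after demote_grace_period_sec of continuous disconnect.
                if manager.should_demote():
                    role_manager.demote()  # assumed caller-provided action
                    manager.update_role(role_manager.role)
                await asyncio.sleep(5)
        finally:
            await manager.stop()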