dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dory/edge/heartbeat.py ADDED
@@ -0,0 +1,598 @@
1
+ """Edge heartbeat service for Orchestrator integration.
2
+
3
+ Provides connectivity monitoring and health reporting for edge nodes
4
+ with intermittent connectivity to the cloud/Orchestrator.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import os
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any, Callable, Awaitable
14
+
15
+ import aiohttp
16
+
17
+ from dory.edge.fencing import FencingToken
18
+ from dory.edge.role import ProcessorRole
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class ConnectivityStatus(Enum):
    """Reachability of the Orchestrator/cloud as observed from this node."""

    # Heartbeats are being acknowledged normally.
    CONNECTED = "connected"
    # High latency or packet loss
    DEGRADED = "degraded"
    # Too many consecutive heartbeats have been missed.
    DISCONNECTED = "disconnected"
    # No determination has been made.
    UNKNOWN = "unknown"
30
+
31
+
32
+ @dataclass
33
+ class HeartbeatConfig:
34
+ """Configuration for edge heartbeat service."""
35
+
36
+ # Orchestrator endpoint for heartbeat reporting
37
+ orchestrator_url: str | None = None
38
+
39
+ # Heartbeat interval in seconds
40
+ interval_sec: float = 10.0
41
+
42
+ # Timeout for heartbeat requests
43
+ timeout_sec: float = 5.0
44
+
45
+ # Number of missed heartbeats before considered disconnected
46
+ missed_threshold: int = 3
47
+
48
+ # Number of consecutive successes to transition from degraded to connected
49
+ recovery_threshold: int = 2
50
+
51
+ # Latency threshold for degraded status (ms)
52
+ latency_threshold_ms: float = 500.0
53
+
54
+ # Enable automatic role demotion on disconnect
55
+ auto_demote_on_disconnect: bool = True
56
+
57
+ # Grace period before demoting (seconds)
58
+ demote_grace_period_sec: float = 30.0
59
+
60
+ def __post_init__(self):
61
+ """Load from environment if not provided."""
62
+ if not self.orchestrator_url:
63
+ self.orchestrator_url = os.environ.get(
64
+ "DORY_ORCHESTRATOR_URL",
65
+ "http://dory-orchestrator:8080",
66
+ )
67
+
68
+
69
+ @dataclass
70
+ class HeartbeatPayload:
71
+ """Payload sent to Orchestrator in heartbeat."""
72
+
73
+ processor_id: str
74
+ node_id: str
75
+ role: str
76
+ epoch: int | None
77
+ timestamp: float = field(default_factory=time.time)
78
+ status: str = "healthy"
79
+ last_state_sync: float | None = None
80
+ state_size_bytes: int | None = None
81
+ uptime_sec: float = 0.0
82
+
83
+ def to_dict(self) -> dict[str, Any]:
84
+ """Convert to dictionary for JSON serialization."""
85
+ return {
86
+ "processor_id": self.processor_id,
87
+ "node_id": self.node_id,
88
+ "role": self.role,
89
+ "epoch": self.epoch,
90
+ "timestamp": self.timestamp,
91
+ "status": self.status,
92
+ "last_state_sync": self.last_state_sync,
93
+ "state_size_bytes": self.state_size_bytes,
94
+ "uptime_sec": self.uptime_sec,
95
+ }
96
+
97
+
98
+ @dataclass
99
+ class HeartbeatResponse:
100
+ """Response from Orchestrator heartbeat endpoint."""
101
+
102
+ acknowledged: bool
103
+ orchestrator_time: float
104
+ directive: str | None = None # "continue", "demote", "promote", "shutdown"
105
+ message: str | None = None
106
+
107
+ @classmethod
108
+ def from_dict(cls, data: dict[str, Any]) -> "HeartbeatResponse":
109
+ """Create from dictionary."""
110
+ return cls(
111
+ acknowledged=data.get("acknowledged", False),
112
+ orchestrator_time=data.get("orchestrator_time", time.time()),
113
+ directive=data.get("directive"),
114
+ message=data.get("message"),
115
+ )
116
+
117
+
118
+ @dataclass
119
+ class ConnectivityMetrics:
120
+ """Metrics tracking connectivity health."""
121
+
122
+ total_heartbeats: int = 0
123
+ successful_heartbeats: int = 0
124
+ failed_heartbeats: int = 0
125
+ consecutive_failures: int = 0
126
+ consecutive_successes: int = 0
127
+ last_success_time: float | None = None
128
+ last_failure_time: float | None = None
129
+ last_latency_ms: float | None = None
130
+ avg_latency_ms: float = 0.0
131
+ max_latency_ms: float = 0.0
132
+
133
+ def record_success(self, latency_ms: float) -> None:
134
+ """Record a successful heartbeat."""
135
+ self.total_heartbeats += 1
136
+ self.successful_heartbeats += 1
137
+ self.consecutive_successes += 1
138
+ self.consecutive_failures = 0
139
+ self.last_success_time = time.time()
140
+ self.last_latency_ms = latency_ms
141
+
142
+ # Update latency stats
143
+ if self.successful_heartbeats == 1:
144
+ self.avg_latency_ms = latency_ms
145
+ else:
146
+ # Exponential moving average
147
+ self.avg_latency_ms = 0.9 * self.avg_latency_ms + 0.1 * latency_ms
148
+
149
+ self.max_latency_ms = max(self.max_latency_ms, latency_ms)
150
+
151
+ def record_failure(self) -> None:
152
+ """Record a failed heartbeat."""
153
+ self.total_heartbeats += 1
154
+ self.failed_heartbeats += 1
155
+ self.consecutive_failures += 1
156
+ self.consecutive_successes = 0
157
+ self.last_failure_time = time.time()
158
+
159
+ def get_success_rate(self) -> float:
160
+ """Get heartbeat success rate."""
161
+ if self.total_heartbeats == 0:
162
+ return 0.0
163
+ return self.successful_heartbeats / self.total_heartbeats
164
+
165
+
166
# Signature of connectivity-change listeners: an async callable that is
# awaited as ``callback(old_status, new_status)``.
ConnectivityChangeCallback = Callable[
    [ConnectivityStatus, ConnectivityStatus],
    Awaitable[None],
]
168
+
169
+
170
class HeartbeatManager:
    """Manager for edge heartbeat reporting and connectivity monitoring.

    Periodically sends heartbeats to the Orchestrator and tracks
    connectivity status. Registered callbacks are awaited whenever the
    status changes, and ``should_demote()`` reports when a disconnect has
    outlasted the configured grace period.

    Usage:
        config = HeartbeatConfig(
            orchestrator_url="http://orchestrator:8080",
            interval_sec=10.0,
        )
        manager = HeartbeatManager(config)

        # Set processor info
        manager.set_processor_info(
            processor_id="my-processor",
            node_id="edge-node-1",
        )

        # Start heartbeat loop
        await manager.start()

        # Check connectivity
        if manager.is_connected():
            # Safe to sync state
            pass

        # Stop on shutdown
        await manager.stop()
    """

    def __init__(self, config: HeartbeatConfig | None = None):
        """Initialize heartbeat manager.

        Args:
            config: Heartbeat configuration; a default ``HeartbeatConfig``
                is created when omitted.
        """
        self._config = config or HeartbeatConfig()
        self._processor_id: str | None = None
        self._node_id: str | None = None
        self._role: ProcessorRole = ProcessorRole.INITIALIZING
        self._fencing_token: FencingToken | None = None

        self._status = ConnectivityStatus.UNKNOWN
        self._metrics = ConnectivityMetrics()
        self._callbacks: list[ConnectivityChangeCallback] = []

        self._session: aiohttp.ClientSession | None = None
        self._heartbeat_task: asyncio.Task | None = None
        self._running = False
        self._start_time: float = 0
        self._last_state_sync: float | None = None
        self._state_size_bytes: int | None = None

        # Track disconnect time for grace period
        self._disconnect_time: float | None = None

    @property
    def status(self) -> ConnectivityStatus:
        """Current connectivity status."""
        return self._status

    @property
    def metrics(self) -> ConnectivityMetrics:
        """Connectivity metrics."""
        return self._metrics

    def is_connected(self) -> bool:
        """Check if connected to Orchestrator."""
        return self._status == ConnectivityStatus.CONNECTED

    def is_disconnected(self) -> bool:
        """Check if disconnected from Orchestrator."""
        return self._status == ConnectivityStatus.DISCONNECTED

    def set_processor_info(
        self,
        processor_id: str,
        node_id: str,
        role: ProcessorRole | None = None,
        fencing_token: FencingToken | None = None,
    ) -> None:
        """Set processor information for heartbeat payload.

        Args:
            processor_id: Processor identifier
            node_id: Edge node identifier
            role: Current processor role; left unchanged when ``None``
            fencing_token: Current fencing token; left unchanged when ``None``
        """
        self._processor_id = processor_id
        self._node_id = node_id
        # Compare against None explicitly so that a valid value that happens
        # to be falsy is not silently ignored.
        if role is not None:
            self._role = role
        if fencing_token is not None:
            self._fencing_token = fencing_token

    def update_role(self, role: ProcessorRole) -> None:
        """Update current processor role."""
        self._role = role

    def update_fencing_token(self, token: FencingToken | None) -> None:
        """Update current fencing token (``None`` clears it)."""
        self._fencing_token = token

    def record_state_sync(self, size_bytes: int | None = None) -> None:
        """Record a successful state sync.

        Args:
            size_bytes: Size of the synced state; the previously recorded
                size is kept when ``None``.
        """
        self._last_state_sync = time.time()
        if size_bytes is not None:
            self._state_size_bytes = size_bytes

    def add_connectivity_callback(self, callback: ConnectivityChangeCallback) -> None:
        """Register callback for connectivity changes.

        Args:
            callback: Async function awaited as ``callback(old, new)`` on
                status changes
        """
        self._callbacks.append(callback)

    def remove_connectivity_callback(self, callback: ConnectivityChangeCallback) -> None:
        """Remove connectivity callback (no-op if it is not registered)."""
        if callback in self._callbacks:
            self._callbacks.remove(callback)

    async def start(self) -> None:
        """Start heartbeat service.

        Creates the HTTP session and the background heartbeat task. Calling
        this while already running does nothing.
        """
        if self._running:
            return

        self._running = True
        self._start_time = time.time()

        # Create HTTP session
        timeout = aiohttp.ClientTimeout(total=self._config.timeout_sec)
        self._session = aiohttp.ClientSession(timeout=timeout)

        # Start heartbeat loop
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

        logger.info(
            f"Started heartbeat service: interval={self._config.interval_sec}s, "
            f"orchestrator={self._config.orchestrator_url}"
        )

    async def stop(self) -> None:
        """Stop heartbeat service.

        Cancels the background task, waits for it to finish, and closes the
        HTTP session.
        """
        self._running = False

        if self._heartbeat_task:
            self._heartbeat_task.cancel()
            try:
                await self._heartbeat_task
            except asyncio.CancelledError:
                pass
            self._heartbeat_task = None

        if self._session:
            await self._session.close()
            self._session = None

        logger.info("Stopped heartbeat service")

    async def send_heartbeat(self) -> HeartbeatResponse | None:
        """Send a single heartbeat to the Orchestrator.

        Any failure (non-200 status, timeout, client error, or a 200 reply
        whose body is not a JSON object) is recorded in the metrics and
        reflected in the connectivity status.

        Returns:
            HeartbeatResponse if successful, None if failed or if the
            service is not started / processor info is not set.
        """
        if not self._session or not self._processor_id or not self._node_id:
            return None

        payload = HeartbeatPayload(
            processor_id=self._processor_id,
            node_id=self._node_id,
            role=self._role.value,
            epoch=self._fencing_token.epoch if self._fencing_token else None,
            last_state_sync=self._last_state_sync,
            state_size_bytes=self._state_size_bytes,
            uptime_sec=time.time() - self._start_time,
        )

        # Tolerate a trailing slash in the configured base URL so the request
        # path does not start with "//".
        base_url = (self._config.orchestrator_url or "").rstrip("/")
        url = f"{base_url}/api/v1/edge/heartbeat"

        start_time = time.monotonic()
        try:
            async with self._session.post(
                url,
                json=payload.to_dict(),
                headers={"Content-Type": "application/json"},
            ) as resp:
                # Latency is measured up to receipt of the response headers.
                latency_ms = (time.monotonic() - start_time) * 1000

                if resp.status != 200:
                    logger.warning(
                        f"Heartbeat returned status {resp.status}"
                    )
                    await self._record_failed_heartbeat()
                    return None

                try:
                    data = await resp.json()
                    if not isinstance(data, dict):
                        raise ValueError(
                            f"expected a JSON object, got {type(data).__name__}"
                        )
                except ValueError as e:
                    # Undecodable or non-object body: count it as a failed
                    # heartbeat instead of letting the error bypass metrics.
                    logger.warning(f"Heartbeat response malformed: {e}")
                    await self._record_failed_heartbeat()
                    return None

                response = HeartbeatResponse.from_dict(data)

                self._metrics.record_success(latency_ms)
                await self._update_status(latency_ms)

                logger.debug(
                    f"Heartbeat succeeded: latency={latency_ms:.1f}ms, "
                    f"directive={response.directive}"
                )

                return response

        except asyncio.TimeoutError:
            logger.warning(
                f"Heartbeat timeout after {self._config.timeout_sec}s"
            )
            await self._record_failed_heartbeat()
            return None

        except aiohttp.ClientError as e:
            logger.warning(f"Heartbeat failed: {e}")
            await self._record_failed_heartbeat()
            return None

    async def _record_failed_heartbeat(self) -> None:
        """Record one failed heartbeat and re-evaluate connectivity status."""
        self._metrics.record_failure()
        await self._update_status(None)

    async def _heartbeat_loop(self) -> None:
        """Background heartbeat loop.

        Sends a heartbeat, handles any directive in the reply, then sleeps
        for the configured interval. Unexpected errors are logged and the
        loop continues; cancellation ends the loop.
        """
        while self._running:
            try:
                response = await self.send_heartbeat()

                # Handle directives from Orchestrator
                if response and response.directive:
                    await self._handle_directive(response.directive)

                await asyncio.sleep(self._config.interval_sec)

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive, but log with traceback so unexpected
                # errors can be diagnosed.
                logger.exception(f"Heartbeat loop error: {e}")
                await asyncio.sleep(self._config.interval_sec)

    async def _update_status(self, latency_ms: float | None) -> None:
        """Update connectivity status based on metrics.

        Args:
            latency_ms: Latency of the heartbeat that just succeeded, or
                ``None`` when the heartbeat failed.
        """
        old_status = self._status

        if latency_ms is not None:
            # Successful heartbeat
            self._disconnect_time = None

            if latency_ms > self._config.latency_threshold_ms:
                # High latency - degraded
                new_status = ConnectivityStatus.DEGRADED
            elif (
                self._status == ConnectivityStatus.DEGRADED
                and self._metrics.consecutive_successes < self._config.recovery_threshold
            ):
                # Still recovering from degraded
                new_status = ConnectivityStatus.DEGRADED
            else:
                new_status = ConnectivityStatus.CONNECTED
        else:
            # Failed heartbeat
            if self._metrics.consecutive_failures >= self._config.missed_threshold:
                new_status = ConnectivityStatus.DISCONNECTED

                # Track disconnect time for grace period
                if self._disconnect_time is None:
                    self._disconnect_time = time.time()
            elif self._metrics.consecutive_failures > 0:
                new_status = ConnectivityStatus.DEGRADED
            else:
                new_status = self._status

        if new_status != old_status:
            self._status = new_status
            logger.info(f"Connectivity status changed: {old_status.value} -> {new_status.value}")

            # Notify callbacks. Iterate over a snapshot: a callback may
            # register or unregister callbacks while it is being awaited,
            # and mutating the list mid-iteration would skip entries.
            for callback in tuple(self._callbacks):
                try:
                    await callback(old_status, new_status)
                except Exception as e:
                    logger.error(f"Connectivity callback failed: {e}")

    async def _handle_directive(self, directive: str) -> None:
        """Handle directive from Orchestrator.

        Directives are only logged here; no role change or shutdown is
        performed by this method.

        Args:
            directive: Directive string ("continue", "demote", "promote", "shutdown")
        """
        logger.info(f"Received directive from Orchestrator: {directive}")

        if directive == "demote":
            # Orchestrator is requesting we demote to STANDBY
            # This could happen if another instance is being promoted
            logger.warning("Orchestrator requested demotion")
            # Note: actual demotion should be handled by caller via callback

        elif directive == "promote":
            # Orchestrator is requesting we promote to PRIMARY
            logger.info("Orchestrator requested promotion")

        elif directive == "shutdown":
            # Orchestrator is requesting graceful shutdown
            logger.warning("Orchestrator requested shutdown")

        # "continue" means keep doing what you're doing

    def get_status_dict(self) -> dict[str, Any]:
        """Get current status as dictionary."""
        return {
            "processor_id": self._processor_id,
            "node_id": self._node_id,
            "role": self._role.value,
            "connectivity": self._status.value,
            "is_connected": self.is_connected(),
            "fencing_epoch": self._fencing_token.epoch if self._fencing_token else None,
            "metrics": {
                "total_heartbeats": self._metrics.total_heartbeats,
                "success_rate": self._metrics.get_success_rate(),
                "consecutive_failures": self._metrics.consecutive_failures,
                "avg_latency_ms": self._metrics.avg_latency_ms,
                "last_latency_ms": self._metrics.last_latency_ms,
            },
            "last_state_sync": self._last_state_sync,
            # Uptime is 0 until start() has recorded a start time.
            "uptime_sec": time.time() - self._start_time if self._start_time else 0,
        }

    def should_demote(self) -> bool:
        """Check if should demote due to prolonged disconnect.

        Returns:
            True if auto-demotion is enabled, the status is DISCONNECTED,
            and the disconnect grace period has been exceeded
        """
        if not self._config.auto_demote_on_disconnect:
            return False

        if self._status != ConnectivityStatus.DISCONNECTED:
            return False

        if self._disconnect_time is None:
            return False

        elapsed = time.time() - self._disconnect_time
        return elapsed >= self._config.demote_grace_period_sec
520
+
521
+
522
class EdgeHealthReporter:
    """Exposes edge-specific health information for the HealthServer.

    Combines data from a heartbeat manager and a role manager into the
    structures the health endpoint consumes.

    Usage:
        reporter = EdgeHealthReporter(
            heartbeat_manager=heartbeat_mgr,
            role_manager=role_mgr,
        )

        # Add to health server
        health_server.add_health_component("edge", reporter.get_health_status)
    """

    def __init__(
        self,
        heartbeat_manager: HeartbeatManager | None = None,
        role_manager: Any = None,  # RoleManager, avoiding circular import
    ):
        """Store the collaborators used to build health reports.

        Args:
            heartbeat_manager: Source of connectivity status and metrics
            role_manager: Source of role and fencing information
        """
        self._heartbeat = heartbeat_manager
        self._role_manager = role_manager

    def get_health_status(self) -> dict[str, Any]:
        """Build the edge section of the health endpoint payload.

        Returns:
            Dictionary that always contains ``is_edge``; connectivity and
            role details are added when the respective manager is set.
        """
        report: dict[str, Any] = {"is_edge": True}

        heartbeat = self._heartbeat
        if heartbeat:
            report["connectivity"] = heartbeat.status.value
            report["is_connected"] = heartbeat.is_connected()
            stats = heartbeat.metrics
            report["heartbeat"] = {
                "success_rate": stats.get_success_rate(),
                "avg_latency_ms": stats.avg_latency_ms,
                "consecutive_failures": stats.consecutive_failures,
            }

        roles = self._role_manager
        if roles:
            report.update(
                role=roles.role.value,
                can_process=roles.can_process(),
            )
            # The epoch is only reported while a fencing token is held.
            if roles.fencing_token:
                report["fencing_epoch"] = roles.fencing_token.epoch

        return report

    async def check_health(self) -> tuple[str, str]:
        """Async health check for HealthServer integration.

        Returns:
            Tuple of (status, message) where status is "healthy", "degraded", or "unhealthy"
        """
        if not self._heartbeat:
            return "healthy", "Edge health reporter not configured"

        verdicts = {
            ConnectivityStatus.CONNECTED: ("healthy", "Connected to Orchestrator"),
            ConnectivityStatus.DEGRADED: ("degraded", "Degraded connectivity to Orchestrator"),
            ConnectivityStatus.DISCONNECTED: ("unhealthy", "Disconnected from Orchestrator"),
        }
        # Any other status (i.e. UNKNOWN) is reported as degraded.
        return verdicts.get(
            self._heartbeat.status,
            ("degraded", "Connectivity status unknown"),
        )