kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -7
- kailash/cli/__init__.py +11 -1
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/circuit_breaker.py +71 -1
- kailash/core/resilience/health_monitor.py +172 -0
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/base.py +545 -0
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +392 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/ports.py +676 -0
- kailash/runtime/local.py +344 -1
- kailash/runtime/validation/__init__.py +20 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +234 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/migration.py +3 -3
- kailash/workflow/type_inference.py +669 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
kailash/edge/coordination/global_ordering.py

@@ -0,0 +1,355 @@
+"""Global ordering service for distributed events using hybrid logical clocks."""
+
+import asyncio
+import hashlib
+import json
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+
+
+class HybridLogicalClock:
+    """Hybrid Logical Clock (HLC) implementation for global ordering.
+
+    Combines physical time with logical counters to provide:
+    - Causally consistent timestamps
+    - Tolerance for clock skew
+    - Total ordering of events
+    """
+
+    def __init__(self, node_id: str):
+        """Initialize HLC.
+
+        Args:
+            node_id: Unique identifier for this node
+        """
+        self.node_id = node_id
+        self.logical_time = 0
+        self.logical_counter = 0
+        self._lock = asyncio.Lock()
+
+    async def now(self) -> Tuple[int, int, str]:
+        """Get current HLC timestamp.
+
+        Returns:
+            Tuple of (logical_time, logical_counter, node_id)
+        """
+        async with self._lock:
+            physical_time = int(datetime.now().timestamp() * 1000)  # Milliseconds
+
+            if physical_time > self.logical_time:
+                self.logical_time = physical_time
+                self.logical_counter = 0
+            else:
+                self.logical_counter += 1
+
+            return (self.logical_time, self.logical_counter, self.node_id)
+
+    async def update(self, remote_time: int, remote_counter: int):
+        """Update clock with remote timestamp.
+
+        Args:
+            remote_time: Remote logical time
+            remote_counter: Remote logical counter
+        """
+        async with self._lock:
+            physical_time = int(datetime.now().timestamp() * 1000)
+
+            if physical_time > max(self.logical_time, remote_time):
+                self.logical_time = physical_time
+                self.logical_counter = 0
+            elif self.logical_time == remote_time:
+                self.logical_counter = max(self.logical_counter, remote_counter) + 1
+            elif self.logical_time < remote_time:
+                self.logical_time = remote_time
+                self.logical_counter = remote_counter + 1
+            else:
+                self.logical_counter += 1
+
+    def compare(self, ts1: Tuple[int, int, str], ts2: Tuple[int, int, str]) -> int:
+        """Compare two HLC timestamps.
+
+        Args:
+            ts1: First timestamp
+            ts2: Second timestamp
+
+        Returns:
+            -1 if ts1 < ts2, 0 if equal, 1 if ts1 > ts2
+        """
+        if ts1[0] != ts2[0]:
+            return -1 if ts1[0] < ts2[0] else 1
+        if ts1[1] != ts2[1]:
+            return -1 if ts1[1] < ts2[1] else 1
+        if ts1[2] != ts2[2]:
+            return -1 if ts1[2] < ts2[2] else 1
+        return 0
+
+
+class GlobalOrderingService:
+    """Global ordering service for distributed events.
+
+    Provides:
+    - Total ordering of events across edge nodes
+    - Causal dependency tracking
+    - Conflict detection and resolution
+    - Event deduplication
+    """
+
+    def __init__(self, node_id: str):
+        """Initialize global ordering service.
+
+        Args:
+            node_id: Unique identifier for this node
+        """
+        self.node_id = node_id
+        self.clock = HybridLogicalClock(node_id)
+        self.event_history: List[Dict[str, Any]] = []
+        self.causal_graph: Dict[str, List[str]] = defaultdict(list)
+        self.seen_events: set = set()
+        self._lock = asyncio.Lock()
+
+    async def order_events(self, events: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Order a list of events globally.
+
+        Args:
+            events: List of events to order
+
+        Returns:
+            Dict with ordered events and metadata
+        """
+        async with self._lock:
+            ordered_events = []
+
+            for event in events:
+                # Generate event ID if not present
+                if "id" not in event:
+                    event["id"] = self._generate_event_id(event)
+
+                # Skip duplicates
+                if event["id"] in self.seen_events:
+                    continue
+
+                # Assign HLC timestamp
+                timestamp = await self.clock.now()
+                event["hlc_timestamp"] = timestamp
+                event["hlc_time"] = timestamp[0]
+                event["hlc_counter"] = timestamp[1]
+                event["hlc_node"] = timestamp[2]
+
+                # Track causal dependencies
+                if "depends_on" in event:
+                    for dep in event["depends_on"]:
+                        self.causal_graph[event["id"]].append(dep)
+
+                ordered_events.append(event)
+                self.seen_events.add(event["id"])
+
+            # Sort by HLC timestamp
+            ordered_events.sort(
+                key=lambda e: (e["hlc_time"], e["hlc_counter"], e["hlc_node"])
+            )
+
+            # Add to history
+            self.event_history.extend(ordered_events)
+
+            return {
+                "ordered_events": ordered_events,
+                "logical_clock": self.clock.logical_time,
+                "causal_dependencies": dict(self.causal_graph),
+                "total_events": len(self.event_history),
+            }
+
+    async def merge_histories(
+        self, remote_history: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Merge remote event history with local history.
+
+        Args:
+            remote_history: Event history from remote node
+
+        Returns:
+            Dict with merged history and conflict information
+        """
+        async with self._lock:
+            conflicts = []
+            merged_events = []
+
+            # Update clock with remote timestamps
+            for event in remote_history:
+                if "hlc_time" in event and "hlc_counter" in event:
+                    await self.clock.update(event["hlc_time"], event["hlc_counter"])
+
+            # Merge histories
+            local_by_id = {e["id"]: e for e in self.event_history if "id" in e}
+
+            for remote_event in remote_history:
+                event_id = remote_event.get("id")
+                if not event_id:
+                    continue
+
+                if event_id in local_by_id:
+                    # Check for conflicts
+                    local_event = local_by_id[event_id]
+                    if self._events_conflict(local_event, remote_event):
+                        conflicts.append(
+                            {
+                                "event_id": event_id,
+                                "local": local_event,
+                                "remote": remote_event,
+                            }
+                        )
+                        # Keep event with higher timestamp
+                        if self._compare_event_timestamps(remote_event, local_event) > 0:
+                            local_by_id[event_id] = remote_event
+                else:
+                    # New event
+                    local_by_id[event_id] = remote_event
+                    self.seen_events.add(event_id)
+
+            # Rebuild ordered history
+            self.event_history = list(local_by_id.values())
+            self.event_history.sort(
+                key=lambda e: (
+                    e.get("hlc_time", 0),
+                    e.get("hlc_counter", 0),
+                    e.get("hlc_node", ""),
+                )
+            )
+
+            return {
+                "merged_events": len(self.event_history),
+                "conflicts": conflicts,
+                "conflict_count": len(conflicts),
+                "logical_clock": self.clock.logical_time,
+            }
+
+    def get_causal_order(self, event_id: str) -> List[str]:
+        """Get causal ordering for an event.
+
+        Args:
+            event_id: Event ID to get dependencies for
+
+        Returns:
+            List of event IDs that must precede this event
+        """
+        visited = set()
+        order = []
+
+        def dfs(eid: str):
+            if eid in visited:
+                return
+            visited.add(eid)
+
+            for dep in self.causal_graph.get(eid, []):
+                dfs(dep)
+
+            order.append(eid)
+
+        dfs(event_id)
+        return order[:-1]  # Exclude the event itself
+
+    def detect_causal_violations(self) -> List[Dict[str, Any]]:
+        """Detect violations of causal ordering.
+
+        Returns:
+            List of violations found
+        """
+        violations = []
+        event_positions = {
+            e["id"]: i for i, e in enumerate(self.event_history) if "id" in e
+        }
+
+        for event_id, deps in self.causal_graph.items():
+            event_pos = event_positions.get(event_id)
+            if event_pos is None:
+                continue
+
+            for dep in deps:
+                dep_pos = event_positions.get(dep)
+                if dep_pos is None:
+                    violations.append(
+                        {
+                            "type": "missing_dependency",
+                            "event": event_id,
+                            "missing": dep,
+                        }
+                    )
+                elif dep_pos > event_pos:
+                    violations.append(
+                        {
+                            "type": "causal_violation",
+                            "event": event_id,
+                            "dependency": dep,
+                            "event_position": event_pos,
+                            "dependency_position": dep_pos,
+                        }
+                    )
+
+        return violations
+
+    def _generate_event_id(self, event: Dict[str, Any]) -> str:
+        """Generate unique event ID.
+
+        Args:
+            event: Event data
+
+        Returns:
+            Unique event ID
+        """
+        # Create deterministic ID from event content
+        content = json.dumps(event, sort_keys=True)
+        hash_obj = hashlib.sha256(content.encode())
+        return f"event_{hash_obj.hexdigest()[:16]}_{self.node_id}"
+
+    def _events_conflict(self, event1: Dict[str, Any], event2: Dict[str, Any]) -> bool:
+        """Check if two events conflict.
+
+        Args:
+            event1: First event
+            event2: Second event
+
+        Returns:
+            True if events conflict
+        """
+        # Events conflict if they have same ID but different content
+        if event1.get("id") != event2.get("id"):
+            return False
+
+        # Compare non-timestamp fields
+        e1_copy = {
+            k: v
+            for k, v in event1.items()
+            if not k.startswith("hlc_") and k != "timestamp"
+        }
+        e2_copy = {
+            k: v
+            for k, v in event2.items()
+            if not k.startswith("hlc_") and k != "timestamp"
+        }
+
+        return e1_copy != e2_copy
+
+    def _compare_event_timestamps(
+        self, event1: Dict[str, Any], event2: Dict[str, Any]
+    ) -> int:
+        """Compare event timestamps.
+
+        Args:
+            event1: First event
+            event2: Second event
+
+        Returns:
+            -1 if event1 < event2, 0 if equal, 1 if event1 > event2
+        """
+        ts1 = (
+            event1.get("hlc_time", 0),
+            event1.get("hlc_counter", 0),
+            event1.get("hlc_node", ""),
+        )
+        ts2 = (
+            event2.get("hlc_time", 0),
+            event2.get("hlc_counter", 0),
+            event2.get("hlc_node", ""),
+        )
+
+        return self.clock.compare(ts1, ts2)
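For orientation, here is a minimal usage sketch of the ordering service added above; it is not part of the diff. It assumes the module is importable as `kailash.edge.coordination.global_ordering` (consistent with the file list); the node ID and event payloads are illustrative.

```python
import asyncio

from kailash.edge.coordination.global_ordering import GlobalOrderingService


async def main() -> None:
    # One service per edge node, keyed by a unique node ID (illustrative value).
    svc = GlobalOrderingService(node_id="edge-1")

    # Events without an "id" receive a deterministic content-hash ID;
    # "depends_on" entries are recorded in the service's causal graph.
    result = await svc.order_events(
        [
            {"type": "write", "key": "user:42", "value": 1},
            {"type": "write", "key": "user:42", "value": 2, "depends_on": []},
        ]
    )
    for event in result["ordered_events"]:
        # Ordering key is the HLC tuple (hlc_time, hlc_counter, hlc_node).
        print(event["hlc_timestamp"], event["key"], event["value"])

    # Merging a peer's history advances the local clock and surfaces conflicts
    # (same event ID, different non-timestamp content); empty here for brevity.
    merged = await svc.merge_histories(remote_history=[])
    print(merged["conflict_count"], svc.detect_causal_violations())


asyncio.run(main())
```

Note that ties in `hlc_time` are broken first by the logical counter and then by node ID, which is what yields a total order even when two nodes stamp events in the same millisecond.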
kailash/edge/coordination/leader_election.py

@@ -0,0 +1,217 @@
+"""Edge leader election service using Raft consensus."""
+
+import asyncio
+import logging
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional
+
+from .raft import RaftNode, RaftState
+
+
+class EdgeLeaderElection:
+    """Leader election service for edge nodes using Raft consensus.
+
+    This service manages leader election across edge nodes, providing:
+    - Automatic leader election on startup
+    - Leader failure detection and re-election
+    - Stable leader information for coordination
+    - Network partition handling
+    """
+
+    def __init__(self, raft_nodes: Dict[str, RaftNode]):
+        """Initialize leader election service.
+
+        Args:
+            raft_nodes: Dictionary of node_id -> RaftNode instances
+        """
+        self.raft_nodes = raft_nodes
+        self.current_leader: Optional[str] = None
+        self.current_term: int = 0
+        self.last_leader_change = datetime.now()
+        self.stability_threshold = timedelta(seconds=5)
+        self.logger = logging.getLogger("EdgeLeaderElection")
+
+        # Election monitoring
+        self._monitor_task: Optional[asyncio.Task] = None
+        self._running = False
+
+    async def start(self):
+        """Start leader election monitoring."""
+        self._running = True
+        self._monitor_task = asyncio.create_task(self._monitor_leadership())
+        self.logger.info("Leader election service started")
+
+    async def stop(self):
+        """Stop leader election monitoring."""
+        self._running = False
+        if self._monitor_task:
+            self._monitor_task.cancel()
+            try:
+                await self._monitor_task
+            except asyncio.CancelledError:
+                pass
+        self.logger.info("Leader election service stopped")
+
+    async def start_election(self) -> Dict[str, Any]:
+        """Start a new leader election.
+
+        Returns:
+            Dict with election results including leader and term
+        """
+        self.logger.info("Starting new leader election")
+
+        # Find a candidate node to trigger election
+        candidate_nodes = [
+            node for node in self.raft_nodes.values() if node.state != RaftState.LEADER
+        ]
+
+        if not candidate_nodes:
+            # Current leader still active
+            return self.get_current_leader()
+
+        # Trigger election on first non-leader node
+        candidate = candidate_nodes[0]
+        candidate._become_candidate()
+        await candidate._collect_votes()
+
+        # Wait briefly for election to complete
+        await asyncio.sleep(0.1)
+
+        # Update and return leader info
+        self._update_leader_info()
+        return self.get_current_leader()
+
+    def get_current_leader(self) -> Dict[str, Any]:
+        """Get current leader information.
+
+        Returns:
+            Dict with leader ID, term, and stability status
+        """
+        self._update_leader_info()
+
+        stable = False
+        if self.current_leader:
+            time_since_change = datetime.now() - self.last_leader_change
+            stable = time_since_change > self.stability_threshold
+
+        return {
+            "leader": self.current_leader,
+            "term": self.current_term,
+            "stable": stable,
+            "time_since_change": (
+                datetime.now() - self.last_leader_change
+            ).total_seconds(),
+        }
+
+    def force_election(self) -> None:
+        """Force a new election by demoting current leader."""
+        for node_id, node in self.raft_nodes.items():
+            if node.state == RaftState.LEADER:
+                node._become_follower()
+                self.logger.info(f"Forced leader {node_id} to step down")
+                break
+
+    async def wait_for_stable_leader(self, timeout: float = 10.0) -> Dict[str, Any]:
+        """Wait for a stable leader to be elected.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            Leader information once stable
+
+        Raises:
+            TimeoutError: If no stable leader within timeout
+        """
+        start_time = datetime.now()
+
+        while (datetime.now() - start_time).total_seconds() < timeout:
+            leader_info = self.get_current_leader()
+
+            if leader_info["leader"] and leader_info["stable"]:
+                return leader_info
+
+            await asyncio.sleep(0.1)
+
+        raise TimeoutError(f"No stable leader elected within {timeout} seconds")
+
+    def _update_leader_info(self):
+        """Update current leader information from Raft nodes."""
+        new_leader = None
+        new_term = 0
+
+        for node_id, node in self.raft_nodes.items():
+            if node.state == RaftState.LEADER:
+                new_leader = node_id
+                new_term = node.current_term
+                break
+
+        # Check if leader changed
+        if new_leader != self.current_leader or new_term != self.current_term:
+            self.current_leader = new_leader
+            self.current_term = new_term
+            self.last_leader_change = datetime.now()
+
+            if new_leader:
+                self.logger.info(f"New leader elected: {new_leader} (term {new_term})")
+            else:
+                self.logger.warning("No leader - cluster in election")
+
+    async def _monitor_leadership(self):
+        """Background task to monitor leadership stability."""
+        while self._running:
+            try:
+                self._update_leader_info()
+
+                # Check if we need to trigger election
+                leader_info = self.get_current_leader()
+                if not leader_info["leader"]:
+                    # No leader for too long
+                    time_without_leader = (
+                        datetime.now() - self.last_leader_change
+                    ).total_seconds()
+                    if time_without_leader > 2.0:  # 2 seconds without leader
+                        self.logger.warning(
+                            "No leader for 2 seconds, triggering election"
+                        )
+                        await self.start_election()
+
+                await asyncio.sleep(0.5)  # Check every 500ms
+
+            except Exception as e:
+                self.logger.error(f"Leadership monitor error: {e}")
+
+    def get_cluster_health(self) -> Dict[str, Any]:
+        """Get health information about the cluster.
+
+        Returns:
+            Dict with cluster health metrics
+        """
+        total_nodes = len(self.raft_nodes)
+        leader_count = sum(
+            1 for node in self.raft_nodes.values() if node.state == RaftState.LEADER
+        )
+        follower_count = sum(
+            1 for node in self.raft_nodes.values() if node.state == RaftState.FOLLOWER
+        )
+        candidate_count = sum(
+            1 for node in self.raft_nodes.values() if node.state == RaftState.CANDIDATE
+        )
+
+        # Check for split brain
+        split_brain = leader_count > 1
+
+        # Check for partitions
+        has_quorum = (follower_count + leader_count) > total_nodes // 2
+
+        return {
+            "total_nodes": total_nodes,
+            "leader_count": leader_count,
+            "follower_count": follower_count,
+            "candidate_count": candidate_count,
+            "split_brain": split_brain,
+            "has_quorum": has_quorum,
+            "current_leader": self.current_leader,
+            "current_term": self.current_term,
+            "healthy": leader_count == 1 and has_quorum and not split_brain,
+        }
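Again as orientation only, a sketch of how this service might be driven. Constructing `RaftNode` instances requires kailash/edge/coordination/raft.py, which is not shown in this excerpt, so the `raft_nodes` mapping is assumed to be built elsewhere; everything else uses only the API shown in the diff above.

```python
import asyncio
from typing import Any, Dict

from kailash.edge.coordination.leader_election import EdgeLeaderElection


async def coordinate(raft_nodes: Dict[str, Any]) -> None:
    # raft_nodes maps node_id -> RaftNode; construction is out of scope here.
    election = EdgeLeaderElection(raft_nodes)
    await election.start()  # launches the 500 ms leadership monitor
    try:
        # Block until one node has held leadership longer than the 5 s
        # stability threshold, or raise TimeoutError after 10 s.
        leader_info = await election.wait_for_stable_leader(timeout=10.0)
        print(leader_info["leader"], leader_info["term"])

        # Healthy means exactly one leader, a quorum, and no split brain.
        health = election.get_cluster_health()
        print(health["healthy"], health["has_quorum"])
    finally:
        await election.stop()
```

The monitor task itself re-triggers an election after two seconds without a leader, so `wait_for_stable_leader` is mainly useful at startup or after calling `force_election`.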