kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -7
- kailash/cli/__init__.py +11 -1
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/circuit_breaker.py +71 -1
- kailash/core/resilience/health_monitor.py +172 -0
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/base.py +545 -0
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +392 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/ports.py +676 -0
- kailash/runtime/local.py +344 -1
- kailash/runtime/validation/__init__.py +20 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +234 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/migration.py +3 -3
- kailash/workflow/type_inference.py +669 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,296 @@
|
|
1
|
+
"""Network partition detection for edge coordination."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import logging
|
5
|
+
from collections import defaultdict
|
6
|
+
from datetime import datetime, timedelta
|
7
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
8
|
+
|
9
|
+
|
10
|
+
class PartitionDetector:
    """Detects network partitions in distributed edge systems.

    Uses heartbeat monitoring and cluster state analysis to detect:
    - Network partitions (split-brain scenarios)
    - Node failures
    - Connectivity issues
    - Quorum status

    Liveness is derived from the timestamps stored by ``record_heartbeat``;
    the view of what *other* nodes can reach comes from
    ``update_peer_connections``.

    NOTE(review): timestamps use naive ``datetime.now()`` (wall clock), so a
    system clock adjustment can make peers look alive or dead for one
    threshold period. The attribute types are kept as-is for compatibility.
    """

    def __init__(
        self,
        node_id: str,
        peers: List[str],
        heartbeat_interval_ms: int = 100,
        failure_threshold_ms: int = 500,
    ):
        """Initialize partition detector.

        Args:
            node_id: This node's identifier
            peers: List of peer node IDs
            heartbeat_interval_ms: Heartbeat interval in milliseconds; also
                used as the polling period of the background monitor
            failure_threshold_ms: Time without heartbeat to consider failure
        """
        self.node_id = node_id
        self.peers = set(peers)
        # Quorum math adds "self" separately, so this node must not also be
        # counted as a peer when callers pass the full membership list.
        self.peers.discard(node_id)
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.failure_threshold_ms = failure_threshold_ms

        # Heartbeat tracking
        self.last_heartbeats: Dict[str, datetime] = {}
        self.peer_connections: Dict[str, Set[str]] = defaultdict(set)
        self.my_connections: Set[str] = set()

        # Partition state
        self.current_partition: Optional[Set[str]] = None
        self.partition_start_time: Optional[datetime] = None
        self.partition_history: List[Dict[str, Any]] = []

        # Monitoring
        self._monitor_task: Optional[asyncio.Task] = None
        self._running = False
        self.logger = logging.getLogger(f"PartitionDetector[{node_id}]")

    async def start(self):
        """Start partition detection.

        Calling this while a monitor task is already active is a no-op, so a
        second call cannot orphan the first task.
        """
        if self._monitor_task is not None and not self._monitor_task.done():
            return

        self._running = True
        self._monitor_task = asyncio.create_task(self._monitor_partitions())
        self.logger.info("Partition detector started")

    async def stop(self):
        """Stop partition detection and wait for the monitor task to finish."""
        self._running = False
        # Clear the handle first so a later start() always creates a new task.
        task, self._monitor_task = self._monitor_task, None
        if task:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        self.logger.info("Partition detector stopped")

    def record_heartbeat(self, from_node: str):
        """Record heartbeat from a peer node.

        Args:
            from_node: Node ID that sent heartbeat
        """
        self.last_heartbeats[from_node] = datetime.now()
        self.my_connections.add(from_node)

    def update_peer_connections(self, node_id: str, connections: Set[str]):
        """Update connectivity information for a peer.

        Args:
            node_id: Peer node ID
            connections: Set of nodes the peer can reach
        """
        # Store a copy so later mutation of the caller's collection cannot
        # silently change the detector's view of the cluster.
        self.peer_connections[node_id] = set(connections)

    def get_partition_status(self) -> Dict[str, Any]:
        """Get current partition status.

        Returns:
            Dict with keys ``is_partitioned``, ``has_quorum``,
            ``reachable_nodes``, ``total_nodes``, ``active_peers``,
            ``unreachable_peers``, ``partition_groups``,
            ``current_partition`` and ``partition_duration``.
        """
        active_peers = self._get_active_peers(datetime.now())

        # Both counts include this node itself.
        total_nodes = len(self.peers) + 1
        reachable_nodes = len(active_peers) + 1

        return {
            "is_partitioned": self._detect_partition(active_peers),
            # Strict majority of the full membership.
            "has_quorum": reachable_nodes > total_nodes // 2,
            "reachable_nodes": reachable_nodes,
            "total_nodes": total_nodes,
            "active_peers": list(active_peers),
            "unreachable_peers": list(self.peers - active_peers),
            "partition_groups": self._identify_partition_groups(active_peers),
            "current_partition": (
                list(self.current_partition) if self.current_partition else None
            ),
            "partition_duration": self._get_partition_duration(),
        }

    def _get_active_peers(self, now: datetime) -> Set[str]:
        """Get set of currently active peers.

        A peer is active when its last recorded heartbeat is strictly newer
        than ``failure_threshold_ms``. Peers that never sent a heartbeat are
        not active.

        Args:
            now: Current time

        Returns:
            Set of active peer IDs
        """
        threshold = timedelta(milliseconds=self.failure_threshold_ms)
        active = set()

        for peer in self.peers:
            last_seen = self.last_heartbeats.get(peer)
            if last_seen is not None and now - last_seen < threshold:
                active.add(peer)

        return active

    def _detect_partition(self, active_peers: Set[str]) -> bool:
        """Detect if network is partitioned.

        Simple detection: a partition is reported when this node cannot reach
        all peers while at least one unreachable peer has reported that it can
        reach another unreachable peer (i.e. they form their own island).

        Args:
            active_peers: Set of active peer IDs

        Returns:
            True if partition detected
        """
        unreachable = self.peers - active_peers

        for peer in unreachable:
            # .get() avoids creating empty defaultdict entries as a side effect.
            reported = self.peer_connections.get(peer)
            # A peer listing itself must not count as reaching "another" node.
            if reported and reported & (unreachable - {peer}):
                return True

        return False

    def _identify_partition_groups(self, active_peers: Set[str]) -> List[Set[str]]:
        """Identify partition groups in the network.

        Builds an undirected connectivity graph from this node's active peers
        and the connections reported by peers, then returns its connected
        components. Links are treated as symmetric so that the grouping does
        not depend on set iteration order.

        Args:
            active_peers: Set of active peer IDs

        Returns:
            List of partition groups (sets of node IDs). The group containing
            this node comes first; the order of the remaining groups is
            unspecified.
        """
        graph: Dict[str, Set[str]] = defaultdict(set)

        def link(a: str, b: str) -> None:
            graph[a].add(b)
            graph[b].add(a)

        # Add self connections
        for peer in active_peers:
            link(self.node_id, peer)

        # Add peer connections
        for peer, connections in self.peer_connections.items():
            for other in connections:
                link(peer, other)

        visited: Set[str] = set()
        groups: List[Set[str]] = []

        # Iterative DFS: no recursion-depth limit on large clusters.
        for start in [self.node_id, *self.peers]:
            if start in visited:
                continue

            group: Set[str] = set()
            stack = [start]
            while stack:
                node = stack.pop()
                if node in visited:
                    continue
                visited.add(node)
                group.add(node)
                stack.extend(n for n in graph.get(node, ()) if n not in visited)

            groups.append(group)

        return groups

    def _get_partition_duration(self) -> Optional[float]:
        """Get duration of current partition in seconds.

        Returns:
            Duration in seconds or None if not partitioned
        """
        if self.partition_start_time is None:
            return None
        return (datetime.now() - self.partition_start_time).total_seconds()

    def _apply_partition_status(self, status: Dict[str, Any]) -> None:
        """Update partition state and history from one status snapshot."""
        if status["is_partitioned"] and not self.current_partition:
            # New partition detected
            self.current_partition = set(status["active_peers"])
            self.current_partition.add(self.node_id)
            self.partition_start_time = datetime.now()

            self.logger.warning(
                "Network partition detected! In partition with: %s",
                self.current_partition,
            )

            # Record in history
            self.partition_history.append(
                {
                    "detected_at": self.partition_start_time,
                    "partition": list(self.current_partition),
                    "groups": status["partition_groups"],
                }
            )

        elif not status["is_partitioned"] and self.current_partition:
            # Partition healed
            duration = self._get_partition_duration()
            self.logger.info(
                "Network partition healed after %.2f seconds", duration
            )

            # Update history
            if self.partition_history:
                self.partition_history[-1]["healed_at"] = datetime.now()
                self.partition_history[-1]["duration"] = duration

            self.current_partition = None
            self.partition_start_time = None

    async def _monitor_partitions(self):
        """Background task to monitor for partitions.

        Polls ``get_partition_status`` every ``heartbeat_interval_ms`` until
        ``stop()`` clears the running flag or cancels the task.
        """
        while self._running:
            try:
                self._apply_partition_status(self.get_partition_status())
            except Exception as e:
                self.logger.error("Partition monitor error: %s", e)

            # Sleep outside the try block: a failing iteration must still
            # wait, otherwise the loop would spin and flood the log.
            await asyncio.sleep(self.heartbeat_interval_ms / 1000)

    def should_participate_in_election(self) -> bool:
        """Check if this node should participate in leader election.

        Returns:
            True if node should participate (has quorum)
        """
        return self.get_partition_status()["has_quorum"]

    def get_partition_metrics(self) -> Dict[str, Any]:
        """Get partition detection metrics.

        Returns:
            Dict with partition metrics; the total duration includes the
            still-running partition, if any.
        """
        total_duration = sum(p.get("duration", 0) for p in self.partition_history)

        current_duration = self._get_partition_duration()
        if current_duration is not None:
            total_duration += current_duration

        return {
            "total_partitions": len(self.partition_history),
            "total_partition_duration": total_duration,
            "current_partition_duration": current_duration,
            "partition_history_size": len(self.partition_history),
            "is_currently_partitioned": self.current_partition is not None,
        }
|