kailash 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
- kailash/__init__.py +1 -1
- kailash/client/__init__.py +12 -0
- kailash/client/enhanced_client.py +306 -0
- kailash/core/actors/__init__.py +16 -0
- kailash/core/actors/connection_actor.py +566 -0
- kailash/core/actors/supervisor.py +364 -0
- kailash/edge/__init__.py +16 -0
- kailash/edge/compliance.py +834 -0
- kailash/edge/discovery.py +659 -0
- kailash/edge/location.py +582 -0
- kailash/gateway/__init__.py +33 -0
- kailash/gateway/api.py +289 -0
- kailash/gateway/enhanced_gateway.py +357 -0
- kailash/gateway/resource_resolver.py +217 -0
- kailash/gateway/security.py +227 -0
- kailash/middleware/auth/models.py +2 -2
- kailash/middleware/database/base_models.py +1 -7
- kailash/middleware/gateway/__init__.py +22 -0
- kailash/middleware/gateway/checkpoint_manager.py +398 -0
- kailash/middleware/gateway/deduplicator.py +382 -0
- kailash/middleware/gateway/durable_gateway.py +417 -0
- kailash/middleware/gateway/durable_request.py +498 -0
- kailash/middleware/gateway/event_store.py +459 -0
- kailash/nodes/admin/permission_check.py +817 -33
- kailash/nodes/admin/role_management.py +1242 -108
- kailash/nodes/admin/schema_manager.py +438 -0
- kailash/nodes/admin/user_management.py +1124 -1582
- kailash/nodes/code/__init__.py +8 -1
- kailash/nodes/code/async_python.py +1035 -0
- kailash/nodes/code/python.py +1 -0
- kailash/nodes/data/async_sql.py +9 -3
- kailash/nodes/data/sql.py +20 -11
- kailash/nodes/data/workflow_connection_pool.py +643 -0
- kailash/nodes/rag/__init__.py +1 -4
- kailash/resources/__init__.py +40 -0
- kailash/resources/factory.py +533 -0
- kailash/resources/health.py +319 -0
- kailash/resources/reference.py +288 -0
- kailash/resources/registry.py +392 -0
- kailash/runtime/async_local.py +711 -302
- kailash/testing/__init__.py +34 -0
- kailash/testing/async_test_case.py +353 -0
- kailash/testing/async_utils.py +345 -0
- kailash/testing/fixtures.py +458 -0
- kailash/testing/mock_registry.py +495 -0
- kailash/workflow/__init__.py +8 -0
- kailash/workflow/async_builder.py +621 -0
- kailash/workflow/async_patterns.py +766 -0
- kailash/workflow/cyclic_runner.py +107 -16
- kailash/workflow/graph.py +7 -2
- kailash/workflow/resilience.py +11 -1
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/METADATA +7 -4
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/RECORD +57 -22
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/WHEEL +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/top_level.txt +0 -0
kailash/core/actors/supervisor.py
ADDED
@@ -0,0 +1,364 @@
"""Actor supervision for fault tolerance.

This module implements supervision strategies for managing actor lifecycles
and handling failures gracefully.
"""

import asyncio
import logging
from datetime import UTC, datetime, timedelta
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

from .connection_actor import ActorConnection, ConnectionState

logger = logging.getLogger(__name__)


class SupervisionStrategy(Enum):
    """Supervision strategies for handling actor failures."""

    ONE_FOR_ONE = "one_for_one"  # Restart only the failed actor
    ONE_FOR_ALL = "one_for_all"  # Restart all actors on any failure
    REST_FOR_ONE = "rest_for_one"  # Restart failed actor and all after it


class RestartDecision(Enum):
    """Decision on whether to restart a failed actor."""

    RESTART = "restart"
    STOP = "stop"
    ESCALATE = "escalate"


class ActorSupervisor:
    """
    Supervises a group of actors, handling failures and restarts.

    Inspired by Erlang/OTP supervision trees, this class manages
    actor lifecycles and implements various restart strategies.
    """

    def __init__(
        self,
        name: str,
        strategy: SupervisionStrategy = SupervisionStrategy.ONE_FOR_ONE,
        max_restarts: int = 3,
        restart_window: float = 60.0,
        restart_delay: float = 1.0,
    ):
        """
        Initialize actor supervisor.

        Args:
            name: Supervisor name
            strategy: Supervision strategy to use
            max_restarts: Maximum restarts within window
            restart_window: Time window for restart counting (seconds)
            restart_delay: Delay between restarts (seconds)
        """
        self.name = name
        self.strategy = strategy
        self.max_restarts = max_restarts
        self.restart_window = restart_window
        self.restart_delay = restart_delay

        # Supervised actors
        self.actors: Dict[str, ActorConnection] = {}
        self.actor_order: List[str] = []  # For REST_FOR_ONE strategy

        # Restart tracking
        self.restart_counts: Dict[str, List[datetime]] = {}

        # Callbacks
        self.on_actor_failure: Optional[Callable[[str, Exception], None]] = None
        self.on_actor_restart: Optional[Callable[[str, int], None]] = None
        self.on_supervisor_failure: Optional[Callable[[Exception], None]] = None

        # Supervisor state
        self._running = False
        self._monitor_task = None

    async def start(self):
        """Start the supervisor and all actors."""
        self._running = True

        # Start all actors
        for actor_id in self.actor_order:
            actor = self.actors[actor_id]
            actor.supervisor = self
            await self._start_actor(actor)

        # Start monitoring
        self._monitor_task = asyncio.create_task(self._monitor_actors())

        logger.info(f"Supervisor {self.name} started with {len(self.actors)} actors")

    async def stop(self):
        """Stop the supervisor and all actors."""
        self._running = False

        # Cancel monitoring
        if self._monitor_task:
            self._monitor_task.cancel()
            try:
                await self._monitor_task
            except asyncio.CancelledError:
                pass

        # Stop all actors
        for actor in self.actors.values():
            await actor.stop()

        logger.info(f"Supervisor {self.name} stopped")

    def add_actor(self, actor: ActorConnection):
        """
        Add an actor to supervision.

        Args:
            actor: Actor to supervise
        """
        self.actors[actor.id] = actor
        self.actor_order.append(actor.id)
        self.restart_counts[actor.id] = []
        actor.supervisor = self

        # Start actor if supervisor is running
        if self._running:
            asyncio.create_task(self._start_actor(actor))

    def remove_actor(self, actor_id: str):
        """
        Remove an actor from supervision.

        Args:
            actor_id: ID of actor to remove
        """
        if actor_id in self.actors:
            actor = self.actors[actor_id]
            asyncio.create_task(actor.stop())

            del self.actors[actor_id]
            self.actor_order.remove(actor_id)
            del self.restart_counts[actor_id]

    async def notify_failure(self, actor_id: str, error: Optional[Exception] = None):
        """
        Notify supervisor of actor failure.

        Args:
            actor_id: ID of failed actor
            error: Exception that caused failure
        """
        logger.warning(f"Actor {actor_id} failed: {error}")

        # Callback
        if self.on_actor_failure:
            self.on_actor_failure(actor_id, error)

        # Decide on restart
        decision = self._decide_restart(actor_id)

        if decision == RestartDecision.RESTART:
            await self._handle_restart(actor_id)
        elif decision == RestartDecision.STOP:
            self.remove_actor(actor_id)
        elif decision == RestartDecision.ESCALATE:
            await self._escalate_failure(error)

    async def notify_recycling(self, actor_id: str):
        """
        Notify supervisor that actor is recycling.

        Args:
            actor_id: ID of recycling actor
        """
        logger.info(f"Actor {actor_id} is recycling")

        # Create replacement actor
        if actor_id in self.actors:
            old_actor = self.actors[actor_id]

            # Create new actor with same config
            new_actor = ActorConnection(
                connection_id=f"{actor_id}_new",
                db_config=old_actor.db_config,
                health_check_query=old_actor.health_check_query,
                health_check_interval=old_actor.health_check_interval,
                max_lifetime=old_actor.max_lifetime,
                max_idle_time=old_actor.max_idle_time,
            )

            # Start new actor
            await self._start_actor(new_actor)

            # Swap actors
            await self._swap_actors(actor_id, new_actor)

    async def _monitor_actors(self):
        """Monitor actor health periodically."""
        while self._running:
            try:
                await asyncio.sleep(10)  # Check every 10 seconds

                for actor_id, actor in list(self.actors.items()):
                    if actor.state == ConnectionState.FAILED:
                        await self.notify_failure(actor_id)
                    elif actor.state == ConnectionState.TERMINATED:
                        # Actor stopped unexpectedly
                        await self.notify_failure(
                            actor_id, RuntimeError("Actor terminated unexpectedly")
                        )

            except Exception as e:
                logger.error(f"Monitor error in supervisor {self.name}: {e}")

    async def _start_actor(self, actor: ActorConnection):
        """Start an actor with error handling."""
        try:
            await actor.start()
        except Exception as e:
            logger.error(f"Failed to start actor {actor.id}: {e}")
            await self.notify_failure(actor.id, e)

    def _decide_restart(self, actor_id: str) -> RestartDecision:
        """Decide whether to restart a failed actor."""
        # Check restart count within window
        now = datetime.now(UTC)
        window_start = now - timedelta(seconds=self.restart_window)

        # Filter restarts within window
        recent_restarts = [
            ts for ts in self.restart_counts[actor_id] if ts > window_start
        ]

        if len(recent_restarts) >= self.max_restarts:
            logger.error(
                f"Actor {actor_id} exceeded max restarts "
                f"({self.max_restarts} in {self.restart_window}s)"
            )
            return RestartDecision.ESCALATE

        return RestartDecision.RESTART

    async def _handle_restart(self, actor_id: str):
        """Handle actor restart based on strategy."""
        # Record restart
        self.restart_counts[actor_id].append(datetime.now(UTC))

        # Delay before restart
        await asyncio.sleep(self.restart_delay)

        if self.strategy == SupervisionStrategy.ONE_FOR_ONE:
            await self._restart_one(actor_id)
        elif self.strategy == SupervisionStrategy.ONE_FOR_ALL:
            await self._restart_all()
        elif self.strategy == SupervisionStrategy.REST_FOR_ONE:
            await self._restart_rest(actor_id)

        # Callback
        if self.on_actor_restart:
            restart_count = len(self.restart_counts[actor_id])
            self.on_actor_restart(actor_id, restart_count)

    async def _restart_one(self, actor_id: str):
        """Restart a single actor."""
        if actor_id not in self.actors:
            return

        actor = self.actors[actor_id]

        # Stop the failed actor
        await actor.stop()

        # Create new actor with same config
        new_actor = ActorConnection(
            connection_id=actor_id,
            db_config=actor.db_config,
            health_check_query=actor.health_check_query,
            health_check_interval=actor.health_check_interval,
            max_lifetime=actor.max_lifetime,
            max_idle_time=actor.max_idle_time,
        )

        # Replace and start
        self.actors[actor_id] = new_actor
        new_actor.supervisor = self
        await self._start_actor(new_actor)

    async def _restart_all(self):
        """Restart all actors."""
        # Stop all actors
        for actor in self.actors.values():
            await actor.stop()

        # Restart all
        for actor_id in self.actor_order:
            await self._restart_one(actor_id)

    async def _restart_rest(self, failed_actor_id: str):
        """Restart failed actor and all actors after it."""
        if failed_actor_id not in self.actor_order:
            return

        failed_index = self.actor_order.index(failed_actor_id)

        # Restart from failed actor onwards
        for i in range(failed_index, len(self.actor_order)):
            actor_id = self.actor_order[i]
            await self._restart_one(actor_id)

    async def _swap_actors(self, old_id: str, new_actor: ActorConnection):
        """Atomically swap an old actor with a new one."""
        if old_id not in self.actors:
            return

        old_actor = self.actors[old_id]

        # Wait for old actor to drain
        drain_timeout = 30.0
        start_time = asyncio.get_event_loop().time()

        while old_actor.state != ConnectionState.TERMINATED:
            if asyncio.get_event_loop().time() - start_time > drain_timeout:
                logger.warning(f"Timeout draining actor {old_id}, forcing stop")
                break
            await asyncio.sleep(0.1)

        # Stop old actor
        await old_actor.stop()

        # Replace with new actor
        self.actors[old_id] = new_actor
        new_actor.supervisor = self

        logger.info(f"Swapped actor {old_id} with new instance")

    async def _escalate_failure(self, error: Optional[Exception]):
        """Escalate failure to higher level."""
        logger.critical(f"Supervisor {self.name} escalating failure: {error}")

        if self.on_supervisor_failure:
            self.on_supervisor_failure(error)
        else:
            # Default behavior: stop supervisor
            await self.stop()

    def get_stats(self) -> Dict[str, Any]:
        """Get supervisor statistics."""
        stats = {
            "name": self.name,
            "strategy": self.strategy.value,
            "running": self._running,
            "actors": {},
        }

        for actor_id, actor in self.actors.items():
            stats["actors"][actor_id] = {
                "state": actor.state.value,
                "health_score": actor.stats.health_score,
                "restart_count": len(self.restart_counts.get(actor_id, [])),
            }

        return stats
kailash/edge/__init__.py
ADDED
@@ -0,0 +1,16 @@
"""Edge computing infrastructure for global distribution.

This module provides edge computing capabilities for Kailash SDK,
enabling global distribution of compute and data with sub-10ms latency.
"""

from .compliance import ComplianceRouter
from .discovery import EdgeDiscovery, EdgeSelectionStrategy
from .location import EdgeLocation

__all__ = [
    "EdgeLocation",
    "EdgeDiscovery",
    "EdgeSelectionStrategy",
    "ComplianceRouter",
]
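These re-exports define the public surface of the new edge package; a minimal import sketch is below. The constructors and methods of these classes live in compliance.py, discovery.py, and location.py (listed in the file summary above but not shown in this section), so nothing about their signatures is assumed here.

```python
# Only the names re-exported by kailash/edge/__init__.py are used; constructor
# signatures are not visible in this diff, so none are assumed.
from kailash.edge import (
    ComplianceRouter,
    EdgeDiscovery,
    EdgeLocation,
    EdgeSelectionStrategy,
)
```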