kailash 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/client/__init__.py +12 -0
- kailash/client/enhanced_client.py +306 -0
- kailash/core/actors/__init__.py +16 -0
- kailash/core/actors/connection_actor.py +566 -0
- kailash/core/actors/supervisor.py +364 -0
- kailash/edge/__init__.py +16 -0
- kailash/edge/compliance.py +834 -0
- kailash/edge/discovery.py +659 -0
- kailash/edge/location.py +582 -0
- kailash/gateway/__init__.py +33 -0
- kailash/gateway/api.py +289 -0
- kailash/gateway/enhanced_gateway.py +357 -0
- kailash/gateway/resource_resolver.py +217 -0
- kailash/gateway/security.py +227 -0
- kailash/middleware/auth/models.py +2 -2
- kailash/middleware/database/base_models.py +1 -7
- kailash/middleware/gateway/__init__.py +22 -0
- kailash/middleware/gateway/checkpoint_manager.py +398 -0
- kailash/middleware/gateway/deduplicator.py +382 -0
- kailash/middleware/gateway/durable_gateway.py +417 -0
- kailash/middleware/gateway/durable_request.py +498 -0
- kailash/middleware/gateway/event_store.py +459 -0
- kailash/nodes/admin/permission_check.py +817 -33
- kailash/nodes/admin/role_management.py +1242 -108
- kailash/nodes/admin/schema_manager.py +438 -0
- kailash/nodes/admin/user_management.py +1124 -1582
- kailash/nodes/code/__init__.py +8 -1
- kailash/nodes/code/async_python.py +1035 -0
- kailash/nodes/code/python.py +1 -0
- kailash/nodes/data/async_sql.py +9 -3
- kailash/nodes/data/sql.py +20 -11
- kailash/nodes/data/workflow_connection_pool.py +643 -0
- kailash/nodes/rag/__init__.py +1 -4
- kailash/resources/__init__.py +40 -0
- kailash/resources/factory.py +533 -0
- kailash/resources/health.py +319 -0
- kailash/resources/reference.py +288 -0
- kailash/resources/registry.py +392 -0
- kailash/runtime/async_local.py +711 -302
- kailash/testing/__init__.py +34 -0
- kailash/testing/async_test_case.py +353 -0
- kailash/testing/async_utils.py +345 -0
- kailash/testing/fixtures.py +458 -0
- kailash/testing/mock_registry.py +495 -0
- kailash/workflow/__init__.py +8 -0
- kailash/workflow/async_builder.py +621 -0
- kailash/workflow/async_patterns.py +766 -0
- kailash/workflow/cyclic_runner.py +107 -16
- kailash/workflow/graph.py +7 -2
- kailash/workflow/resilience.py +11 -1
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/METADATA +7 -4
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/RECORD +57 -22
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/WHEEL +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.5.0.dist-info → kailash-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,398 @@
|
|
1
|
+
"""Checkpoint management with tiered storage and compression.
|
2
|
+
|
3
|
+
This module provides:
|
4
|
+
- Checkpoint creation and restoration
|
5
|
+
- Tiered storage (memory/disk/cloud)
|
6
|
+
- Automatic compression
|
7
|
+
- Garbage collection
|
8
|
+
"""
|
9
|
+
|
10
|
+
import asyncio
|
11
|
+
import datetime as dt
|
12
|
+
import gzip
|
13
|
+
import json
|
14
|
+
import logging
|
15
|
+
import os
|
16
|
+
import time
|
17
|
+
from collections import OrderedDict
|
18
|
+
from datetime import datetime, timedelta
|
19
|
+
from pathlib import Path
|
20
|
+
from typing import Any, Dict, List, Optional, Protocol
|
21
|
+
|
22
|
+
from .durable_request import Checkpoint
|
23
|
+
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
class StorageBackend(Protocol):
    """Structural interface for checkpoint storage backends.

    Implemented by MemoryStorage and DiskStorage in this module; callers may
    also supply a cloud backend, which only needs to match this protocol.
    All methods are coroutines and operate on opaque byte payloads.
    """

    async def save(self, key: str, data: bytes) -> None:
        """Persist *data* under *key*, overwriting any existing entry."""
        ...

    async def load(self, key: str) -> Optional[bytes]:
        """Return the bytes stored under *key*, or ``None`` if absent."""
        ...

    async def delete(self, key: str) -> None:
        """Remove *key* from storage; deleting a missing key is not an error."""
        ...

    async def list_keys(self, prefix: str) -> List[str]:
        """Return all stored keys that start with *prefix*."""
        ...
|
45
|
+
|
46
|
+
|
47
|
+
class MemoryStorage:
    """In-memory storage backend with LRU eviction.

    Entries live in an ``OrderedDict`` ordered from least- to most-recently
    used; total payload size is capped at ``max_size_mb`` and the oldest
    entries are evicted to make room for new writes.
    """

    def __init__(self, max_size_mb: int = 100):
        self.max_size_bytes = max_size_mb * 1024 * 1024
        self.data: OrderedDict[str, bytes] = OrderedDict()
        self.current_size = 0
        self._lock = asyncio.Lock()

    async def save(self, key: str, data: bytes) -> None:
        """Store *data* under *key*, evicting LRU entries until it fits."""
        async with self._lock:
            # Drop any existing value so the key lands at the MRU position.
            previous = self.data.pop(key, None)
            if previous is not None:
                self.current_size -= len(previous)

            # Evict from the LRU end until the new payload fits (or the
            # store is empty).
            while self.data and self.current_size + len(data) > self.max_size_bytes:
                old_key, old_value = self.data.popitem(last=False)
                self.current_size -= len(old_value)
                logger.debug(f"Evicted {old_key} from memory storage")

            self.data[key] = data
            self.current_size += len(data)

    async def load(self, key: str) -> Optional[bytes]:
        """Return the value for *key* (marking it most-recently used), or None."""
        async with self._lock:
            value = self.data.get(key)
            if value is not None:
                self.data.move_to_end(key)
            return value

    async def delete(self, key: str) -> None:
        """Remove *key* if present; missing keys are silently ignored."""
        async with self._lock:
            removed = self.data.pop(key, None)
            if removed is not None:
                self.current_size -= len(removed)

    async def list_keys(self, prefix: str) -> List[str]:
        """Return all stored keys beginning with *prefix*."""
        async with self._lock:
            return [candidate for candidate in self.data if candidate.startswith(prefix)]
|
94
|
+
|
95
|
+
|
96
|
+
class DiskStorage:
    """Disk-based storage backend.

    Each checkpoint is written to its own ``.ckpt`` file under *base_path*,
    sharded into a subdirectory named after the first ``_``-separated
    component of the key to avoid very large flat directories.  Writes are
    atomic: data goes to a temp file which then replaces the target.
    """

    def __init__(self, base_path: str = "/tmp/kailash_checkpoints"):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()

    def _get_path(self, key: str) -> Path:
        """Get file path for key."""
        # Use subdirectories to avoid too many files in one directory.
        parts = key.split("_")
        if len(parts) >= 2:
            subdir = self.base_path / parts[0]
            subdir.mkdir(exist_ok=True)
            return subdir / f"{key}.ckpt"
        return self.base_path / f"{key}.ckpt"

    async def save(self, key: str, data: bytes) -> None:
        """Save *data* to disk atomically (temp file + replace).

        Raises:
            Exception: re-raises any filesystem error after cleaning up the
                temporary file.
        """
        path = self._get_path(key)
        temp_path = path.with_suffix(".tmp")

        def _write() -> None:
            temp_path.write_bytes(data)
            # Path.replace is an atomic overwrite on POSIX and — unlike
            # Path.rename — also succeeds on Windows when *path* already
            # exists, so re-saving a checkpoint never fails there.
            temp_path.replace(path)

        try:
            # One executor hop for write + replace keeps the pair closer to
            # atomic than two separate awaits.
            await asyncio.get_event_loop().run_in_executor(None, _write)
        except Exception as e:
            logger.error(f"Failed to save checkpoint to disk: {e}")
            if temp_path.exists():
                temp_path.unlink()
            raise

    async def load(self, key: str) -> Optional[bytes]:
        """Load *key* from disk; returns None if missing or unreadable."""
        path = self._get_path(key)

        if not path.exists():
            return None

        try:
            return await asyncio.get_event_loop().run_in_executor(None, path.read_bytes)
        except Exception as e:
            logger.error(f"Failed to load checkpoint from disk: {e}")
            return None

    async def delete(self, key: str) -> None:
        """Delete *key*'s file if it exists."""
        path = self._get_path(key)

        if path.exists():
            await asyncio.get_event_loop().run_in_executor(None, path.unlink)

    async def list_keys(self, prefix: str) -> List[str]:
        """Return all stored keys starting with *prefix*.

        NOTE: scans the directory tree synchronously on the event loop;
        acceptable for modest checkpoint counts.
        """
        keys = []

        for path in self.base_path.rglob("*.ckpt"):
            key = path.stem
            if key.startswith(prefix):
                keys.append(key)

        return keys
|
165
|
+
|
166
|
+
|
167
|
+
class CheckpointManager:
    """Manages checkpoints with tiered storage and compression.

    Writes go to memory (fast) and disk (durable) synchronously, and to an
    optional cloud backend fire-and-forget.  Reads try memory, then disk,
    then cloud, promoting hits back into the faster tiers.  A background
    task garbage-collects checkpoints older than ``retention_hours``.
    """

    def __init__(
        self,
        memory_storage: Optional[MemoryStorage] = None,
        disk_storage: Optional[DiskStorage] = None,
        cloud_storage: Optional[StorageBackend] = None,
        compression_enabled: bool = True,
        compression_threshold_bytes: int = 1024,  # 1KB
        retention_hours: int = 24,
    ):
        """Initialize checkpoint manager."""
        self.memory_storage = memory_storage or MemoryStorage()
        self.disk_storage = disk_storage or DiskStorage()
        self.cloud_storage = cloud_storage  # Optional cloud backend
        self.compression_enabled = compression_enabled
        self.compression_threshold = compression_threshold_bytes
        self.retention_hours = retention_hours

        # Metrics
        self.save_count = 0
        self.load_count = 0
        self.compression_ratio_sum = 0.0

        # Start garbage collection task
        # NOTE(review): asyncio.create_task requires a *running* event loop,
        # so this constructor can only be called from async context — confirm
        # against callers.
        self._gc_task = asyncio.create_task(self._garbage_collection_loop())

    async def save_checkpoint(self, checkpoint: Checkpoint) -> None:
        """Serialize, optionally compress, and save *checkpoint* to all tiers."""
        start_time = time.time()

        # Serialize checkpoint
        data = json.dumps(checkpoint.to_dict()).encode("utf-8")
        original_size = len(data)

        # Compress if enabled and beneficial (only kept when actually smaller)
        compression_ratio = 1.0  # Default to no compression
        if self.compression_enabled and original_size > self.compression_threshold:
            compressed = gzip.compress(data, compresslevel=6)
            if len(compressed) < original_size:
                data = compressed
                compression_ratio = len(compressed) / original_size
                logger.debug(
                    f"Compressed checkpoint {checkpoint.checkpoint_id}: "
                    f"{original_size} -> {len(data)} bytes ({compression_ratio:.2f})"
                )

        # Always update compression ratio sum (1.0 when uncompressed) so the
        # average in get_stats() stays meaningful.
        self.compression_ratio_sum += compression_ratio

        # Save to tiered storage
        key = checkpoint.checkpoint_id

        # Always save to memory for fast access
        await self.memory_storage.save(key, data)

        # Save to disk for durability
        await self.disk_storage.save(key, data)

        # Save to cloud if available (async, don't wait)
        # NOTE(review): the task handle is dropped; failures are only logged
        # inside _save_to_cloud, never surfaced to the caller.
        if self.cloud_storage:
            asyncio.create_task(self._save_to_cloud(key, data))

        self.save_count += 1
        duration_ms = (time.time() - start_time) * 1000

        logger.info(
            f"Saved checkpoint {checkpoint.checkpoint_id} "
            f"({len(data)} bytes) in {duration_ms:.1f}ms"
        )

    async def load_checkpoint(self, checkpoint_id: str) -> Optional[Checkpoint]:
        """Load a checkpoint, trying memory, then disk, then cloud.

        Hits from a slower tier are promoted to the faster tiers.
        Returns None when not found or when deserialization fails.
        """
        start_time = time.time()

        # Try memory first (fastest)
        data = await self.memory_storage.load(checkpoint_id)
        source = "memory"

        # Try disk if not in memory
        # NOTE(review): `if not data` also treats an empty payload (b"") as
        # missing — benign here since serialized checkpoints are never empty.
        if not data:
            data = await self.disk_storage.load(checkpoint_id)
            source = "disk"

            # Promote to memory if found
            if data:
                await self.memory_storage.save(checkpoint_id, data)

        # Try cloud as last resort
        if not data and self.cloud_storage:
            data = await self.cloud_storage.load(checkpoint_id)
            source = "cloud"

            # Promote to memory and disk if found
            if data:
                await self.memory_storage.save(checkpoint_id, data)
                await self.disk_storage.save(checkpoint_id, data)

        if not data:
            logger.warning(f"Checkpoint {checkpoint_id} not found")
            return None

        # Decompress if needed
        try:
            # Try to decompress first; uncompressed payloads fail the gzip
            # header check and fall through unchanged.
            decompressed = gzip.decompress(data)
            data = decompressed
        except:
            # Not compressed or decompression failed
            # NOTE(review): bare except also hides real corruption; the
            # subsequent json.loads is the effective integrity check.
            pass

        # Deserialize
        try:
            checkpoint_dict = json.loads(data.decode("utf-8"))
            checkpoint = Checkpoint.from_dict(checkpoint_dict)

            self.load_count += 1
            duration_ms = (time.time() - start_time) * 1000

            logger.info(
                f"Loaded checkpoint {checkpoint_id} from {source} "
                f"in {duration_ms:.1f}ms"
            )

            return checkpoint

        except Exception as e:
            logger.error(f"Failed to deserialize checkpoint {checkpoint_id}: {e}")
            return None

    async def load_latest_checkpoint(self, request_id: str) -> Optional[Checkpoint]:
        """Load the latest checkpoint for a request.

        Gathers keys from every tier, loads each candidate, and returns the
        checkpoint with the highest sequence number (or None).
        """
        # List all checkpoints for request
        # NOTE(review): assumes checkpoint IDs are formatted "ckpt_<request_id>..."
        # — confirm against Checkpoint ID generation in durable_request.
        prefix = f"ckpt_{request_id}"

        # Check all storage tiers
        all_keys = set()
        all_keys.update(await self.memory_storage.list_keys(prefix))
        all_keys.update(await self.disk_storage.list_keys(prefix))
        if self.cloud_storage:
            all_keys.update(await self.cloud_storage.list_keys(prefix))

        if not all_keys:
            return None

        # Load all checkpoints and find latest by sequence
        checkpoints = []
        for key in all_keys:
            checkpoint = await self.load_checkpoint(key)
            # Re-check request_id: prefix matching alone could catch another
            # request whose ID merely starts with this one.
            if checkpoint and checkpoint.request_id == request_id:
                checkpoints.append(checkpoint)

        if not checkpoints:
            return None

        # Return checkpoint with highest sequence number
        return max(checkpoints, key=lambda c: c.sequence)

    async def delete_checkpoint(self, checkpoint_id: str) -> None:
        """Delete checkpoint from all storage tiers."""
        await self.memory_storage.delete(checkpoint_id)
        await self.disk_storage.delete(checkpoint_id)
        if self.cloud_storage:
            await self.cloud_storage.delete(checkpoint_id)

        logger.info(f"Deleted checkpoint {checkpoint_id}")

    async def _save_to_cloud(self, key: str, data: bytes) -> None:
        """Save to cloud storage asynchronously (errors logged, not raised)."""
        try:
            await self.cloud_storage.save(key, data)
            logger.debug(f"Saved checkpoint {key} to cloud storage")
        except Exception as e:
            logger.error(f"Failed to save checkpoint {key} to cloud: {e}")

    async def _garbage_collection_loop(self) -> None:
        """Periodically clean up old checkpoints until cancelled."""
        while True:
            try:
                await asyncio.sleep(3600)  # Run every hour
                await self._garbage_collection()
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive across transient GC failures.
                logger.error(f"Garbage collection error: {e}")

    async def _garbage_collection(self) -> None:
        """Delete checkpoints older than the retention window."""
        # dt.UTC requires Python 3.11+.
        cutoff_time = datetime.now(dt.UTC) - timedelta(hours=self.retention_hours)
        deleted_count = 0

        # Get all checkpoint keys from disk (most complete list)
        all_keys = await self.disk_storage.list_keys("ckpt_")

        for key in all_keys:
            checkpoint = await self.load_checkpoint(key)
            if checkpoint:
                # Handle both timezone-aware and naive datetimes
                checkpoint_time = checkpoint.created_at
                if checkpoint_time.tzinfo is None:
                    # Assume naive datetime is UTC
                    checkpoint_time = checkpoint_time.replace(tzinfo=dt.UTC)

                if checkpoint_time < cutoff_time:
                    await self.delete_checkpoint(key)
                    deleted_count += 1

        if deleted_count > 0:
            logger.info(f"Garbage collection deleted {deleted_count} old checkpoints")

    def get_stats(self) -> Dict[str, Any]:
        """Get checkpoint manager statistics."""
        avg_compression_ratio = (
            self.compression_ratio_sum / self.save_count if self.save_count > 0 else 1.0
        )

        return {
            "save_count": self.save_count,
            "load_count": self.load_count,
            "avg_compression_ratio": avg_compression_ratio,
            "compression_enabled": self.compression_enabled,
            "retention_hours": self.retention_hours,
        }

    async def close(self) -> None:
        """Close checkpoint manager: cancel the GC task and await its exit."""
        self._gc_task.cancel()
        try:
            await self._gc_task
        except asyncio.CancelledError:
            pass
|