kailash 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. kailash/__init__.py +1 -7
  2. kailash/cli/__init__.py +11 -1
  3. kailash/cli/validation_audit.py +570 -0
  4. kailash/core/actors/supervisor.py +1 -1
  5. kailash/core/resilience/circuit_breaker.py +71 -1
  6. kailash/core/resilience/health_monitor.py +172 -0
  7. kailash/edge/compliance.py +33 -0
  8. kailash/edge/consistency.py +609 -0
  9. kailash/edge/coordination/__init__.py +30 -0
  10. kailash/edge/coordination/global_ordering.py +355 -0
  11. kailash/edge/coordination/leader_election.py +217 -0
  12. kailash/edge/coordination/partition_detector.py +296 -0
  13. kailash/edge/coordination/raft.py +485 -0
  14. kailash/edge/discovery.py +63 -1
  15. kailash/edge/migration/__init__.py +19 -0
  16. kailash/edge/migration/edge_migrator.py +832 -0
  17. kailash/edge/monitoring/__init__.py +21 -0
  18. kailash/edge/monitoring/edge_monitor.py +736 -0
  19. kailash/edge/prediction/__init__.py +10 -0
  20. kailash/edge/prediction/predictive_warmer.py +591 -0
  21. kailash/edge/resource/__init__.py +102 -0
  22. kailash/edge/resource/cloud_integration.py +796 -0
  23. kailash/edge/resource/cost_optimizer.py +949 -0
  24. kailash/edge/resource/docker_integration.py +919 -0
  25. kailash/edge/resource/kubernetes_integration.py +893 -0
  26. kailash/edge/resource/platform_integration.py +913 -0
  27. kailash/edge/resource/predictive_scaler.py +959 -0
  28. kailash/edge/resource/resource_analyzer.py +824 -0
  29. kailash/edge/resource/resource_pools.py +610 -0
  30. kailash/integrations/dataflow_edge.py +261 -0
  31. kailash/mcp_server/registry_integration.py +1 -1
  32. kailash/monitoring/__init__.py +18 -0
  33. kailash/monitoring/alerts.py +646 -0
  34. kailash/monitoring/metrics.py +677 -0
  35. kailash/nodes/__init__.py +2 -0
  36. kailash/nodes/ai/semantic_memory.py +2 -2
  37. kailash/nodes/base.py +545 -0
  38. kailash/nodes/edge/__init__.py +36 -0
  39. kailash/nodes/edge/base.py +240 -0
  40. kailash/nodes/edge/cloud_node.py +710 -0
  41. kailash/nodes/edge/coordination.py +239 -0
  42. kailash/nodes/edge/docker_node.py +825 -0
  43. kailash/nodes/edge/edge_data.py +582 -0
  44. kailash/nodes/edge/edge_migration_node.py +392 -0
  45. kailash/nodes/edge/edge_monitoring_node.py +421 -0
  46. kailash/nodes/edge/edge_state.py +673 -0
  47. kailash/nodes/edge/edge_warming_node.py +393 -0
  48. kailash/nodes/edge/kubernetes_node.py +652 -0
  49. kailash/nodes/edge/platform_node.py +766 -0
  50. kailash/nodes/edge/resource_analyzer_node.py +378 -0
  51. kailash/nodes/edge/resource_optimizer_node.py +501 -0
  52. kailash/nodes/edge/resource_scaler_node.py +397 -0
  53. kailash/nodes/ports.py +676 -0
  54. kailash/runtime/local.py +344 -1
  55. kailash/runtime/validation/__init__.py +20 -0
  56. kailash/runtime/validation/connection_context.py +119 -0
  57. kailash/runtime/validation/enhanced_error_formatter.py +202 -0
  58. kailash/runtime/validation/error_categorizer.py +164 -0
  59. kailash/runtime/validation/metrics.py +380 -0
  60. kailash/runtime/validation/performance.py +615 -0
  61. kailash/runtime/validation/suggestion_engine.py +212 -0
  62. kailash/testing/fixtures.py +2 -2
  63. kailash/workflow/builder.py +230 -4
  64. kailash/workflow/contracts.py +418 -0
  65. kailash/workflow/edge_infrastructure.py +369 -0
  66. kailash/workflow/migration.py +3 -3
  67. kailash/workflow/type_inference.py +669 -0
  68. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/METADATA +43 -27
  69. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/RECORD +73 -27
  70. kailash/nexus/__init__.py +0 -21
  71. kailash/nexus/cli/__init__.py +0 -5
  72. kailash/nexus/cli/__main__.py +0 -6
  73. kailash/nexus/cli/main.py +0 -176
  74. kailash/nexus/factory.py +0 -413
  75. kailash/nexus/gateway.py +0 -545
  76. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
  77. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
  78. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
  79. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,673 @@
1
+ """Edge state management for distributed stateful operations."""
2
+
3
+ import asyncio
4
+ import hashlib
5
+ from datetime import UTC, datetime, timedelta
6
+ from enum import Enum
7
+ from typing import Any, Callable, Dict, List, Optional, Set
8
+
9
+ from kailash.edge.location import EdgeLocation
10
+ from kailash.nodes.base import NodeParameter, register_node
11
+
12
+ from .base import EdgeNode
13
+
14
+
15
class StateOperation(Enum):
    """Enumerates the operations an edge state machine can be asked to run.

    Each member's value is the lowercase string used to select the operation.
    """

    # Key/value reads and writes
    GET = "get"
    SET = "set"
    UPDATE = "update"
    DELETE = "delete"

    # In-place modifications of an existing value
    INCREMENT = "increment"
    APPEND = "append"

    # Named lock management
    LOCK = "lock"
    UNLOCK = "unlock"
26
+
27
+
28
@register_node()
class EdgeStateMachine(EdgeNode):
    """Distributed state machine with global uniqueness guarantees.

    Similar to Cloudflare Durable Objects - ensures single instance
    globally for a given state ID with automatic edge affinity.

    NOTE: ownership is tracked in the class-level ``_global_instances`` and
    ``_global_locks`` dictionaries, so the guarantee currently holds only
    between instances living in the same Python process.
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "state_id": NodeParameter(
                name="state_id",
                type=str,
                required=True,
                description="Unique identifier for this state instance",
            ),
            "operation": NodeParameter(
                name="operation",
                type=str,
                default="get",
                required=False,
                description="State operation (get|set|update|delete|increment|append|lock|unlock)",
            ),
            "key": NodeParameter(
                name="key",
                type=str,
                required=False,
                description="State key for operations",
            ),
            "value": NodeParameter(
                name="value",
                type=object,  # Can be any type
                required=False,
                description="Value to set/append",
            ),
            "update_fn": NodeParameter(
                name="update_fn",
                type=object,  # Will be validated as callable
                required=False,
                description="Update function for update operations",
            ),
            "increment": NodeParameter(
                name="increment",
                type=int,
                default=1,
                required=False,
                description="Amount to increment by",
            ),
            "lock_name": NodeParameter(
                name="lock_name",
                type=str,
                required=False,
                description="Name of lock to acquire/release",
            ),
            "timeout_ms": NodeParameter(
                name="timeout_ms",
                type=int,
                default=30000,
                required=False,
                description="Lock timeout in milliseconds",
            ),
            "lease_duration_ms": NodeParameter(
                name="lease_duration_ms",
                type=int,
                default=30000,
                required=False,
                description="Lease duration for global lock (ms)",
            ),
            "enable_persistence": NodeParameter(
                name="enable_persistence",
                type=bool,
                default=True,
                required=False,
                description="Whether to persist state to durable storage",
            ),
            "enable_replication": NodeParameter(
                name="enable_replication",
                type=bool,
                default=True,
                required=False,
                description="Whether to replicate state for availability",
            ),
        }

    # Class-level registry for global uniqueness
    _global_instances: Dict[str, "EdgeStateMachine"] = {}
    _global_locks: Dict[str, Dict[str, Any]] = {}

    def __init__(self, **config):
        """Initialize edge state machine.

        Args:
            **config: Node configuration; must contain a non-empty ``state_id``.

        Raises:
            ValueError: If ``state_id`` is missing or empty.
        """
        self.state_id = config.get("state_id")
        if not self.state_id:
            raise ValueError("state_id is required for EdgeStateMachine")

        super().__init__(**config)

        # Instance state; one timestamp so created_at == last_modified at birth
        created = datetime.now(UTC).isoformat()
        self.state_data: Dict[str, Any] = {}
        self.state_metadata: Dict[str, Any] = {
            "created_at": created,
            "version": 0,
            "last_modified": created,
            "access_count": 0,
        }

        # Locks and leases
        self.local_locks: Set[str] = set()
        # Auto-release timer per held lock, so UNLOCK can cancel it
        self._lock_release_tasks: Dict[str, asyncio.Task] = {}
        self.lease_expiry: Optional[datetime] = None
        self._lease_renewal_task: Optional[asyncio.Task] = None

        # Replication tracking
        self.replica_edges: List[EdgeLocation] = []
        self.is_primary = False
        self._background_tasks: List[asyncio.Task] = []

    async def initialize(self):
        """Initialize with global uniqueness check.

        Raises:
            RuntimeError: If another live instance already owns ``state_id``.
        """
        # Initialize parent edge infrastructure
        await super().initialize()

        # Ensure single global instance
        await self._ensure_single_instance()

        # Load persisted state if exists
        if self.config.get("enable_persistence", True):
            await self._load_persisted_state()

    async def async_run(self, **kwargs) -> Dict[str, Any]:
        """Execute one state operation.

        Args:
            **kwargs: ``operation`` (a ``StateOperation`` value, default
                ``"get"``) plus the arguments that operation needs.

        Returns:
            The operation's result dict, or a ``redirect`` dict when this
            instance is no longer the primary.

        Raises:
            ValueError: If ``operation`` is not a valid ``StateOperation`` or
                a required argument for the operation is missing.
        """
        operation = StateOperation(kwargs.get("operation", "get"))

        # Check if we're still the primary instance
        if not await self._verify_primary_status():
            # Redirect to current primary
            primary_edge = await self._find_primary_instance()
            return {
                "success": False,
                "redirect": True,
                "primary_edge": primary_edge.name if primary_edge else None,
                "message": "State instance has moved to different edge",
            }

        # Update access metadata
        self.state_metadata["access_count"] += 1
        self.state_metadata["last_accessed"] = datetime.now(UTC).isoformat()

        handlers = {
            StateOperation.GET: self._handle_get,
            StateOperation.SET: self._handle_set,
            StateOperation.UPDATE: self._handle_update,
            StateOperation.DELETE: self._handle_delete,
            StateOperation.INCREMENT: self._handle_increment,
            StateOperation.APPEND: self._handle_append,
            StateOperation.LOCK: self._handle_lock,
            StateOperation.UNLOCK: self._handle_unlock,
        }
        handler = handlers.get(operation)
        if handler is None:
            raise ValueError(f"Unknown operation: {operation}")
        return await handler(kwargs)

    async def _ensure_single_instance(self):
        """Ensure only one instance exists globally for this state_id.

        Raises:
            RuntimeError: If the global lock is held by another instance.
        """
        registry = EdgeStateMachine._global_instances

        # Re-initialising the instance that already owns the state is a no-op.
        if self.is_primary and registry.get(self.state_id) is self:
            return

        if not await self._acquire_global_lock():
            # Another instance holds an unexpired lease. Refuse even when its
            # edge cannot be resolved; registering anyway would create a
            # second primary for the same state_id.
            existing_edge = await self._find_primary_instance()
            if existing_edge:
                holder = existing_edge.name
            else:
                lock_info = EdgeStateMachine._global_locks.get(
                    f"state:{self.state_id}", {}
                )
                holder = lock_info.get("owner") or "unknown"
            raise RuntimeError(
                f"State instance {self.state_id} already exists "
                f"on edge {holder}"
            )

        # Register as global instance
        registry[self.state_id] = self
        self.is_primary = True

        # Set edge affinity for this state
        self._set_edge_affinity()

    async def _acquire_global_lock(self) -> bool:
        """Acquire global lock for state_id.

        Returns:
            ``True`` if the lock was free or expired and is now held by this
            instance, ``False`` if an unexpired lock already exists.
        """
        lock_key = f"state:{self.state_id}"
        now = datetime.now(UTC)

        held = EdgeStateMachine._global_locks.get(lock_key)
        if held is not None and now < held["expiry"]:
            return False

        # Acquire lock
        lease_duration_ms = self.config.get("lease_duration_ms", 30000)
        expiry = now + timedelta(milliseconds=lease_duration_ms)

        EdgeStateMachine._global_locks[lock_key] = {
            "owner": self.current_edge.name if self.current_edge else "unknown",
            "expiry": expiry,
            "state_id": self.state_id,
        }
        self.lease_expiry = expiry

        # Never leave two renewal loops running for the same instance
        previous = getattr(self, "_lease_renewal_task", None)
        if previous is not None and not previous.done():
            previous.cancel()

        # Start lease renewal task
        self._lease_renewal_task = asyncio.create_task(self._renew_lease())

        return True

    async def _renew_lease(self):
        """Periodically renew global lock lease.

        Runs until this instance stops being primary. The lease is extended
        at 50% of its duration. If another instance has registered itself for
        this ``state_id`` in the meantime, this instance demotes itself
        instead of overwriting the other instance's lock record.
        """
        lease_duration_ms = self.config.get("lease_duration_ms", 30000)
        renewal_interval = lease_duration_ms * 0.5 / 1000  # Renew at 50%
        lock_key = f"state:{self.state_id}"

        while True:
            await asyncio.sleep(renewal_interval)

            if not self.is_primary:
                return

            if EdgeStateMachine._global_instances.get(self.state_id) is not self:
                # Ownership moved elsewhere; stop acting as primary.
                self.is_primary = False
                return

            if self.lease_expiry:
                # Extend lease
                self.lease_expiry = datetime.now(UTC) + timedelta(
                    milliseconds=lease_duration_ms
                )
                lock_info = EdgeStateMachine._global_locks.get(lock_key)
                if lock_info is not None:
                    lock_info["expiry"] = self.lease_expiry

    def _set_edge_affinity(self):
        """Set edge affinity based on state_id hash.

        Hashes ``state_id`` and picks one edge (sorted by name) by taking the
        hash modulo the number of known edges. Does nothing when no edges are
        known.
        """
        # MD5 is used only to spread IDs across edges, not for security;
        # flagging that keeps this working on FIPS-restricted builds.
        state_hash = hashlib.md5(
            self.state_id.encode(), usedforsecurity=False
        ).hexdigest()
        hash_value = int(state_hash[:8], 16)

        # Get all edges and sort by name for consistency
        all_edges = sorted(self.edge_discovery.get_all_edges(), key=lambda e: e.name)

        if all_edges:
            # Select edge based on hash
            preferred_index = hash_value % len(all_edges)
            self.preferred_locations = [all_edges[preferred_index].name]

    async def _find_primary_instance(self) -> Optional[EdgeLocation]:
        """Find which edge hosts the primary instance.

        Returns:
            The edge named as owner in the global lock record, or ``None``
            when there is no lock record or no owner name.
        """
        # In production, this would query a distributed registry
        lock_info = EdgeStateMachine._global_locks.get(f"state:{self.state_id}")
        if lock_info is None:
            return None

        edge_name = lock_info.get("owner")
        if not edge_name:
            return None

        return self.edge_discovery.get_edge(edge_name)

    async def _verify_primary_status(self) -> bool:
        """Verify we're still the primary instance.

        Demotes this instance (``is_primary = False``) when its lease has
        expired.
        """
        if not self.is_primary:
            return False

        # Check if lease is still valid
        if self.lease_expiry and datetime.now(UTC) > self.lease_expiry:
            self.is_primary = False
            return False

        return True

    def _spawn_background(self, coro) -> asyncio.Task:
        """Schedule *coro* as a tracked background task and return the task.

        The task is removed from ``_background_tasks`` when it finishes, so
        the list holds only in-flight work.
        """
        task = asyncio.create_task(coro)
        self._background_tasks.append(task)
        task.add_done_callback(self._forget_background_task)
        return task

    def _forget_background_task(self, task: asyncio.Task) -> None:
        """Drop a finished background task and log its failure, if any."""
        try:
            self._background_tasks.remove(task)
        except ValueError:
            pass  # already removed

        if task.cancelled():
            return
        error = task.exception()  # also marks the exception as retrieved
        if error is not None:
            self.logger.warning(
                f"Background task for state {self.state_id} failed: {error}"
            )

    async def _commit_mutation(self) -> int:
        """Record a state change and return the new version number.

        Bumps ``version`` and ``last_modified``, persists the state when
        ``enable_persistence`` is set, and schedules replication in the
        background when ``enable_replication`` is set.
        """
        self.state_metadata["version"] += 1
        self.state_metadata["last_modified"] = datetime.now(UTC).isoformat()

        if self.config.get("enable_persistence", True):
            await self._persist_state()

        if self.config.get("enable_replication", True):
            self._spawn_background(self._replicate_state())

        return self.state_metadata["version"]

    async def _handle_get(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle GET operation.

        With a ``key`` returns that key's value and whether it exists;
        without one returns a copy of the whole state. The metadata in the
        result is a snapshot copy, so callers cannot mutate the node's own
        bookkeeping.
        """
        key = params.get("key")
        metadata = dict(self.state_metadata)

        if key:
            # Get specific key
            return {
                "success": True,
                "key": key,
                "value": self.state_data.get(key),
                "exists": key in self.state_data,
                "metadata": metadata,
            }

        # Get entire state
        return {
            "success": True,
            "state": self.state_data.copy(),
            "metadata": metadata,
        }

    async def _handle_set(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle SET operation.

        Raises:
            ValueError: If ``key`` is missing or empty.
        """
        key = params.get("key")
        value = params.get("value")

        if not key:
            raise ValueError("SET requires 'key'")

        old_value = self.state_data.get(key)
        self.state_data[key] = value

        version = await self._commit_mutation()

        return {
            "success": True,
            "key": key,
            "old_value": old_value,
            "new_value": value,
            "version": version,
        }

    async def _handle_update(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle UPDATE operation with function.

        Calls ``update_fn(current_value)`` and stores the result. If the
        function raises, the state is left untouched and an error dict is
        returned.

        Raises:
            ValueError: If ``key`` is missing or ``update_fn`` is not callable.
        """
        key = params.get("key")
        update_fn = params.get("update_fn")

        if not key or not callable(update_fn):
            raise ValueError("UPDATE requires 'key' and callable 'update_fn'")

        current_value = self.state_data.get(key)

        # The callable is caller-supplied, so any failure is reported rather
        # than propagated.
        try:
            new_value = update_fn(current_value)
        except Exception as e:
            return {"success": False, "error": f"Update function failed: {str(e)}"}

        self.state_data[key] = new_value

        version = await self._commit_mutation()

        return {
            "success": True,
            "key": key,
            "old_value": current_value,
            "new_value": new_value,
            "version": version,
        }

    async def _handle_delete(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle DELETE operation.

        Raises:
            ValueError: If ``key`` is missing or empty.
        """
        key = params.get("key")

        if not key:
            raise ValueError("DELETE requires 'key'")

        # Test membership explicitly: a stored value of None still counts as
        # an existing key that was deleted.
        existed = key in self.state_data
        old_value = self.state_data.pop(key, None)

        version = await self._commit_mutation()

        return {
            "success": True,
            "key": key,
            "deleted": existed,
            "old_value": old_value,
            "version": version,
        }

    async def _handle_increment(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle INCREMENT operation for numeric values.

        A missing key is treated as 0. A non-numeric current value yields an
        error dict and leaves the state untouched.

        Raises:
            ValueError: If ``key`` is missing or empty.
        """
        key = params.get("key")
        increment = params.get("increment", 1)

        if not key:
            raise ValueError("INCREMENT requires 'key'")

        current_value = self.state_data.get(key, 0)

        # Validate numeric
        if not isinstance(current_value, (int, float)):
            return {
                "success": False,
                "error": f"Cannot increment non-numeric value: {type(current_value)}",
            }

        new_value = current_value + increment
        self.state_data[key] = new_value

        version = await self._commit_mutation()

        return {
            "success": True,
            "key": key,
            "old_value": current_value,
            "new_value": new_value,
            "increment": increment,
            "version": version,
        }

    async def _handle_append(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle APPEND operation for list values.

        A missing key is treated as an empty list. A non-list current value
        yields an error dict and leaves the state untouched.

        Raises:
            ValueError: If ``key`` is missing or empty.
        """
        key = params.get("key")
        value = params.get("value")

        if not key:
            raise ValueError("APPEND requires 'key'")

        current_value = self.state_data.get(key, [])

        # Ensure it's a list
        if not isinstance(current_value, list):
            return {
                "success": False,
                "error": f"Cannot append to non-list value: {type(current_value)}",
            }

        # Build a new list rather than mutating the stored one in place
        new_value = current_value + [value]
        self.state_data[key] = new_value

        version = await self._commit_mutation()

        return {
            "success": True,
            "key": key,
            "list_size": len(new_value),
            "appended_value": value,
            "version": version,
        }

    async def _handle_lock(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle LOCK operation for distributed locking.

        Acquires the named lock on this instance and schedules its automatic
        release after ``timeout_ms`` (default 30000, matching the declared
        parameter default).

        Raises:
            ValueError: If ``lock_name`` is missing or empty.
        """
        lock_name = params.get("lock_name")
        timeout_ms = params.get("timeout_ms", 30000)

        if not lock_name:
            raise ValueError("LOCK requires 'lock_name'")

        # Check if already locked
        if lock_name in self.local_locks:
            return {
                "success": False,
                "lock_name": lock_name,
                "error": "Lock already held",
            }

        # Acquire lock
        self.local_locks.add(lock_name)

        # Set up auto-release; remember the timer so UNLOCK can cancel it
        self._lock_release_tasks[lock_name] = self._spawn_background(
            self._auto_release_lock(lock_name, timeout_ms)
        )

        return {
            "success": True,
            "lock_name": lock_name,
            "timeout_ms": timeout_ms,
            "holder": self.current_edge.name if self.current_edge else "unknown",
        }

    async def _handle_unlock(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle UNLOCK operation.

        Raises:
            ValueError: If ``lock_name`` is missing or empty.
        """
        lock_name = params.get("lock_name")

        if not lock_name:
            raise ValueError("UNLOCK requires 'lock_name'")

        # Release lock
        released = lock_name in self.local_locks
        self.local_locks.discard(lock_name)

        # Cancel the pending auto-release so it cannot fire against a later
        # holder of the same lock name.
        timer = self._lock_release_tasks.pop(lock_name, None)
        if timer is not None and not timer.done():
            timer.cancel()

        return {"success": True, "lock_name": lock_name, "released": released}

    async def _auto_release_lock(self, lock_name: str, timeout_ms: int):
        """Auto-release lock after timeout.

        Releases only if this coroutine is still the registered timer for
        ``lock_name`` (or no timer is registered), so a stale timer never
        frees a lock that was re-acquired after an unlock.
        """
        await asyncio.sleep(timeout_ms / 1000)

        timer = self._lock_release_tasks.get(lock_name)
        if timer is None or timer is asyncio.current_task():
            self.local_locks.discard(lock_name)
            self._lock_release_tasks.pop(lock_name, None)

    async def _persist_state(self):
        """Persist state to durable storage."""
        # In production, this would write to distributed storage
        # For now, simulate with delay
        await asyncio.sleep(0.01)

        self.logger.debug(
            f"Persisted state for {self.state_id} "
            f"(version: {self.state_metadata['version']})"
        )

    async def _load_persisted_state(self):
        """Load state from durable storage."""
        # In production, this would read from distributed storage
        # For now, start with empty state
        pass

    async def _replicate_state(self):
        """Replicate state to backup edges.

        Replicates to all replica edges concurrently. A failure on one edge
        is logged and does not stop replication to the others.
        """
        if not self.config.get("enable_replication", True):
            return

        # Select replica edges if not already done
        if not self.replica_edges:
            await self._select_replica_edges()

        targets = list(self.replica_edges)
        outcomes = await asyncio.gather(
            *(self._replicate_to_edge(edge) for edge in targets),
            return_exceptions=True,
        )

        for edge, outcome in zip(targets, outcomes):
            if isinstance(outcome, BaseException):
                self.logger.warning(
                    f"Replication of state {self.state_id} "
                    f"to edge {edge.name} failed: {outcome}"
                )

    async def _select_replica_edges(self):
        """Select edges for state replication.

        Picks edges other than the current one, at most one per region,
        stopping once two replicas are held.
        """
        current_name = self.current_edge.name if self.current_edge else None

        regions_seen = set()
        for edge in self.edge_discovery.get_all_edges():
            # Skip the current edge and regions that already have a replica
            if edge.name == current_name or edge.region in regions_seen:
                continue

            self.replica_edges.append(edge)
            regions_seen.add(edge.region)

            if len(self.replica_edges) >= 2:  # Keep 2 replicas
                break

    async def _replicate_to_edge(self, edge: EdgeLocation):
        """Replicate state to specific edge."""
        # In production, this would use edge-to-edge communication
        await asyncio.sleep(0.02)  # Simulate replication

        self.logger.debug(f"Replicated state {self.state_id} to edge {edge.name}")

    async def migrate_to_edge(
        self, target_edge: EdgeLocation, state_data: Optional[Dict[str, Any]] = None
    ) -> bool:
        """Migrate state machine to different edge.

        Args:
            target_edge: Edge that becomes the owner in the global lock record.
            state_data: Accepted for interface compatibility; not used here.

        Returns:
            ``True`` on success. ``False`` if this instance is not primary or
            the migration failed, in which case primary status and the lock
            owner are restored.
        """
        if not self.is_primary:
            return False

        lock_info = EdgeStateMachine._global_locks.get(f"state:{self.state_id}")
        previous_owner = lock_info.get("owner") if lock_info is not None else None

        try:
            # Transfer primary status
            self.is_primary = False

            # Update global registry
            if lock_info is not None:
                lock_info["owner"] = target_edge.name

            # Persist final state
            await self._persist_state()

            # Clean up
            EdgeStateMachine._global_instances.pop(self.state_id, None)

            return True

        except Exception as e:
            self.logger.error(f"State migration failed: {e}")
            # Undo the ownership hand-over so redirects still point here
            if lock_info is not None:
                lock_info["owner"] = previous_owner
            self.is_primary = True  # Restore primary status
            return False

    async def cleanup(self):
        """Cleanup resources including background tasks.

        Cancels the lease renewal and all tracked background tasks. If this
        instance is still the registered owner of ``state_id``, it also
        removes its registry entry and global lock, so the state can be
        re-created immediately and the instance is no longer referenced from
        the class-level registry.
        """
        # Cancel lease renewal task if running
        renewal = getattr(self, "_lease_renewal_task", None)
        if renewal is not None:
            renewal.cancel()
            try:
                await renewal
            except asyncio.CancelledError:
                pass

        # Cancel all background tasks. Work on a snapshot: done-callbacks
        # remove entries from the live list.
        pending = list(self._background_tasks)
        for task in pending:
            if not task.done():
                task.cancel()

        # Wait for all tasks to complete
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)

        # Lock timers were cancelled above; forget them
        self._lock_release_tasks.clear()

        # Release global ownership only if it is still ours. After a
        # migration the lock belongs to the target edge and must be kept.
        registry = EdgeStateMachine._global_instances
        if registry.get(self.state_id) is self:
            del registry[self.state_id]
            EdgeStateMachine._global_locks.pop(f"state:{self.state_id}", None)

        # Mark as not primary to stop renewal loop
        self.is_primary = False