puffinflow 2.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. puffinflow/__init__.py +132 -0
  2. puffinflow/core/__init__.py +110 -0
  3. puffinflow/core/agent/__init__.py +320 -0
  4. puffinflow/core/agent/base.py +1635 -0
  5. puffinflow/core/agent/checkpoint.py +50 -0
  6. puffinflow/core/agent/context.py +521 -0
  7. puffinflow/core/agent/decorators/__init__.py +90 -0
  8. puffinflow/core/agent/decorators/builder.py +454 -0
  9. puffinflow/core/agent/decorators/flexible.py +714 -0
  10. puffinflow/core/agent/decorators/inspection.py +144 -0
  11. puffinflow/core/agent/dependencies.py +57 -0
  12. puffinflow/core/agent/scheduling/__init__.py +21 -0
  13. puffinflow/core/agent/scheduling/builder.py +160 -0
  14. puffinflow/core/agent/scheduling/exceptions.py +35 -0
  15. puffinflow/core/agent/scheduling/inputs.py +137 -0
  16. puffinflow/core/agent/scheduling/parser.py +209 -0
  17. puffinflow/core/agent/scheduling/scheduler.py +413 -0
  18. puffinflow/core/agent/state.py +141 -0
  19. puffinflow/core/config.py +62 -0
  20. puffinflow/core/coordination/__init__.py +137 -0
  21. puffinflow/core/coordination/agent_group.py +359 -0
  22. puffinflow/core/coordination/agent_pool.py +629 -0
  23. puffinflow/core/coordination/agent_team.py +577 -0
  24. puffinflow/core/coordination/coordinator.py +720 -0
  25. puffinflow/core/coordination/deadlock.py +1759 -0
  26. puffinflow/core/coordination/fluent_api.py +421 -0
  27. puffinflow/core/coordination/primitives.py +478 -0
  28. puffinflow/core/coordination/rate_limiter.py +520 -0
  29. puffinflow/core/observability/__init__.py +47 -0
  30. puffinflow/core/observability/agent.py +139 -0
  31. puffinflow/core/observability/alerting.py +73 -0
  32. puffinflow/core/observability/config.py +127 -0
  33. puffinflow/core/observability/context.py +88 -0
  34. puffinflow/core/observability/core.py +147 -0
  35. puffinflow/core/observability/decorators.py +105 -0
  36. puffinflow/core/observability/events.py +71 -0
  37. puffinflow/core/observability/interfaces.py +196 -0
  38. puffinflow/core/observability/metrics.py +137 -0
  39. puffinflow/core/observability/tracing.py +209 -0
  40. puffinflow/core/reliability/__init__.py +27 -0
  41. puffinflow/core/reliability/bulkhead.py +96 -0
  42. puffinflow/core/reliability/circuit_breaker.py +149 -0
  43. puffinflow/core/reliability/leak_detector.py +122 -0
  44. puffinflow/core/resources/__init__.py +77 -0
  45. puffinflow/core/resources/allocation.py +790 -0
  46. puffinflow/core/resources/pool.py +645 -0
  47. puffinflow/core/resources/quotas.py +567 -0
  48. puffinflow/core/resources/requirements.py +217 -0
  49. puffinflow/version.py +21 -0
  50. puffinflow-2.dev0.dist-info/METADATA +334 -0
  51. puffinflow-2.dev0.dist-info/RECORD +55 -0
  52. puffinflow-2.dev0.dist-info/WHEEL +5 -0
  53. puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
  54. puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
  55. puffinflow-2.dev0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1759 @@
1
+ """
2
+ Deadlock detection for workflow execution.
3
+
4
+ This module provides comprehensive deadlock detection capabilities including:
5
+ - Dependency graph cycle detection
6
+ - Resource wait-for graph analysis
7
+ - Configurable resolution strategies
8
+ - Performance monitoring and metrics
9
+ - Memory management and cleanup
10
+ - Thread-safe operations
11
+ """
12
+
13
+ import asyncio
14
+ import logging
15
+ import time
16
+ import uuid
17
+ import weakref
18
+ from collections import deque
19
+ from dataclasses import dataclass, field
20
+ from datetime import datetime, timedelta, timezone
21
+ from enum import Enum, auto
22
+ from typing import Any, Callable, Optional, Union
23
+
24
+ # Configure logging
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class DeadlockResolutionStrategy(Enum):
    """Actions that may be taken once a deadlock has been detected."""

    # The explicit numbers are exactly what ``auto()`` would assign
    # (1, 2, 3, ... in declaration order).
    RAISE_EXCEPTION = 1
    KILL_YOUNGEST = 2
    KILL_OLDEST = 3
    KILL_LOWEST_PRIORITY = 4
    PREEMPT_RESOURCES = 5
    ROLLBACK_TRANSACTION = 6
    LOG_ONLY = 7
    CUSTOM_CALLBACK = 8
39
+
40
+
41
class DeadlockError(Exception):
    """Exception signalling that a deadlock cycle was found.

    Attributes:
        cycle: The node names forming the cycle, as passed by the caller.
        detection_id: Caller-supplied identifier, or a freshly generated
            UUID4 string when none (or an empty one) is given.
        timestamp: Timezone-aware UTC time at which the error was created.
    """

    def __init__(
        self,
        cycle: list[str],
        detection_id: Optional[str] = None,
        message: str = "Deadlock detected",
    ):
        self.cycle = cycle

        # Any falsy id (None or "") is replaced by a random UUID.
        if not detection_id:
            detection_id = str(uuid.uuid4())
        self.detection_id = detection_id

        self.timestamp = datetime.now(timezone.utc)

        rendered_cycle = " -> ".join(cycle)
        super().__init__(f"{message}: {rendered_cycle} (ID: {self.detection_id})")
54
+
55
+
56
@dataclass
class ResourceNode:
    """A resource tracked in the wait graph, plus bookkeeping about its use."""

    resource_id: str
    resource_type: str
    # Ids of the processes currently holding / waiting for this resource.
    holders: set[str] = field(default_factory=set)
    waiters: set[str] = field(default_factory=set)
    acquired_at: Optional[datetime] = None
    # Both timestamps default to "now" in UTC at construction time.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    last_accessed: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    access_count: int = 0
    # Capacity: how many distinct holders are allowed at once.
    max_holders: int = 1
    priority: int = 0

    def is_free(self) -> bool:
        """Return True while at least one holder slot is still unused."""
        return self.max_holders > len(self.holders)

    def can_acquire(self, count: int = 1) -> bool:
        """Return True if ``count`` additional holders would still fit."""
        projected_holders = len(self.holders) + count
        return projected_holders <= self.max_holders

    def age_seconds(self) -> float:
        """Seconds elapsed since ``created_at``."""
        elapsed = datetime.now(timezone.utc) - self.created_at
        return elapsed.total_seconds()

    def idle_time_seconds(self) -> float:
        """Seconds elapsed since ``last_accessed``."""
        idle = datetime.now(timezone.utc) - self.last_accessed
        return idle.total_seconds()

    def update_access(self) -> None:
        """Stamp ``last_accessed`` with the current UTC time."""
        self.last_accessed = datetime.now(timezone.utc)
90
+
91
+
92
@dataclass
class ProcessNode:
    """A process/state tracked in the wait graph, plus timing bookkeeping."""

    process_id: str
    process_name: str
    # Ids of the resources this process holds / is waiting for.
    holding: set[str] = field(default_factory=set)
    waiting_for: set[str] = field(default_factory=set)
    started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    # Set when the process starts waiting; None while it is not blocked.
    blocked_at: Optional[datetime] = None
    last_activity: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    priority: int = 0
    # Maximum blocked time in seconds; a falsy value (None or 0) disables it.
    timeout: Optional[float] = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def is_blocked(self) -> bool:
        """Return True if the process is waiting for at least one resource."""
        return bool(self.waiting_for)

    def is_timed_out(self) -> bool:
        """Return True if the process has been blocked longer than ``timeout``.

        Always False when no timeout is configured or the process has no
        ``blocked_at`` timestamp.
        """
        if self.timeout and self.blocked_at:
            waited = datetime.now(timezone.utc) - self.blocked_at
            return waited.total_seconds() > self.timeout
        return False

    def age_seconds(self) -> float:
        """Seconds elapsed since ``started_at``."""
        elapsed = datetime.now(timezone.utc) - self.started_at
        return elapsed.total_seconds()

    def blocked_duration_seconds(self) -> float:
        """Seconds spent blocked so far, or 0.0 when not blocked."""
        if not self.blocked_at:
            return 0.0
        waited = datetime.now(timezone.utc) - self.blocked_at
        return waited.total_seconds()

    def idle_time_seconds(self) -> float:
        """Seconds elapsed since ``last_activity``."""
        idle = datetime.now(timezone.utc) - self.last_activity
        return idle.total_seconds()

    def update_activity(self) -> None:
        """Stamp ``last_activity`` with the current UTC time."""
        self.last_activity = datetime.now(timezone.utc)
136
+
137
+
138
@dataclass
class CycleDetectionResult:
    """Outcome of one cycle-detection run, with size and timing figures."""

    has_cycle: bool
    # Each cycle is a list of node names.
    cycles: list[list[str]] = field(default_factory=list)
    detection_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    graph_size: int = 0
    edge_count: int = 0
    detection_duration_ms: float = 0.0
    algorithm_used: str = "dfs"

    def get_shortest_cycle(self) -> Optional[list[str]]:
        """Return the cycle with the fewest entries, or None if there are none.

        Ties go to the cycle that appears first in ``cycles``.
        """
        return min(self.cycles, key=len) if self.cycles else None

    def get_longest_cycle(self) -> Optional[list[str]]:
        """Return the cycle with the most entries, or None if there are none.

        Ties go to the cycle that appears first in ``cycles``.
        """
        return max(self.cycles, key=len) if self.cycles else None

    def get_critical_cycle(self) -> Optional[list[str]]:
        """Return the cycle considered most critical, or None if there are none.

        Currently this is simply the shortest cycle; no priority
        information is taken into account.
        """
        if self.cycles:
            return self.get_shortest_cycle()
        return None
168
+
169
+
170
class NodeCleanupStrategy:
    """Policies for choosing which graph nodes to evict first.

    Every policy returns up to ``count`` node names taken from the front of
    ``nodes`` after a stable ascending sort on one per-node metadata field.
    """

    @staticmethod
    def _first_by(
        nodes: dict[str, Any],
        metadata: dict[str, dict],
        count: int,
        attribute: str,
        fallback: Any,
    ) -> list[str]:
        """Sort node names by ``metadata[name][attribute]`` and slice ``[:count]``.

        ``fallback`` is the sort value for nodes without that metadata entry.
        """

        def sort_value(name: str) -> Any:
            return metadata.get(name, {}).get(attribute, fallback)

        ranked = list(nodes.keys())
        ranked.sort(key=sort_value)
        return ranked[:count]

    @staticmethod
    def lru_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Pick the least recently used nodes.

        Nodes with no ``last_access`` entry sort as the earliest possible
        time, so they are picked first.
        """
        earliest = datetime.min.replace(tzinfo=timezone.utc)
        return NodeCleanupStrategy._first_by(
            nodes, metadata, count, "last_access", earliest
        )

    @staticmethod
    def age_based_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Pick the oldest nodes.

        Nodes with no ``created_at`` entry sort as the latest possible time,
        so they are picked last.
        """
        latest = datetime.max.replace(tzinfo=timezone.utc)
        return NodeCleanupStrategy._first_by(
            nodes, metadata, count, "created_at", latest
        )

    @staticmethod
    def usage_based_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Pick the least used nodes.

        Nodes with no ``access_count`` entry sort as infinitely used, so
        they are picked last.
        """
        return NodeCleanupStrategy._first_by(
            nodes, metadata, count, "access_count", float("inf")
        )
209
+
210
+
211
class DependencyGraph:
    """Directed dependency graph with cycle detection and topological sort.

    An edge ``node -> depends_on`` is stored in ``nodes`` (outgoing edges)
    and mirrored in ``reverse_edges`` (incoming edges). The mutating
    coroutines serialize on an ``asyncio.Lock``; the synchronous query
    methods (``find_cycles``, ``topological_sort``, ``get_metrics``) take no
    lock.

    All graph traversals are iterative, so dependency chains deeper than the
    interpreter's recursion limit do not raise ``RecursionError``.
    """

    def __init__(
        self,
        max_nodes: int = 10000,
        cleanup_threshold: float = 0.8,
        cache_ttl: float = 5.0,
        enable_metrics: bool = True,
        prevent_cycles: bool = False,
    ):
        """Create an empty graph.

        Args:
            max_nodes: Nominal capacity used to decide when to evict nodes.
            cleanup_threshold: Fraction of ``max_nodes`` at which
                ``add_dependency`` triggers an eviction pass.
            cache_ttl: Lifetime in seconds of cached cycle/topology results.
            enable_metrics: Whether to maintain the counters in ``_metrics``.
            prevent_cycles: If True, ``add_dependency`` refuses edges that
                would close a cycle.
        """
        self.nodes: dict[str, set[str]] = {}
        self.reverse_edges: dict[str, set[str]] = {}
        self.node_metadata: dict[str, Any] = {}  # Stored exactly as provided

        # Configuration
        self.max_nodes = max_nodes
        self.cleanup_threshold = cleanup_threshold
        self.cache_ttl = cache_ttl
        self.enable_metrics = enable_metrics
        self.prevent_cycles = prevent_cycles

        # Serializes the mutating coroutines
        self._lock = asyncio.Lock()
        self._operation_count = 0

        # Caching
        self._cycle_cache: dict[str, CycleDetectionResult] = {}
        self._topology_cache: Optional[tuple[list[str], str, float]] = None

        # Metrics
        self._metrics: dict[str, Union[int, float]] = {
            "operations": 0,
            "cache_hits": 0,
            "cache_misses": 0,
            "cleanups_performed": 0,
            "nodes_cleaned": 0,
            "avg_detection_time_ms": 0.0,
        }

    async def add_dependency(
        self, node: str, depends_on: str, metadata: Optional[dict[str, Any]] = None
    ) -> bool:
        """Add the edge ``node -> depends_on``.

        Self-loops (``node == depends_on``) are deliberately accepted.

        Args:
            node: The dependent node.
            depends_on: The node it depends on.
            metadata: Optional metadata stored for ``node``, replacing any
                previous value.

        Returns:
            True if the edge was added; False if ``prevent_cycles`` is set
            and the edge would close a cycle.

        Raises:
            ValueError: If either name is empty.
        """
        if not node or not depends_on:
            raise ValueError("Node and dependency names cannot be empty")

        async with self._lock:
            self._operation_count += 1

            # Evict old nodes once the graph reaches the cleanup threshold.
            if len(self.nodes) >= self.max_nodes * self.cleanup_threshold:
                await self._cleanup_old_nodes_internal()

            # Metadata is recorded before the cycle check, so it is kept
            # even when the edge itself is rejected below.
            if metadata is not None:
                self.node_metadata[node] = metadata

            if self.prevent_cycles and self._would_create_cycle_sync(node, depends_on):
                return False

            if node not in self.nodes:
                self.nodes[node] = set()
            if depends_on not in self.reverse_edges:
                self.reverse_edges[depends_on] = set()

            self.nodes[node].add(depends_on)
            self.reverse_edges[depends_on].add(node)

            self._invalidate_caches()

            if self.enable_metrics:
                self._metrics["operations"] += 1

            return True

    def _would_create_cycle_sync(self, from_node: str, to_node: str) -> bool:
        """Return True if ``from_node`` is reachable from ``to_node``.

        That is exactly the condition under which adding the edge
        ``from_node -> to_node`` would close a cycle. Uses an explicit stack
        rather than recursion so that long chains cannot overflow the call
        stack.
        """
        seen: set[str] = set()
        pending = [to_node]

        while pending:
            current = pending.pop()
            if current == from_node:
                return True
            if current in seen:
                continue
            seen.add(current)
            pending.extend(self.nodes.get(current, ()))

        return False

    async def remove_dependency(self, node: str, depends_on: str) -> bool:
        """Remove the edge ``node -> depends_on``; return True if it existed."""
        async with self._lock:
            return await self._remove_dependency_internal(node, depends_on)

    async def _remove_dependency_internal(self, node: str, depends_on: str) -> bool:
        """Remove an edge; the caller must already hold ``_lock``.

        A node left without outgoing edges is dropped from ``nodes`` together
        with its metadata.
        """
        if node not in self.nodes or depends_on not in self.nodes[node]:
            return False

        self.nodes[node].discard(depends_on)
        if not self.nodes[node]:
            del self.nodes[node]
            self.node_metadata.pop(node, None)

        if depends_on in self.reverse_edges:
            self.reverse_edges[depends_on].discard(node)
            if not self.reverse_edges[depends_on]:
                del self.reverse_edges[depends_on]

        self._invalidate_caches()
        return True

    async def remove_node(self, node: str) -> bool:
        """Remove ``node`` and every edge touching it.

        Returns:
            True if any edge or metadata entry was removed.
        """
        async with self._lock:
            return await self._remove_node_internal(node)

    async def _remove_node_internal(self, node: str) -> bool:
        """Remove a node and its edges; the caller must already hold ``_lock``."""
        removed = False

        # Outgoing edges: unhook this node from each dependency's reverse set.
        if node in self.nodes:
            for dep in list(self.nodes[node]):
                if dep in self.reverse_edges:
                    self.reverse_edges[dep].discard(node)
                    if not self.reverse_edges[dep]:
                        del self.reverse_edges[dep]
            del self.nodes[node]
            removed = True

        # Incoming edges: dependents left with no edges are dropped as well.
        if node in self.reverse_edges:
            for dependent in list(self.reverse_edges[node]):
                if dependent in self.nodes:
                    self.nodes[dependent].discard(node)
                    if not self.nodes[dependent]:
                        del self.nodes[dependent]
                        self.node_metadata.pop(dependent, None)
            del self.reverse_edges[node]
            removed = True

        if node in self.node_metadata:
            del self.node_metadata[node]
            removed = True

        if removed:
            self._invalidate_caches()

        return removed

    async def _cleanup_old_nodes_internal(self) -> int:
        """Evict nodes down to 60% of ``max_nodes``; caller holds ``_lock``.

        Nodes are evicted in ``nodes`` dict order, i.e. earliest-inserted
        first.

        Returns:
            The number of nodes removed (0 when below the threshold).
        """
        if len(self.nodes) < self.max_nodes * self.cleanup_threshold:
            return 0

        target_size = int(self.max_nodes * 0.6)  # Clean to 60% capacity
        nodes_to_remove_count = len(self.nodes) - target_size

        if nodes_to_remove_count <= 0:
            return 0

        nodes_to_remove = list(self.nodes.keys())[:nodes_to_remove_count]

        cleaned_count = 0
        for node in nodes_to_remove:
            if await self._remove_node_internal(node):
                cleaned_count += 1

        if self.enable_metrics:
            self._metrics["cleanups_performed"] += 1
            self._metrics["nodes_cleaned"] += cleaned_count

        logger.info(f"Cleaned up {cleaned_count} nodes from dependency graph")
        return cleaned_count

    def find_cycles(self, use_cache: bool = True) -> "CycleDetectionResult":
        """Detect cycles with a depth-first search.

        Each node is expanded only once, so the result lists one cycle per
        back edge met during that single DFS pass. It is not an enumeration
        of every elementary cycle in the graph. Each cycle is reported as a
        path that starts and ends with the same node.

        The traversal uses an explicit stack of neighbor iterators, which
        visits nodes in the same order as a recursive DFS without being
        bounded by the interpreter's recursion limit.

        Args:
            use_cache: Reuse, and afterwards store, a result keyed by the
                current graph hash.

        Returns:
            A ``CycleDetectionResult`` describing the cycles found.
        """
        start_time = time.perf_counter()

        # The graph is not mutated inside this synchronous method, so the
        # hash computed here is also valid when storing the result below.
        cache_key = self._get_graph_hash() if use_cache else None

        if use_cache:
            cached_result = self._cycle_cache.get(cache_key)
            if cached_result and self._is_cache_valid(cached_result):
                if self.enable_metrics:
                    self._metrics["cache_hits"] += 1
                return cached_result

        if self.enable_metrics:
            self._metrics["cache_misses"] += 1

        cycles: list[list[str]] = []
        visited: set[str] = set()

        def walk_from(root: str) -> None:
            """Run one DFS from ``root``, recording a cycle per back edge."""
            path = [root]  # Nodes on the current DFS path, in order
            on_path = {root}  # Same nodes, for O(1) membership tests
            frames = [iter(self.nodes.get(root, ()))]
            visited.add(root)

            while frames:
                for neighbor in frames[-1]:
                    if neighbor in on_path:
                        # Back edge: the cycle is the path tail from the
                        # neighbor's first occurrence, closed by the neighbor.
                        cycles.append([*path[path.index(neighbor):], neighbor])
                        continue
                    if neighbor in visited:
                        continue
                    # Descend; this frame's iterator resumes after the child.
                    visited.add(neighbor)
                    on_path.add(neighbor)
                    path.append(neighbor)
                    frames.append(iter(self.nodes.get(neighbor, ())))
                    break
                else:
                    # All neighbors handled: backtrack one level.
                    frames.pop()
                    on_path.discard(path.pop())

        # Try every node as a root so disconnected components are covered.
        for node in list(self.nodes.keys()):
            if node not in visited:
                walk_from(node)

        detection_duration = (time.perf_counter() - start_time) * 1000

        edge_count = sum(len(deps) for deps in self.nodes.values())

        result = CycleDetectionResult(
            has_cycle=len(cycles) > 0,
            cycles=cycles,
            graph_size=len(self.nodes),
            edge_count=edge_count,
            detection_duration_ms=detection_duration,
            algorithm_used="dfs",
        )

        if use_cache:
            self._cycle_cache[cache_key] = result

        # Exponential moving average of the detection time.
        if self.enable_metrics:
            alpha = 0.1
            self._metrics["avg_detection_time_ms"] = (
                alpha * detection_duration
                + (1 - alpha) * self._metrics["avg_detection_time_ms"]
            )

        return result

    def topological_sort(self) -> Optional[list[str]]:
        """Return a topological order of the graph, or None if it has a cycle.

        In-degrees are counted on the dependency side of each edge, so a
        node appears before the nodes it depends on. An empty graph yields
        ``[]``.
        """
        # Serve from cache while the graph hash matches and the TTL holds.
        if self._topology_cache:
            result, graph_hash, timestamp = self._topology_cache
            if (
                self._get_graph_hash() == graph_hash
                and time.time() - timestamp < self.cache_ttl
            ):
                return result

        cycle_result = self.find_cycles()
        if cycle_result.has_cycle:
            return None

        # Every node, including those that only appear as dependencies.
        all_nodes = set(self.nodes.keys())
        for deps in self.nodes.values():
            all_nodes.update(deps)

        if not all_nodes:
            return []

        # Kahn's algorithm
        in_degree = dict.fromkeys(all_nodes, 0)

        for node in self.nodes:
            for dep in self.nodes[node]:
                in_degree[dep] += 1

        queue = deque([node for node, degree in in_degree.items() if degree == 0])
        result = []

        while queue:
            node = queue.popleft()
            result.append(node)

            for neighbor in self.nodes.get(node, []):
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    queue.append(neighbor)

        # Unprocessed nodes mean a cycle slipped past the check above.
        if len(result) != len(all_nodes):
            return None

        self._topology_cache = (result, self._get_graph_hash(), time.time())
        return result

    def _get_graph_hash(self) -> str:
        """Return a cheap fingerprint: node count, edge count, operation count."""
        edge_count = sum(len(deps) for deps in self.nodes.values())
        return f"{len(self.nodes)}:{edge_count}:{self._operation_count}"

    def _is_cache_valid(self, result: "CycleDetectionResult") -> bool:
        """Return True while ``result`` is younger than ``cache_ttl`` seconds."""
        return (time.time() - result.detection_time.timestamp()) < self.cache_ttl

    def _invalidate_caches(self) -> None:
        """Drop the cached cycle results and the cached topological order."""
        self._cycle_cache.clear()
        self._topology_cache = None

    def get_metrics(self) -> dict[str, Any]:
        """Return the metric counters plus current graph and cache sizes."""
        return {
            **self._metrics,
            "node_count": len(self.nodes),
            "edge_count": sum(len(deps) for deps in self.nodes.values()),
            "cache_size": len(self._cycle_cache),
            "operation_count": self._operation_count,
        }

    async def health_check(self) -> dict[str, Any]:
        """Return a status snapshot taken under ``_lock``.

        ``cache_hit_rate`` is a percentage (0 when metrics are disabled).
        """
        async with self._lock:
            return {
                "status": "healthy",
                "node_count": len(self.nodes),
                "memory_usage_percent": len(self.nodes) / self.max_nodes * 100,
                "cache_hit_rate": (
                    (
                        self._metrics["cache_hits"]
                        / max(
                            1,
                            self._metrics["cache_hits"] + self._metrics["cache_misses"],
                        )
                    )
                    * 100
                    if self.enable_metrics
                    else 0
                ),
                # NOTE(review): nothing in this class writes "last_cleanup",
                # so this is None unless the key is set externally.
                "last_cleanup": self._metrics.get("last_cleanup"),
                "needs_cleanup": len(self.nodes)
                >= self.max_nodes * self.cleanup_threshold,
            }
585
+
586
+
587
+ class ResourceWaitGraph:
588
+ """Enhanced wait-for graph for resource-based deadlock detection"""
589
+
590
def __init__(
    self,
    max_resources: int = 5000,
    max_processes: int = 5000,
    cleanup_interval: float = 300.0,
    enable_timeouts: bool = True,
):
    """Create an empty wait-for graph.

    Args:
        max_resources: Resource count at which ``add_resource`` runs a
            cleanup pass before inserting.
        max_processes: Process count at which ``add_process`` runs a
            cleanup pass before inserting.
        cleanup_interval: Idle time in seconds after which an unused
            resource or process becomes eligible for cleanup.
        enable_timeouts: Whether ``detect_deadlock`` expires timed-out
            waits before analysing the graph.
    """
    # Configuration
    self.max_resources = max_resources
    self.max_processes = max_processes
    self.cleanup_interval = cleanup_interval
    self.enable_timeouts = enable_timeouts

    # Graph state, keyed by resource id / process id
    self.resources: dict[str, ResourceNode] = {}
    self.processes: dict[str, ProcessNode] = {}

    # Serializes the mutating coroutines
    self._lock = asyncio.Lock()

    # Cached wait-for graph; rebuilt whenever the invalidation flag is set
    self._wait_graph_cache: Optional[DependencyGraph] = None
    self._cache_invalidated = True
    self._last_cleanup = datetime.now(timezone.utc)

    # Event counters, all starting at zero
    self._metrics: dict[str, int] = dict.fromkeys(
        (
            "resource_acquisitions",
            "resource_releases",
            "deadlock_detections",
            "timeouts",
            "preemptions",
        ),
        0,
    )
622
+
623
async def add_resource(
    self,
    resource_id: str,
    resource_type: str = "generic",
    max_holders: int = 1,
    priority: int = 0,
) -> bool:
    """Register a resource unless one with the same id already exists.

    When the graph already tracks ``max_resources`` or more resources, a
    cleanup pass runs first; the new resource is then added regardless of
    how much that pass freed.

    Returns:
        True if the resource was created, False if the id was already known.

    Raises:
        ValueError: If ``resource_id`` is empty.
    """
    if not resource_id:
        raise ValueError("Resource ID cannot be empty")

    async with self._lock:
        at_capacity = len(self.resources) >= self.max_resources
        if at_capacity:
            await self._cleanup_old_resources_internal()

        if resource_id in self.resources:
            return False

        self.resources[resource_id] = ResourceNode(
            resource_id=resource_id,
            resource_type=resource_type,
            max_holders=max_holders,
            priority=priority,
        )
        self._cache_invalidated = True
        return True
648
+
649
async def add_process(
    self,
    process_id: str,
    process_name: str = "",
    priority: int = 0,
    timeout: Optional[float] = None,
) -> bool:
    """Register a process unless one with the same id already exists.

    An empty ``process_name`` falls back to the process id. When the graph
    already tracks ``max_processes`` or more processes, a cleanup pass runs
    first; the new process is then added regardless of how much that pass
    freed.

    Returns:
        True if the process was created, False if the id was already known.

    Raises:
        ValueError: If ``process_id`` is empty.
    """
    if not process_id:
        raise ValueError("Process ID cannot be empty")

    async with self._lock:
        at_capacity = len(self.processes) >= self.max_processes
        if at_capacity:
            await self._cleanup_old_processes_internal()

        if process_id in self.processes:
            return False

        display_name = process_name or process_id
        self.processes[process_id] = ProcessNode(
            process_id=process_id,
            process_name=display_name,
            priority=priority,
            timeout=timeout,
        )
        self._cache_invalidated = True
        return True
674
+
675
async def acquire_resource(
    self,
    process_id: str,
    resource_id: str,
    count: int = 1,
    timeout: Optional[float] = None,
) -> bool:
    """Try to acquire ``resource_id`` for ``process_id`` under the graph lock.

    All arguments are forwarded unchanged to ``_acquire_resource_internal``.

    Returns:
        Whatever ``_acquire_resource_internal`` returns: True when the
        resource was acquired, False when the process was queued as a waiter.
    """
    async with self._lock:
        acquired = await self._acquire_resource_internal(
            process_id, resource_id, count, timeout
        )
    return acquired
687
+
688
async def _acquire_resource_internal(
    self,
    process_id: str,
    resource_id: str,
    count: int = 1,
    timeout: Optional[float] = None,
) -> bool:
    """Acquire or queue for a resource; the caller must already hold the lock.

    Unknown resources and processes are registered on the fly (``timeout``
    is only applied to a process created here). ``count`` is used for the
    capacity check only; the process is recorded as a single holder.

    Returns:
        True if the process now holds the resource; False if it was added
        to the resource's waiters instead.

    Raises:
        ValueError: If ``count`` is not positive.
    """
    if count <= 0:
        raise ValueError("Count must be positive")

    # Lazily register anything we have not seen before.
    if resource_id not in self.resources:
        self.resources[resource_id] = ResourceNode(
            resource_id=resource_id, resource_type="generic"
        )
        self._cache_invalidated = True

    if process_id not in self.processes:
        self.processes[process_id] = ProcessNode(
            process_id=process_id, process_name=process_id, timeout=timeout
        )
        self._cache_invalidated = True

    resource = self.resources[resource_id]
    process = self.processes[process_id]

    # A process already queued on this resource stays queued even when
    # capacity is available; it is granted via the waiter-processing path.
    must_wait = not resource.can_acquire(count) or process_id in resource.waiters

    if must_wait:
        resource.waiters.add(process_id)
        process.waiting_for.add(resource_id)
        # Keep the earliest blocked timestamp if the process was already blocked.
        if process.blocked_at is None:
            process.blocked_at = datetime.now(timezone.utc)

        self._cache_invalidated = True
        return False

    # Grant: record the holder on the resource side ...
    resource.holders.add(process_id)
    resource.acquired_at = datetime.now(timezone.utc)
    resource.access_count += 1
    resource.update_access()

    # ... and on the process side.
    process.holding.add(resource_id)
    process.waiting_for.discard(resource_id)
    process.update_activity()

    # The process is only unblocked once it waits for nothing at all.
    if not process.waiting_for:
        process.blocked_at = None

    self._cache_invalidated = True
    self._metrics["resource_acquisitions"] += 1
    return True
744
+
745
async def release_resource(
    self, process_id: str, resource_id: str, count: int = 1
) -> bool:
    """Release ``resource_id`` on behalf of ``process_id``.

    After the holder is removed, waiting processes get a chance to acquire
    the resource. ``count`` is validated but otherwise unused.

    Returns:
        False if either the resource or the process is unknown; True
        otherwise (also when the process was not actually a holder).

    Raises:
        ValueError: If ``count`` is not positive.
    """
    if count <= 0:
        raise ValueError("Count must be positive")

    async with self._lock:
        known = resource_id in self.resources and process_id in self.processes
        if not known:
            return False

        resource = self.resources[resource_id]
        process = self.processes[process_id]

        # Drop the holder relationship on both sides.
        resource.holders.discard(process_id)
        process.holding.discard(resource_id)
        process.update_activity()
        resource.update_access()

        # Hand the freed capacity to waiters (lock is already held).
        await self._process_waiters_internal(resource_id)

        self._cache_invalidated = True
        self._metrics["resource_releases"] += 1
        return True
771
+
772
async def _process_waiters_internal(self, resource_id: str) -> None:
    """Grant a resource to its waiters while capacity lasts.

    The caller must already hold the lock. Waiters are served in order of
    descending priority, then earliest ``blocked_at``; waiters that are not
    registered processes are ranked like a default ``ProcessNode``.
    """
    if resource_id not in self.resources:
        return

    resource = self.resources[resource_id]
    if not resource.waiters:
        return

    stand_in = ProcessNode("", "")
    far_future = datetime.max.replace(tzinfo=timezone.utc)

    def wake_order(pid: str):
        candidate = self.processes.get(pid, stand_in)
        # Negated priority puts higher priorities first in an ascending sort.
        return (-candidate.priority, candidate.blocked_at or far_future)

    for waiter_id in sorted(resource.waiters, key=wake_order):
        if not resource.can_acquire(1):
            break
        # Dequeue first: the acquire path refuses processes still listed as waiters.
        resource.waiters.remove(waiter_id)
        await self._acquire_resource_internal(waiter_id, resource_id)
798
+
799
async def detect_deadlock(self) -> CycleDetectionResult:
    """Look for cycles in the process wait-for graph.

    Timed-out waits are expired first when ``enable_timeouts`` is set. The
    wait-for graph is rebuilt only if it was invalidated since the last
    run; otherwise the cached graph is analysed again.

    Returns:
        The result of ``find_cycles()`` on the wait-for graph.
    """
    async with self._lock:
        if self.enable_timeouts:
            await self._handle_timeouts_internal()

        needs_rebuild = self._cache_invalidated or self._wait_graph_cache is None
        if needs_rebuild:
            wait_graph = DependencyGraph()
            self._wait_graph_cache = wait_graph

            # One edge waiter -> holder for every process waiting on a
            # resource that a different process currently holds.
            wait_edges = (
                (waiter, holder)
                for resource in self.resources.values()
                for waiter in resource.waiters
                for holder in resource.holders
                if waiter != holder
            )
            for waiter, holder in wait_edges:
                await wait_graph.add_dependency(waiter, holder)

            self._cache_invalidated = False

        outcome = self._wait_graph_cache.find_cycles()
        self._metrics["deadlock_detections"] += 1
        return outcome
825
+
826
async def _handle_timeouts_internal(self) -> None:
    """Expire every process whose wait has timed out.

    The caller must already hold the lock. The ids are collected up front
    and each one is then passed to ``_timeout_process_internal``; the
    ``timeouts`` metric is bumped once per expired process.
    """
    expired_ids = [
        proc.process_id for proc in self.processes.values() if proc.is_timed_out()
    ]

    for expired_id in expired_ids:
        await self._timeout_process_internal(expired_id)
        self._metrics["timeouts"] += 1
837
+
838
async def _timeout_process_internal(self, process_id: str) -> None:
    """Cancel every pending wait of a timed-out process.

    The caller must already hold the lock. The process is removed from the
    waiters of each resource it was waiting for and marked as no longer
    blocked. Unknown process ids are ignored.

    Removing waits changes the wait-for edges, so the cached wait-for graph
    is flagged as invalid. Without this, a later deadlock detection could
    reuse the stale graph and report a cycle through a process that has
    already given up waiting.
    """
    if process_id not in self.processes:
        return

    process = self.processes[process_id]

    # Iterate over a copy; resources that no longer exist are skipped.
    for resource_id in list(process.waiting_for):
        if resource_id in self.resources:
            self.resources[resource_id].waiters.discard(process_id)

    process.waiting_for.clear()
    process.blocked_at = None

    # Force the next detection run to rebuild the wait-for graph.
    self._cache_invalidated = True

    logger.warning(f"Process {process_id} timed out after waiting")
854
+
855
async def _cleanup_old_resources_internal(self) -> int:
    """Drop stale, unused resources; the caller must already hold the lock.

    A resource is stale when it has free capacity, no waiters, and was last
    accessed more than ``cleanup_interval`` seconds ago. At most a quarter
    of the currently tracked resources are removed per call.
    ``_last_cleanup`` is updated even when nothing is removed.

    Returns:
        The number of resources removed.
    """
    now = datetime.now(timezone.utc)
    max_idle = timedelta(seconds=self.cleanup_interval)

    def is_stale(resource) -> bool:
        return (
            resource.is_free()
            and len(resource.waiters) == 0
            and now - resource.last_accessed > max_idle
        )

    stale_ids = [rid for rid, resource in self.resources.items() if is_stale(resource)]

    # Cap a single pass at 25% of the current population.
    quota = len(self.resources) // 4
    doomed = stale_ids[:quota]
    for rid in doomed:
        del self.resources[rid]

    self._last_cleanup = now
    return len(doomed)
877
+
878
async def _cleanup_old_processes_internal(self) -> int:
    """Drop stale, inactive processes; the caller must already hold the lock.

    A process is stale when it holds nothing, waits for nothing, and was
    last active more than ``cleanup_interval`` seconds ago. At most a
    quarter of the currently tracked processes are removed per call.

    Returns:
        The number of processes removed.
    """
    now = datetime.now(timezone.utc)
    max_idle = timedelta(seconds=self.cleanup_interval)

    def is_stale(process) -> bool:
        return (
            len(process.holding) == 0
            and len(process.waiting_for) == 0
            and now - process.last_activity > max_idle
        )

    stale_ids = [pid for pid, process in self.processes.items() if is_stale(process)]

    # Cap a single pass at 25% of the current population.
    quota = len(self.processes) // 4
    doomed = stale_ids[:quota]
    for pid in doomed:
        del self.processes[pid]

    return len(doomed)
899
+
900
def get_blocked_processes(self) -> list[ProcessNode]:
    """Return the processes that currently report themselves as blocked.

    The result is a new list, in process-table order, of every process whose
    ``is_blocked()`` returns a truthy value.
    """
    blocked = []
    for proc in self.processes.values():
        if proc.is_blocked():
            blocked.append(proc)
    return blocked
903
+
904
def get_resource_holders(self, resource_id: str) -> set[str]:
    """Return a copy of the holder set of ``resource_id``.

    An unknown resource id yields an empty set.  The returned set is
    independent of the graph's own bookkeeping.
    """
    if resource_id not in self.resources:
        return set()
    holders = self.resources[resource_id].holders
    return holders.copy()
909
+
910
def get_resource_waiters(self, resource_id: str) -> set[str]:
    """Return a copy of the waiter set of ``resource_id``.

    An unknown resource id yields an empty set.  The returned set is
    independent of the graph's own bookkeeping.
    """
    if resource_id not in self.resources:
        return set()
    waiters = self.resources[resource_id].waiters
    return waiters.copy()
915
+
916
def get_resource_stats(self) -> dict[str, Any]:
    """Summarise resource usage across the whole graph.

    Returns:
        A dict with the resource totals (all / free / utilized), the summed
        holder and waiter counts, ``average_utilization`` (utilized divided
        by the resource count, with the divisor clamped to at least 1) and
        the number of blocked processes.
    """
    total = len(self.resources)

    free = 0
    holder_total = 0
    waiter_total = 0
    for res in self.resources.values():
        if res.is_free():
            free += 1
        holder_total += len(res.holders)
        waiter_total += len(res.waiters)

    utilized = total - free
    blocked = self.get_blocked_processes()

    stats: dict[str, Any] = {}
    stats["total_resources"] = total
    stats["free_resources"] = free
    stats["utilized_resources"] = utilized
    stats["total_holders"] = holder_total
    stats["total_waiters"] = waiter_total
    # max(1, ...) keeps the ratio defined for an empty graph.
    stats["average_utilization"] = utilized / max(1, total)
    stats["blocked_processes"] = len(blocked)
    return stats
933
+
934
def get_metrics(self) -> dict[str, Any]:
    """Return the graph's counters merged with the current resource stats.

    Keys from ``get_resource_stats()`` override same-named counters, and
    ``total_processes`` (size of the process table) is added last.
    """
    combined: dict[str, Any] = dict(self._metrics)
    combined.update(self.get_resource_stats())
    combined["total_processes"] = len(self.processes)
    return combined
941
+
942
+
943
+ class DeadlockDetector:
944
+ """Production-grade deadlock detection with comprehensive monitoring and resolution"""
945
+
946
def __init__(
    self,
    agent: Any,
    detection_interval: float = 1.0,
    max_cycles: int = 100,
    resolution_strategy: DeadlockResolutionStrategy = DeadlockResolutionStrategy.LOG_ONLY,
    enable_metrics: bool = True,
    enable_health_monitoring: bool = True,
    max_resolution_attempts: int = 3,
):
    """Set up a detector; no background work starts until ``start()``.

    Args:
        agent: Owning agent.  A truthy agent is stored as a weak proxy; a
            falsy one is stored as ``None``.
        detection_interval: Seconds between detection cycles.
        max_cycles: Stored on the instance as ``max_cycles``.
        resolution_strategy: Strategy applied when a deadlock is found.
        enable_metrics: Passed to the dependency graph and used to gate
            the detection-time average.
        enable_health_monitoring: Whether ``start()`` also launches the
            health-monitoring task.
        max_resolution_attempts: Upper bound on resolution retries per
            detected deadlock.
    """
    # Configuration
    if agent:
        self.agent = weakref.proxy(agent)
    else:
        self.agent = None
    self.detection_interval = detection_interval
    self.max_cycles = max_cycles
    self.resolution_strategy = resolution_strategy
    self.enable_metrics = enable_metrics
    self.enable_health_monitoring = enable_health_monitoring
    self.max_resolution_attempts = max_resolution_attempts

    # The two graphs that are scanned for cycles
    self._dependency_graph = DependencyGraph(enable_metrics=enable_metrics)
    self._resource_graph = ResourceWaitGraph()

    # Background tasks and the primitives that coordinate them
    self._lock = asyncio.Lock()
    self._detection_task: Optional[asyncio.Task] = None
    self._health_task: Optional[asyncio.Task] = None
    self._shutdown_event = asyncio.Event()

    # Detection bookkeeping; both histories are bounded deques
    self._cycle_count = 0
    self._last_cycle: Optional[list[str]] = None
    self._detection_history: deque = deque(maxlen=1000)
    self._resolution_history: deque = deque(maxlen=100)

    # Counters and gauges (``last_error`` is the one string-valued entry)
    self._metrics: dict[str, Union[int, float]] = {
        "total_detections": 0,
        "deadlocks_found": 0,
        "deadlocks_resolved": 0,
        "detection_errors": 0,
        "resolution_failures": 0,
        "avg_detection_time_ms": 0.0,
        "uptime_seconds": 0.0,
        "last_error": "",  # type: ignore
    }

    # User-supplied hooks
    self._resolution_callbacks: list[Callable[[list[str]], bool]] = []
    self._notification_callbacks: list[Callable[[str, dict[str, Any]], None]] = []

    # Health state
    self._health_status = "initializing"
    self._last_successful_detection = datetime.now(timezone.utc)
    self._start_time = datetime.now(timezone.utc)
1000
+
1001
async def start(self) -> bool:
    """Launch the background detection task (and health task, if enabled).

    Returns:
        ``True`` once the tasks have been created and listeners notified;
        ``False`` when a detection task is already running or when start-up
        raised an exception (the error text is kept in
        ``_metrics["last_error"]`` and the health status becomes ``"error"``).
    """
    try:
        async with self._lock:
            current = self._detection_task
            if current and not current.done():
                logger.warning("Deadlock detector already running")
                return False

            self._shutdown_event.clear()
            self._health_status = "starting"

            self._detection_task = asyncio.create_task(self._detection_loop())
            if self.enable_health_monitoring:
                self._health_task = asyncio.create_task(
                    self._health_monitoring_loop()
                )

            self._health_status = "running"
            self._start_time = datetime.now(timezone.utc)

            strategy_name = self.resolution_strategy.name
            logger.info(f"Deadlock detector started with strategy: {strategy_name}")
            await self._notify(
                "deadlock_detector_started", {"strategy": strategy_name}
            )
            return True

    except Exception as exc:
        self._health_status = "error"
        self._metrics["last_error"] = str(exc)  # type: ignore
        logger.error(f"Failed to start deadlock detector: {exc}")
        return False
1039
+
1040
async def stop(self, timeout: float = 10.0) -> bool:
    """Signal shutdown, cancel the background tasks and wait for them.

    Args:
        timeout: Maximum number of seconds to wait for the cancelled tasks
            to finish; exceeding it only produces a warning.

    Returns:
        ``True`` when the detector reached the ``"stopped"`` state,
        ``False`` when stopping raised an exception.
    """
    try:
        async with self._lock:
            self._health_status = "stopping"
            self._shutdown_event.set()

            # Only tasks that were actually created need cancelling.
            pending = [
                task for task in (self._detection_task, self._health_task) if task
            ]

            if pending:
                for task in pending:
                    task.cancel()

                waiter = asyncio.gather(*pending, return_exceptions=True)
                try:
                    await asyncio.wait_for(waiter, timeout=timeout)
                except asyncio.TimeoutError:
                    logger.warning(
                        "Some tasks did not stop gracefully within timeout"
                    )

            self._detection_task = None
            self._health_task = None
            self._health_status = "stopped"

            logger.info("Deadlock detector stopped")
            await self._notify("deadlock_detector_stopped", {})
            return True

    except Exception as exc:
        self._health_status = "error"
        logger.error(f"Error stopping deadlock detector: {exc}")
        return False
1080
+
1081
+ async def _detection_loop(self) -> None:
1082
+ """Main detection loop with comprehensive error handling"""
1083
+ consecutive_errors = 0
1084
+ max_consecutive_errors = 5
1085
+
1086
+ try:
1087
+ while not self._shutdown_event.is_set():
1088
+ try:
1089
+ # Wait for next detection cycle
1090
+ await asyncio.wait_for(
1091
+ self._shutdown_event.wait(), timeout=self.detection_interval
1092
+ )
1093
+ if self._shutdown_event.is_set():
1094
+ break
1095
+
1096
+ except asyncio.TimeoutError:
1097
+ pass # Normal timeout, continue with detection
1098
+
1099
+ detection_start = time.perf_counter()
1100
+
1101
+ try:
1102
+ # Perform detection
1103
+ await self._perform_detection_cycle()
1104
+
1105
+ # Update metrics
1106
+ detection_duration = (time.perf_counter() - detection_start) * 1000
1107
+ self._update_detection_metrics(detection_duration)
1108
+
1109
+ # Reset error counter on successful detection
1110
+ consecutive_errors = 0
1111
+ self._last_successful_detection = datetime.now(timezone.utc)
1112
+
1113
+ except Exception as detection_error:
1114
+ consecutive_errors += 1
1115
+ self._metrics["detection_errors"] += 1
1116
+ self._metrics["last_error"] = str(detection_error) # type: ignore
1117
+
1118
+ logger.error(f"Detection cycle error: {detection_error}")
1119
+
1120
+ # Implement exponential backoff on errors
1121
+ if consecutive_errors >= max_consecutive_errors:
1122
+ logger.critical(
1123
+ f"Too many consecutive errors ({consecutive_errors}), stopping detection"
1124
+ )
1125
+ self._health_status = "error"
1126
+ break
1127
+
1128
+ # Exponential backoff with jitter
1129
+ error_delay = min(
1130
+ self.detection_interval
1131
+ * (2**consecutive_errors)
1132
+ * (0.5 + 0.5 * time.time() % 1),
1133
+ 60.0,
1134
+ )
1135
+ await asyncio.sleep(error_delay)
1136
+
1137
+ except asyncio.CancelledError:
1138
+ logger.info("Detection loop cancelled")
1139
+ except Exception as e:
1140
+ logger.critical(f"Unexpected error in detection loop: {e}")
1141
+ self._health_status = "error"
1142
+ self._metrics["last_error"] = str(e) # type: ignore
1143
+
1144
+ async def _perform_detection_cycle(self) -> None:
1145
+ """Perform a single detection cycle"""
1146
+ self._metrics["total_detections"] += 1
1147
+
1148
+ # Check state dependencies
1149
+ state_result = self._dependency_graph.find_cycles()
1150
+ if state_result.has_cycle:
1151
+ await self._handle_deadlock_detection(state_result, "dependency_graph")
1152
+
1153
+ # Check resource wait graph
1154
+ resource_result = await self._resource_graph.detect_deadlock()
1155
+ if resource_result.has_cycle:
1156
+ await self._handle_deadlock_detection(resource_result, "resource_graph")
1157
+
1158
+ # Keep detection history
1159
+ self._detection_history.append(
1160
+ {
1161
+ "timestamp": datetime.now(timezone.utc),
1162
+ "state_cycles": len(state_result.cycles),
1163
+ "resource_cycles": len(resource_result.cycles),
1164
+ "total_cycles": len(state_result.cycles) + len(resource_result.cycles),
1165
+ }
1166
+ )
1167
+
1168
async def _handle_deadlock_detection(
    self, result: CycleDetectionResult, source: str
) -> None:
    """React to a detected deadlock: log, notify, then try to resolve it.

    Resolution is attempted up to ``max_resolution_attempts`` times.  Each
    attempt offers the cycle to the registered resolution callbacks first
    and falls back to the configured strategy.  Failed attempts are
    separated by a progressively longer pause.  The outcome is appended to
    the resolution history.

    Args:
        result: Detection result whose critical cycle is to be resolved.
        source: Label of the check that produced ``result``.

    Raises:
        DeadlockError: When the deadlock stays unresolved and the strategy
            is ``RAISE_EXCEPTION``.
    """
    self._cycle_count += 1
    self._last_cycle = result.get_critical_cycle()
    self._metrics["deadlocks_found"] += 1

    detection_id = str(uuid.uuid4())
    cycle_total = len(result.cycles)

    logger.error(
        f"Deadlock detected from {source} (ID: {detection_id}): "
        f"cycle_count={self._cycle_count}, "
        f"cycle={self._last_cycle}, "
        f"total_cycles={cycle_total}"
    )

    payload = {
        "detection_id": detection_id,
        "source": source,
        "cycle": self._last_cycle,
        "total_cycles": cycle_total,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    await self._notify("deadlock_detected", payload)

    attempts_made = 0
    resolved = False

    while not resolved and attempts_made < self.max_resolution_attempts:
        attempts_made += 1

        try:
            # Custom callbacks get the first chance at the cycle.
            for hook in self._resolution_callbacks:
                try:
                    if self._last_cycle and await self._run_callback_safely(
                        hook, self._last_cycle
                    ):
                        resolved = True
                        self._metrics["deadlocks_resolved"] += 1
                        logger.info(
                            f"Deadlock {detection_id} resolved by custom callback (attempt {attempts_made})"
                        )
                        break
                except Exception as hook_error:
                    logger.error(f"Resolution callback failed: {hook_error}")

            # Fall back to the configured strategy.
            if not resolved and self._last_cycle:
                resolved = await self._apply_resolution_strategy(
                    self._last_cycle, detection_id
                )

            if resolved:
                break

        except Exception as attempt_error:
            logger.error(f"Resolution attempt {attempts_made} failed: {attempt_error}")

        # Progressive delay before the next attempt, if one remains.
        if attempts_made < self.max_resolution_attempts:
            await asyncio.sleep(0.1 * attempts_made)

    outcome = {
        "detection_id": detection_id,
        "cycle": self._last_cycle,
        "resolved": resolved,
        "attempts": attempts_made,
        "strategy": self.resolution_strategy.name,
        "timestamp": datetime.now(timezone.utc),
    }
    self._resolution_history.append(outcome)

    if resolved:
        return

    self._metrics["resolution_failures"] += 1

    if self.resolution_strategy == DeadlockResolutionStrategy.RAISE_EXCEPTION:
        # An empty/None cycle is reported as an empty list.
        raise DeadlockError(self._last_cycle or [], detection_id)
1257
+
1258
+ # Add the missing method alias for backward compatibility
1259
+ async def _handle_deadlock(
1260
+ self, result: CycleDetectionResult, source: str = "test"
1261
+ ) -> None:
1262
+ """Handle detected deadlock (alias for backward compatibility)"""
1263
+ return await self._handle_deadlock_detection(result, source)
1264
+
1265
+ async def _run_callback_safely(self, callback: Callable, cycle: list[str]) -> bool:
1266
+ """Run callback safely with timeout"""
1267
+ try:
1268
+ if asyncio.iscoroutinefunction(callback):
1269
+ return await asyncio.wait_for(callback(cycle), timeout=5.0)
1270
+ else:
1271
+ # Run sync callback in thread pool
1272
+ loop = asyncio.get_event_loop()
1273
+ return await loop.run_in_executor(None, callback, cycle)
1274
+ except asyncio.TimeoutError:
1275
+ logger.warning("Resolution callback timed out")
1276
+ return False
1277
+
1278
async def _apply_resolution_strategy(
    self, cycle: list[str], detection_id: str
) -> bool:
    """Run the handler that belongs to the configured resolution strategy.

    ``LOG_ONLY`` counts as resolved without doing anything.  The kill and
    preempt strategies delegate to their dedicated methods.  Any other
    strategy, and any exception raised along the way, yields ``False``.
    """
    try:
        strategy = self.resolution_strategy

        if strategy == DeadlockResolutionStrategy.LOG_ONLY:
            return True  # Just log, consider resolved

        handlers = (
            (DeadlockResolutionStrategy.KILL_YOUNGEST, self._kill_youngest_process),
            (DeadlockResolutionStrategy.KILL_OLDEST, self._kill_oldest_process),
            (
                DeadlockResolutionStrategy.KILL_LOWEST_PRIORITY,
                self._kill_lowest_priority_process,
            ),
            (DeadlockResolutionStrategy.PREEMPT_RESOURCES, self._preempt_resources),
        )
        for candidate, handler in handlers:
            if strategy == candidate:
                return await handler(cycle, detection_id)

        return False

    except Exception as exc:
        logger.error(
            f"Resolution strategy {self.resolution_strategy.name} failed: {exc}"
        )
        return False
1310
+
1311
+ async def _kill_youngest_process(self, cycle: list[str], detection_id: str) -> bool:
1312
+ """Kill the youngest process in the cycle"""
1313
+ try:
1314
+ valid_processes = [
1315
+ pid for pid in cycle if pid in self._resource_graph.processes
1316
+ ]
1317
+ if not valid_processes:
1318
+ return False
1319
+
1320
+ youngest_process = min(
1321
+ valid_processes,
1322
+ key=lambda pid: self._resource_graph.processes[pid].age_seconds(),
1323
+ )
1324
+
1325
+ await self._terminate_process(
1326
+ youngest_process, f"deadlock_resolution_{detection_id}"
1327
+ )
1328
+ logger.info(
1329
+ f"Killed youngest process {youngest_process} to resolve deadlock {detection_id}"
1330
+ )
1331
+ return True
1332
+
1333
+ except Exception as e:
1334
+ logger.error(f"Failed to kill youngest process: {e}")
1335
+ return False
1336
+
1337
+ async def _kill_oldest_process(self, cycle: list[str], detection_id: str) -> bool:
1338
+ """Kill the oldest process in the cycle"""
1339
+ try:
1340
+ valid_processes = [
1341
+ pid for pid in cycle if pid in self._resource_graph.processes
1342
+ ]
1343
+ if not valid_processes:
1344
+ return False
1345
+
1346
+ oldest_process = max(
1347
+ valid_processes,
1348
+ key=lambda pid: self._resource_graph.processes[pid].age_seconds(),
1349
+ )
1350
+
1351
+ await self._terminate_process(
1352
+ oldest_process, f"deadlock_resolution_{detection_id}"
1353
+ )
1354
+ logger.info(
1355
+ f"Killed oldest process {oldest_process} to resolve deadlock {detection_id}"
1356
+ )
1357
+ return True
1358
+
1359
+ except Exception as e:
1360
+ logger.error(f"Failed to kill oldest process: {e}")
1361
+ return False
1362
+
1363
+ async def _kill_lowest_priority_process(
1364
+ self, cycle: list[str], detection_id: str
1365
+ ) -> bool:
1366
+ """Kill the lowest priority process in the cycle"""
1367
+ try:
1368
+ valid_processes = [
1369
+ pid for pid in cycle if pid in self._resource_graph.processes
1370
+ ]
1371
+ if not valid_processes:
1372
+ return False
1373
+
1374
+ lowest_priority_process = min(
1375
+ valid_processes,
1376
+ key=lambda pid: self._resource_graph.processes[pid].priority,
1377
+ )
1378
+
1379
+ await self._terminate_process(
1380
+ lowest_priority_process, f"deadlock_resolution_{detection_id}"
1381
+ )
1382
+ logger.info(
1383
+ f"Killed lowest priority process {lowest_priority_process} to resolve deadlock {detection_id}"
1384
+ )
1385
+ return True
1386
+
1387
+ except Exception as e:
1388
+ logger.error(f"Failed to kill lowest priority process: {e}")
1389
+ return False
1390
+
1391
+ async def _preempt_resources(self, cycle: list[str], detection_id: str) -> bool:
1392
+ """Preempt resources from processes in the cycle"""
1393
+ try:
1394
+ valid_processes = [
1395
+ pid for pid in cycle if pid in self._resource_graph.processes
1396
+ ]
1397
+ if not valid_processes:
1398
+ return False
1399
+
1400
+ # Find process with most resources to preempt from
1401
+ victim_process = max(
1402
+ valid_processes,
1403
+ key=lambda pid: len(self._resource_graph.processes[pid].holding),
1404
+ )
1405
+
1406
+ process = self._resource_graph.processes[victim_process]
1407
+ resources_to_preempt = list(process.holding)
1408
+
1409
+ # Release all resources held by victim process
1410
+ for resource_id in resources_to_preempt:
1411
+ await self._resource_graph.release_resource(victim_process, resource_id)
1412
+
1413
+ logger.info(
1414
+ f"Preempted {len(resources_to_preempt)} resources from process "
1415
+ f"{victim_process} to resolve deadlock {detection_id}"
1416
+ )
1417
+ self._resource_graph._metrics["preemptions"] += 1
1418
+ return True
1419
+
1420
+ except Exception as e:
1421
+ logger.error(f"Failed to preempt resources: {e}")
1422
+ return False
1423
+
1424
async def _terminate_process(self, process_id: str, reason: str) -> None:
    """Remove a process from both graphs, freeing what it held.

    For a process known to the resource graph, every held resource is
    released, the process is withdrawn from the waiter sets of the resources
    it was waiting for, and its entry is deleted.  The id is then removed
    from the dependency graph whether or not the resource graph knew it.
    Errors are logged, never raised.

    Args:
        process_id: Identifier of the process to terminate.
        reason: Free-form text included in the log message.
    """
    try:
        graph = self._resource_graph

        if process_id in graph.processes:
            doomed = graph.processes[process_id]

            # Free everything the process holds (iterate over a snapshot).
            for held_id in list(doomed.holding):
                await self._resource_graph.release_resource(process_id, held_id)

            # Withdraw it from every wait queue it sits in.
            for wanted_id in list(doomed.waiting_for):
                if wanted_id in self._resource_graph.resources:
                    wanted = self._resource_graph.resources[wanted_id]
                    wanted.waiters.discard(process_id)

            del self._resource_graph.processes[process_id]

        await self._dependency_graph.remove_node(process_id)

        logger.info(f"Terminated process {process_id}, reason: {reason}")

    except Exception as exc:
        logger.error(f"Failed to terminate process {process_id}: {exc}")
1451
+
1452
+ async def _health_monitoring_loop(self) -> None:
1453
+ """Health monitoring loop"""
1454
+ try:
1455
+ while not self._shutdown_event.is_set():
1456
+ try:
1457
+ await asyncio.wait_for(self._shutdown_event.wait(), timeout=30.0)
1458
+ if self._shutdown_event.is_set():
1459
+ break
1460
+ except asyncio.TimeoutError:
1461
+ pass
1462
+
1463
+ # Perform health checks
1464
+ await self._perform_health_checks()
1465
+
1466
+ except asyncio.CancelledError:
1467
+ logger.info("Health monitoring loop cancelled")
1468
+ except Exception as e:
1469
+ logger.error(f"Health monitoring error: {e}")
1470
+
1471
async def _perform_health_checks(self) -> None:
    """Evaluate detector health and refresh the uptime metric.

    The status becomes ``"degraded"`` when no detection cycle has succeeded
    for more than ten detection intervals.  A one-line health summary is
    logged on the first check and then at most once every five minutes.
    Errors are logged, never raised.
    """
    try:
        now = datetime.now(timezone.utc)

        # Check if detection is stuck (10x the normal interval).
        time_since_last_detection = (
            now - self._last_successful_detection
        ).total_seconds()
        if time_since_last_detection > self.detection_interval * 10:
            self._health_status = "degraded"
            logger.warning(
                f"No successful detection in {time_since_last_detection:.1f} seconds"
            )

        # Check graph health
        dep_health = await self._dependency_graph.health_check()
        resource_health = self._resource_graph.get_metrics()

        # Update uptime
        self._metrics["uptime_seconds"] = (now - self._start_time).total_seconds()

        # Log the summary every 5 minutes.  The time of the last summary is
        # remembered instead of testing ``int(time.time()) % 300 == 0``: that
        # test only passes when a check happens to run during one specific
        # second, so the summary was almost never written.
        tick = time.monotonic()
        last_logged = getattr(self, "_last_health_log", None)
        if last_logged is None or tick - last_logged >= 300.0:
            self._last_health_log = tick
            logger.info(
                f"Health check: status={self._health_status}, "
                f"dep_nodes={dep_health['node_count']}, "
                f"resources={resource_health['total_resources']}, "
                f"blocked_processes={resource_health['blocked_processes']}"
            )

    except Exception as e:
        logger.error(f"Health check failed: {e}")
1506
+
1507
+ async def _notify(self, event: str, data: dict[str, Any]) -> None:
1508
+ """Send notifications to registered callbacks"""
1509
+ for callback in self._notification_callbacks:
1510
+ try:
1511
+ if asyncio.iscoroutinefunction(callback):
1512
+ await callback(event, data)
1513
+ else:
1514
+ callback(event, data)
1515
+ except Exception as e:
1516
+ logger.error(f"Notification callback failed for event {event}: {e}")
1517
+
1518
+ def _update_detection_metrics(self, duration_ms: float) -> None:
1519
+ """Update detection performance metrics"""
1520
+ if self.enable_metrics:
1521
+ # Exponential moving average
1522
+ alpha = 0.1
1523
+ self._metrics["avg_detection_time_ms"] = (
1524
+ alpha * duration_ms
1525
+ + (1 - alpha) * self._metrics["avg_detection_time_ms"]
1526
+ )
1527
+
1528
+ # Public API methods
1529
+
1530
def add_resolution_callback(self, callback: Callable[[list[str]], bool]) -> None:
    """Register a custom resolver that is offered each detected cycle.

    Callbacks are kept in registration order; the same callback may be
    registered more than once.
    """
    resolvers = self._resolution_callbacks
    resolvers.append(callback)
1533
+
1534
def add_notification_callback(
    self, callback: Callable[[str, dict[str, Any]], None]
) -> None:
    """Register a listener that receives ``(event, data)`` notifications.

    Listeners are kept in registration order.
    """
    listeners = self._notification_callbacks
    listeners.append(callback)
1539
+
1540
async def add_dependency(
    self, from_state: str, to_state: str, metadata: Optional[dict[str, Any]] = None
) -> bool:
    """Record a dependency edge in the dependency graph.

    Args:
        from_state: Name of the depending state.
        to_state: Name of the state it depends on.
        metadata: Optional extra data forwarded to the graph.

    Returns:
        The result reported by the dependency graph.
    """
    graph = self._dependency_graph
    added = await graph.add_dependency(from_state, to_state, metadata)
    return added
1547
+
1548
async def remove_dependency(self, from_state: str, to_state: str) -> bool:
    """Delete a dependency edge from the dependency graph.

    Returns:
        The result reported by the dependency graph.
    """
    graph = self._dependency_graph
    removed = await graph.remove_dependency(from_state, to_state)
    return removed
1551
+
1552
async def acquire_resource(
    self,
    process_id: str,
    resource_id: str,
    process_name: Optional[str] = None,
    priority: int = 0,
    timeout: Optional[float] = None,
) -> bool:
    """Try to acquire a resource on behalf of a process.

    When ``process_name`` is given, the process is first registered with the
    resource graph.  A failed acquisition triggers an immediate deadlock
    check; errors raised by that check are logged rather than propagated.

    Args:
        process_id: Identifier of the acquiring process.
        resource_id: Identifier of the wanted resource.
        process_name: Optional name; enables registration of the process.
        priority: Priority used when registering the process.
        timeout: Timeout forwarded to registration and acquisition.

    Returns:
        The acquisition result reported by the resource graph.
    """
    graph = self._resource_graph

    if process_name:
        await graph.add_process(process_id, process_name, priority, timeout)

    acquired = await graph.acquire_resource(process_id, resource_id, timeout=timeout)
    if acquired:
        return acquired

    # The process could not get the resource: look for a deadlock right away.
    try:
        check = await self._resource_graph.detect_deadlock()
        if check.has_cycle:
            await self._handle_deadlock_detection(check, "immediate_check")
    except Exception as exc:
        logger.error(f"Error during immediate deadlock check: {exc}")

    return acquired
1580
+
1581
async def release_resource(self, process_id: str, resource_id: str) -> bool:
    """Release a resource on behalf of a process.

    Returns:
        The result reported by the resource graph.
    """
    graph = self._resource_graph
    released = await graph.release_resource(process_id, resource_id)
    return released
1584
+
1585
def get_comprehensive_status(self) -> dict[str, Any]:
    """Assemble a snapshot of the detector's state, config and activity.

    Returns:
        A dict covering, in order: basic status, graph sizes, configuration,
        per-graph metrics, a copy of the detector metrics, the number of
        detections and resolutions recorded in the last 300 seconds, and
        the time of (and since) the last successful detection.
    """

    def _seconds_since(moment: datetime) -> float:
        return (datetime.now(timezone.utc) - moment).total_seconds()

    task = self._detection_task
    status: dict[str, Any] = {}

    # Basic status
    status["active"] = bool(task and not task.done())
    status["health_status"] = self._health_status
    status["cycle_count"] = self._cycle_count
    status["last_cycle"] = self._last_cycle

    # Missing fields that tests expect
    status["graph_size"] = len(self._dependency_graph.nodes)
    status["resource_count"] = len(self._resource_graph.resources)
    status["process_count"] = len(self._resource_graph.processes)
    status["blocked_processes"] = len(self._resource_graph.get_blocked_processes())

    # Configuration
    status["detection_interval"] = self.detection_interval
    status["resolution_strategy"] = self.resolution_strategy.name
    status["max_resolution_attempts"] = self.max_resolution_attempts

    # Graph statistics
    status["dependency_graph"] = self._dependency_graph.get_metrics()
    status["resource_graph"] = self._resource_graph.get_metrics()

    # Performance metrics
    status["metrics"] = self._metrics.copy()

    # Recent activity (entries younger than five minutes)
    status["recent_detections"] = sum(
        1 for entry in self._detection_history
        if _seconds_since(entry["timestamp"]) < 300
    )
    status["recent_resolutions"] = sum(
        1 for entry in self._resolution_history
        if _seconds_since(entry["timestamp"]) < 300
    )

    # Health indicators
    status["last_successful_detection"] = self._last_successful_detection.isoformat()
    status["time_since_last_detection"] = _seconds_since(
        self._last_successful_detection
    )
    return status
1630
+
1631
async def force_detection(self) -> CycleDetectionResult:
    """Scan both graphs right now and return one merged result.

    The merged result lists the dependency-graph cycles followed by the
    resource-graph cycles, sums the two graph sizes and reports the longer
    of the two detection durations.  No resolution is attempted.

    Raises:
        Exception: Any error from either scan is logged and re-raised.
    """
    try:
        state_scan = self._dependency_graph.find_cycles()
        resource_scan = await self._resource_graph.detect_deadlock()

        merged_cycles = state_scan.cycles + resource_scan.cycles
        slowest_ms = max(
            state_scan.detection_duration_ms,
            resource_scan.detection_duration_ms,
        )

        return CycleDetectionResult(
            has_cycle=len(merged_cycles) > 0,
            cycles=merged_cycles,
            graph_size=state_scan.graph_size + resource_scan.graph_size,
            detection_duration_ms=slowest_ms,
            algorithm_used="combined",
        )

    except Exception as exc:
        logger.error(f"Force detection failed: {exc}")
        raise
1655
+
1656
async def export_state(self) -> dict[str, Any]:
    """Dump the detector's current state for debugging or analysis.

    Returns:
        A dict with an ISO timestamp, the comprehensive status, the
        dependency-graph metrics, a per-process and per-resource view of
        the resource graph, and the ten most recent detection and
        resolution history entries.
    """
    process_view = {}
    for pid, proc in self._resource_graph.processes.items():
        process_view[pid] = {
            "name": proc.process_name,
            "holding": list(proc.holding),
            "waiting_for": list(proc.waiting_for),
            "priority": proc.priority,
            "blocked_duration": proc.blocked_duration_seconds(),
        }

    resource_view = {}
    for rid, res in self._resource_graph.resources.items():
        resource_view[rid] = {
            "type": res.resource_type,
            "holders": list(res.holders),
            "waiters": list(res.waiters),
            "access_count": res.access_count,
        }

    snapshot: dict[str, Any] = {}
    snapshot["timestamp"] = datetime.now(timezone.utc).isoformat()
    snapshot["status"] = self.get_comprehensive_status()
    snapshot["dependency_graph"] = self._dependency_graph.get_metrics()
    snapshot["resource_graph"] = {
        "processes": process_view,
        "resources": resource_view,
    }
    # Only the tail of each history is exported (last 10 entries).
    snapshot["detection_history"] = list(self._detection_history)[-10:]
    snapshot["resolution_history"] = list(self._resolution_history)[-10:]
    return snapshot
1690
+
1691
def get_status(self) -> dict[str, Any]:
    """Backward-compatible alias for ``get_comprehensive_status``."""
    status = self.get_comprehensive_status()
    return status
1694
+
1695
def get_dependency_graph(self) -> dict[str, set[str]]:
    """Return a shallow copy of the dependency graph's node mapping.

    The returned dict is new, but its values are the graph's own objects.
    """
    snapshot: dict[str, set[str]] = {}
    snapshot.update(self._dependency_graph.nodes)
    return snapshot
1698
+
1699
def get_wait_graph(self) -> dict[str, dict[str, Any]]:
    """Describe every tracked process of the resource wait graph.

    Returns:
        A dict keyed by process id.  Each value lists the process name, the
        resources it holds and waits for, whether and for how long it is
        blocked, its age, its priority and its last activity (ISO format).
    """

    def _describe(proc) -> dict[str, Any]:
        return {
            "name": proc.process_name,
            "holding": list(proc.holding),
            "waiting_for": list(proc.waiting_for),
            "blocked": proc.is_blocked(),
            "blocked_duration_seconds": proc.blocked_duration_seconds(),
            "age_seconds": proc.age_seconds(),
            "priority": proc.priority,
            "last_activity": proc.last_activity.isoformat(),
        }

    return {
        pid: _describe(proc) for pid, proc in self._resource_graph.processes.items()
    }
1716
+
1717
def find_potential_deadlocks(self) -> list[tuple[str, str]]:
    """List process pairs that hold what the other one is waiting for.

    Every ordered pair of distinct processes is examined, so a mutually
    blocking pair appears twice: once as ``(a, b)`` and once as ``(b, a)``.
    """
    table = self._resource_graph.processes

    def _blocks(holder, waiter) -> bool:
        # True when ``holder`` owns at least one resource ``waiter`` wants.
        return bool(holder.holding & waiter.waiting_for)

    return [
        (first_id, second_id)
        for first_id, first in table.items()
        for second_id, second in table.items()
        if first_id != second_id
        and _blocks(first, second)
        and _blocks(second, first)
    ]
1735
+
1736
def get_metrics(self) -> dict[str, Any]:
    """Return the detector counters plus a few live gauges.

    The gauges are the detection-history length and the number of tracked
    processes, tracked resources and blocked processes; they override any
    same-named counter.
    """
    graph = self._resource_graph

    report: dict[str, Any] = dict(self._metrics)
    report["detection_history_length"] = len(self._detection_history)
    report["active_processes"] = len(graph.processes)
    report["active_resources"] = len(graph.resources)
    report["blocked_processes"] = len(graph.get_blocked_processes())
    return report
1745
+
1746
+ # Context manager support
1747
+ async def __aenter__(self) -> "DeadlockDetector":
1748
+ """Async context manager entry"""
1749
+ await self.start()
1750
+ return self
1751
+
1752
+ async def __aexit__(
1753
+ self,
1754
+ exc_type: Optional[type],
1755
+ exc_val: Optional[BaseException],
1756
+ exc_tb: Optional[object],
1757
+ ) -> None:
1758
+ """Async context manager exit"""
1759
+ await self.stop()