puffinflow 2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puffinflow/__init__.py +132 -0
- puffinflow/core/__init__.py +110 -0
- puffinflow/core/agent/__init__.py +320 -0
- puffinflow/core/agent/base.py +1635 -0
- puffinflow/core/agent/checkpoint.py +50 -0
- puffinflow/core/agent/context.py +521 -0
- puffinflow/core/agent/decorators/__init__.py +90 -0
- puffinflow/core/agent/decorators/builder.py +454 -0
- puffinflow/core/agent/decorators/flexible.py +714 -0
- puffinflow/core/agent/decorators/inspection.py +144 -0
- puffinflow/core/agent/dependencies.py +57 -0
- puffinflow/core/agent/scheduling/__init__.py +21 -0
- puffinflow/core/agent/scheduling/builder.py +160 -0
- puffinflow/core/agent/scheduling/exceptions.py +35 -0
- puffinflow/core/agent/scheduling/inputs.py +137 -0
- puffinflow/core/agent/scheduling/parser.py +209 -0
- puffinflow/core/agent/scheduling/scheduler.py +413 -0
- puffinflow/core/agent/state.py +141 -0
- puffinflow/core/config.py +62 -0
- puffinflow/core/coordination/__init__.py +137 -0
- puffinflow/core/coordination/agent_group.py +359 -0
- puffinflow/core/coordination/agent_pool.py +629 -0
- puffinflow/core/coordination/agent_team.py +577 -0
- puffinflow/core/coordination/coordinator.py +720 -0
- puffinflow/core/coordination/deadlock.py +1759 -0
- puffinflow/core/coordination/fluent_api.py +421 -0
- puffinflow/core/coordination/primitives.py +478 -0
- puffinflow/core/coordination/rate_limiter.py +520 -0
- puffinflow/core/observability/__init__.py +47 -0
- puffinflow/core/observability/agent.py +139 -0
- puffinflow/core/observability/alerting.py +73 -0
- puffinflow/core/observability/config.py +127 -0
- puffinflow/core/observability/context.py +88 -0
- puffinflow/core/observability/core.py +147 -0
- puffinflow/core/observability/decorators.py +105 -0
- puffinflow/core/observability/events.py +71 -0
- puffinflow/core/observability/interfaces.py +196 -0
- puffinflow/core/observability/metrics.py +137 -0
- puffinflow/core/observability/tracing.py +209 -0
- puffinflow/core/reliability/__init__.py +27 -0
- puffinflow/core/reliability/bulkhead.py +96 -0
- puffinflow/core/reliability/circuit_breaker.py +149 -0
- puffinflow/core/reliability/leak_detector.py +122 -0
- puffinflow/core/resources/__init__.py +77 -0
- puffinflow/core/resources/allocation.py +790 -0
- puffinflow/core/resources/pool.py +645 -0
- puffinflow/core/resources/quotas.py +567 -0
- puffinflow/core/resources/requirements.py +217 -0
- puffinflow/version.py +21 -0
- puffinflow-2.dev0.dist-info/METADATA +334 -0
- puffinflow-2.dev0.dist-info/RECORD +55 -0
- puffinflow-2.dev0.dist-info/WHEEL +5 -0
- puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
- puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
- puffinflow-2.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1759 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deadlock detection for workflow execution.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive deadlock detection capabilities including:
|
|
5
|
+
- Dependency graph cycle detection
|
|
6
|
+
- Resource wait-for graph analysis
|
|
7
|
+
- Configurable resolution strategies
|
|
8
|
+
- Performance monitoring and metrics
|
|
9
|
+
- Memory management and cleanup
|
|
10
|
+
- Thread-safe operations
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
import time
|
|
16
|
+
import uuid
|
|
17
|
+
import weakref
|
|
18
|
+
from collections import deque
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
from enum import Enum, auto
|
|
22
|
+
from typing import Any, Callable, Optional, Union
|
|
23
|
+
|
|
24
|
+
# Configure logging
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DeadlockResolutionStrategy(Enum):
    """Closed set of actions that can be selected for handling a deadlock.

    Members carry consecutive integer values starting at 1, in declaration
    order (the same values ``auto()`` would assign).
    """

    RAISE_EXCEPTION = 1
    KILL_YOUNGEST = 2
    KILL_OLDEST = 3
    KILL_LOWEST_PRIORITY = 4
    PREEMPT_RESOURCES = 5
    ROLLBACK_TRANSACTION = 6
    LOG_ONLY = 7
    CUSTOM_CALLBACK = 8
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DeadlockError(Exception):
    """Exception signalling that a deadlock cycle was found.

    Attributes:
        cycle: The node names making up the cycle, exactly as passed in.
        detection_id: The caller-supplied identifier, or a freshly generated
            UUID4 string when none (or an empty one) was given.
        timestamp: Timezone-aware UTC time at which the error was created.
    """

    def __init__(
        self,
        cycle: list[str],
        detection_id: Optional[str] = None,
        message: str = "Deadlock detected",
    ):
        """Build the error and its human-readable message.

        Args:
            cycle: Names of the nodes in the cycle; joined with ``" -> "``.
            detection_id: Optional identifier for this detection event.
            message: Prefix placed in front of the rendered cycle.
        """
        if not detection_id:
            detection_id = str(uuid.uuid4())

        self.cycle = cycle
        self.detection_id = detection_id
        self.timestamp = datetime.now(timezone.utc)

        rendered_cycle = " -> ".join(cycle)
        super().__init__(f"{message}: {rendered_cycle} (ID: {self.detection_id})")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class ResourceNode:
    """A resource vertex of the wait graph plus its bookkeeping data.

    ``holders`` and ``waiters`` are sets of process identifiers. Capacity is
    expressed through ``max_holders`` (1 by default, larger for
    semaphore-like resources).
    """

    resource_id: str
    resource_type: str
    holders: set[str] = field(default_factory=set)
    waiters: set[str] = field(default_factory=set)
    acquired_at: Optional[datetime] = None
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    last_accessed: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    access_count: int = 0
    max_holders: int = 1  # For semaphore-like resources
    priority: int = 0

    def is_free(self) -> bool:
        """Return True while fewer than ``max_holders`` processes hold the resource."""
        return self.max_holders > len(self.holders)

    def can_acquire(self, count: int = 1) -> bool:
        """Return True if ``count`` more holders would still fit within ``max_holders``."""
        projected_holders = len(self.holders) + count
        return projected_holders <= self.max_holders

    def age_seconds(self) -> float:
        """Return the number of seconds elapsed since ``created_at``."""
        elapsed = datetime.now(timezone.utc) - self.created_at
        return elapsed.total_seconds()

    def idle_time_seconds(self) -> float:
        """Return the number of seconds elapsed since ``last_accessed``."""
        elapsed = datetime.now(timezone.utc) - self.last_accessed
        return elapsed.total_seconds()

    def update_access(self) -> None:
        """Stamp ``last_accessed`` with the current UTC time."""
        self.last_accessed = datetime.now(timezone.utc)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class ProcessNode:
    """A process/state vertex of the wait graph plus its bookkeeping data.

    ``holding`` and ``waiting_for`` are sets of resource identifiers.
    ``timeout`` is a duration in seconds; ``None`` means no timeout.
    """

    process_id: str
    process_name: str
    holding: set[str] = field(default_factory=set)
    waiting_for: set[str] = field(default_factory=set)
    started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    blocked_at: Optional[datetime] = None
    last_activity: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    priority: int = 0
    timeout: Optional[float] = None  # Timeout in seconds
    metadata: dict[str, Any] = field(default_factory=dict)

    def is_blocked(self) -> bool:
        """Return True if the process is waiting for at least one resource."""
        return bool(self.waiting_for)

    def is_timed_out(self) -> bool:
        """Return True if the process has been blocked for longer than ``timeout``.

        Returns False when no timeout is configured (``timeout is None``) or
        the process is not currently blocked (``blocked_at is None``).
        """
        # Compare against None explicitly: a timeout of 0 seconds is a real
        # (immediately expiring) timeout, not "no timeout".
        if self.timeout is None or self.blocked_at is None:
            return False
        return self.blocked_duration_seconds() > self.timeout

    def age_seconds(self) -> float:
        """Return the number of seconds elapsed since ``started_at``."""
        return (datetime.now(timezone.utc) - self.started_at).total_seconds()

    def blocked_duration_seconds(self) -> float:
        """Return seconds elapsed since ``blocked_at``, or 0.0 when not blocked."""
        if self.blocked_at is None:
            return 0.0
        return (datetime.now(timezone.utc) - self.blocked_at).total_seconds()

    def idle_time_seconds(self) -> float:
        """Return the number of seconds elapsed since ``last_activity``."""
        return (datetime.now(timezone.utc) - self.last_activity).total_seconds()

    def update_activity(self) -> None:
        """Stamp ``last_activity`` with the current UTC time."""
        self.last_activity = datetime.now(timezone.utc)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class CycleDetectionResult:
    """Outcome of one cycle-detection run together with timing and size data.

    ``cycles`` holds each detected cycle as a list of node names.
    """

    has_cycle: bool
    cycles: list[list[str]] = field(default_factory=list)
    detection_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    graph_size: int = 0
    edge_count: int = 0
    detection_duration_ms: float = 0.0
    algorithm_used: str = "dfs"

    def get_shortest_cycle(self) -> Optional[list[str]]:
        """Return the cycle with the fewest entries, or None if there are no cycles.

        On ties the earliest cycle in ``cycles`` wins.
        """
        return min(self.cycles, key=len) if self.cycles else None

    def get_longest_cycle(self) -> Optional[list[str]]:
        """Return the cycle with the most entries, or None if there are no cycles.

        On ties the earliest cycle in ``cycles`` wins.
        """
        return max(self.cycles, key=len) if self.cycles else None

    def get_critical_cycle(self) -> Optional[list[str]]:
        """Return the cycle considered most critical, or None if there are no cycles.

        Currently this is simply the shortest cycle; no priority information
        is taken into account.
        """
        if self.cycles:
            return self.get_shortest_cycle()
        return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class NodeCleanupStrategy:
    """Policies that pick which nodes should be evicted first.

    Every policy takes the node mapping, a per-node metadata mapping and the
    number of nodes wanted, and returns up to ``count`` node names in eviction
    order. A non-positive ``count`` selects nothing.
    """

    @staticmethod
    def _select(
        nodes: dict[str, Any],
        metadata: dict[str, dict],
        count: int,
        field_name: str,
        missing: Any,
    ) -> list[str]:
        """Return the ``count`` nodes with the smallest ``field_name`` metadata value.

        Nodes without that metadata entry are ranked using ``missing``. The
        sort is stable, so ties keep the iteration order of ``nodes``.
        """
        # Guard explicitly: a negative slice bound would otherwise return all
        # but the last |count| nodes instead of nothing.
        if count <= 0:
            return []
        ranked = sorted(
            nodes,
            key=lambda name: metadata.get(name, {}).get(field_name, missing),
        )
        return ranked[:count]

    @staticmethod
    def lru_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Least Recently Used cleanup.

        Orders by the ``"last_access"`` metadata value, oldest first. Nodes
        with no recorded access are treated as the oldest.
        """
        return NodeCleanupStrategy._select(
            nodes,
            metadata,
            count,
            "last_access",
            datetime.min.replace(tzinfo=timezone.utc),
        )

    @staticmethod
    def age_based_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Age-based cleanup (oldest first).

        Orders by the ``"created_at"`` metadata value. Nodes with no recorded
        creation time are treated as the newest.
        """
        return NodeCleanupStrategy._select(
            nodes,
            metadata,
            count,
            "created_at",
            datetime.max.replace(tzinfo=timezone.utc),
        )

    @staticmethod
    def usage_based_cleanup(
        nodes: dict[str, Any], metadata: dict[str, dict], count: int
    ) -> list[str]:
        """Usage-based cleanup (least used first).

        Orders by the ``"access_count"`` metadata value. Nodes with no
        recorded count are treated as the most used.
        """
        return NodeCleanupStrategy._select(
            nodes, metadata, count, "access_count", float("inf")
        )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class DependencyGraph:
    """Directed dependency graph with cycle detection and topological ordering.

    ``nodes[a]`` holds every node that ``a`` depends on and
    ``reverse_edges[b]`` holds every node that depends on ``b``. The mutating
    coroutines serialise on an ``asyncio.Lock``; ``find_cycles`` and
    ``topological_sort`` are plain synchronous methods and do not take it.
    """

    def __init__(
        self,
        max_nodes: int = 10000,
        cleanup_threshold: float = 0.8,
        cache_ttl: float = 5.0,
        enable_metrics: bool = True,
        prevent_cycles: bool = False,
    ):
        """Create an empty graph.

        Args:
            max_nodes: Nominal capacity; cleanup starts once the number of
                keys in ``nodes`` reaches ``max_nodes * cleanup_threshold``.
            cleanup_threshold: Fraction of ``max_nodes`` that triggers cleanup.
            cache_ttl: Lifetime, in seconds, of cached cycle/topology results.
            enable_metrics: When true, the metric counters are maintained.
            prevent_cycles: When true, ``add_dependency`` refuses edges that
                would close a cycle.
        """
        self.nodes: dict[str, set[str]] = {}
        self.reverse_edges: dict[str, set[str]] = {}
        self.node_metadata: dict[str, Any] = {}  # Store metadata directly as provided

        # Configuration
        self.max_nodes = max_nodes
        self.cleanup_threshold = cleanup_threshold
        self.cache_ttl = cache_ttl
        self.enable_metrics = enable_metrics
        self.prevent_cycles = prevent_cycles  # Option to prevent cycle creation

        # Serialises the mutating coroutines
        self._lock = asyncio.Lock()
        self._operation_count = 0

        # Caching
        self._cycle_cache: dict[str, CycleDetectionResult] = {}
        self._topology_cache: Optional[tuple[list[str], str, float]] = None

        # Metrics
        self._metrics: dict[str, Union[int, float]] = {
            "operations": 0,
            "cache_hits": 0,
            "cache_misses": 0,
            "cleanups_performed": 0,
            "nodes_cleaned": 0,
            "avg_detection_time_ms": 0.0,
        }

    async def add_dependency(
        self, node: str, depends_on: str, metadata: Optional[dict[str, Any]] = None
    ) -> bool:
        """Add the edge ``node -> depends_on``, optionally attaching metadata to ``node``.

        Args:
            node: Name of the dependent node.
            depends_on: Name of the node it depends on.
            metadata: Optional metadata stored for ``node`` as provided.

        Returns:
            True if the edge was added; False if ``prevent_cycles`` is enabled
            and the edge would have closed a cycle. A rejected edge leaves
            edges and metadata untouched.

        Raises:
            ValueError: If either name is empty.
        """
        if not node or not depends_on:
            raise ValueError("Node and dependency names cannot be empty")

        # Self-loops (node == depends_on) are deliberately accepted for testing
        # compatibility; with prevent_cycles enabled they are rejected below.

        async with self._lock:
            self._operation_count += 1

            # Check capacity and cleanup if needed
            if len(self.nodes) >= self.max_nodes * self.cleanup_threshold:
                await self._cleanup_old_nodes_internal()

            # Reject before touching any state so a refused edge has no
            # side effects on metadata.
            if self.prevent_cycles and self._would_create_cycle_sync(node, depends_on):
                return False

            if metadata is not None:
                self.node_metadata[node] = metadata

            self.nodes.setdefault(node, set()).add(depends_on)
            self.reverse_edges.setdefault(depends_on, set()).add(node)

            self._invalidate_caches()

            if self.enable_metrics:
                self._metrics["operations"] += 1

            return True

    def _would_create_cycle_sync(self, from_node: str, to_node: str) -> bool:
        """Return True if adding ``from_node -> to_node`` would close a cycle.

        That is the case exactly when ``from_node`` is already reachable from
        ``to_node``. The search uses an explicit stack, so its depth is not
        bounded by the interpreter's recursion limit.
        """
        seen: set[str] = set()
        pending = [to_node]
        while pending:
            current = pending.pop()
            if current == from_node:
                return True
            if current in seen:
                continue
            seen.add(current)
            pending.extend(self.nodes.get(current, ()))
        return False

    async def remove_dependency(self, node: str, depends_on: str) -> bool:
        """Remove the edge ``node -> depends_on``; return True if it existed."""
        async with self._lock:
            return await self._remove_dependency_internal(node, depends_on)

    async def _remove_dependency_internal(self, node: str, depends_on: str) -> bool:
        """Remove an edge without acquiring the lock (the caller must hold it).

        A node whose last outgoing edge is removed is dropped from ``nodes``
        together with its metadata.
        """
        dependencies = self.nodes.get(node)
        if dependencies is None or depends_on not in dependencies:
            return False

        dependencies.discard(depends_on)
        if not dependencies:
            del self.nodes[node]
            self.node_metadata.pop(node, None)

        dependents = self.reverse_edges.get(depends_on)
        if dependents is not None:
            dependents.discard(node)
            if not dependents:
                del self.reverse_edges[depends_on]

        self._invalidate_caches()
        return True

    async def remove_node(self, node: str) -> bool:
        """Remove a node and all its edges; return True if anything was removed."""
        async with self._lock:
            return await self._remove_node_internal(node)

    async def _remove_node_internal(self, node: str) -> bool:
        """Remove a node without acquiring the lock (the caller must hold it).

        Drops the node's outgoing edges, incoming edges and metadata.
        Dependents left with no outgoing edges are dropped from ``nodes``
        along with their metadata.
        """
        removed = False

        # Remove outgoing edges
        if node in self.nodes:
            for dep in list(self.nodes[node]):
                if dep in self.reverse_edges:
                    self.reverse_edges[dep].discard(node)
                    if not self.reverse_edges[dep]:
                        del self.reverse_edges[dep]
            del self.nodes[node]
            removed = True

        # Remove incoming edges
        if node in self.reverse_edges:
            for dependent in list(self.reverse_edges[node]):
                if dependent in self.nodes:
                    self.nodes[dependent].discard(node)
                    if not self.nodes[dependent]:
                        del self.nodes[dependent]
                        self.node_metadata.pop(dependent, None)
            del self.reverse_edges[node]
            removed = True

        # Remove metadata
        if node in self.node_metadata:
            del self.node_metadata[node]
            removed = True

        if removed:
            self._invalidate_caches()

        return removed

    async def _cleanup_old_nodes_internal(self) -> int:
        """Evict nodes without acquiring the lock (the caller must hold it).

        Does nothing below the cleanup threshold. Otherwise removes the
        earliest-inserted keys of ``nodes`` until roughly 60% of ``max_nodes``
        remain, and returns the number of successful removals.
        """
        if len(self.nodes) < self.max_nodes * self.cleanup_threshold:
            return 0

        target_size = int(self.max_nodes * 0.6)  # Clean to 60% capacity
        nodes_to_remove_count = len(self.nodes) - target_size

        if nodes_to_remove_count <= 0:
            return 0

        # Simple cleanup: dicts keep insertion order, so the first keys are
        # the ones that were added earliest.
        nodes_to_remove = list(self.nodes)[:nodes_to_remove_count]

        cleaned_count = 0
        for node in nodes_to_remove:
            if await self._remove_node_internal(node):
                cleaned_count += 1

        if self.enable_metrics:
            self._metrics["cleanups_performed"] += 1
            self._metrics["nodes_cleaned"] += cleaned_count
            # Epoch seconds of the latest cleanup; surfaced by health_check().
            self._metrics["last_cleanup"] = time.time()

        logger.info("Cleaned up %d nodes from dependency graph", cleaned_count)
        return cleaned_count

    def find_cycles(self, use_cache: bool = True) -> CycleDetectionResult:
        """Detect cycles with a depth-first search over the dependency edges.

        Every edge that leads back to a node on the current search path yields
        one cycle, reported as the path segment from that node plus the node
        repeated at the end. Nodes already explored are not revisited, so not
        every elementary cycle is necessarily reported. The search uses an
        explicit stack, so deep chains do not hit the recursion limit.

        Args:
            use_cache: When true, a still-valid cached result for the current
                graph state is returned, and a new result is cached.

        Returns:
            A ``CycleDetectionResult`` describing the cycles found.
        """
        start_time = time.perf_counter()

        # Check cache first
        cache_key = self._get_graph_hash() if use_cache else None
        if use_cache:
            cached_result = self._cycle_cache.get(cache_key)
            if cached_result is not None and self._is_cache_valid(cached_result):
                if self.enable_metrics:
                    self._metrics["cache_hits"] += 1
                return cached_result

        if self.enable_metrics:
            self._metrics["cache_misses"] += 1

        cycles: list[list[str]] = []
        visited: set[str] = set()
        path: list[str] = []  # nodes on the current search path, in order
        path_index: dict[str, int] = {}  # node -> its position in ``path``

        # Start from every node to handle disconnected components
        for root in list(self.nodes):
            if root in visited:
                continue

            visited.add(root)
            path_index[root] = len(path)
            path.append(root)
            # One neighbour iterator per node on the path; resuming the
            # iterator continues where that node's exploration left off.
            pending = [iter(self.nodes.get(root, ()))]

            while pending:
                for neighbor in pending[-1]:
                    if neighbor in path_index:
                        # Back edge: the cycle is the path from neighbor onwards
                        cycles.append([*path[path_index[neighbor]:], neighbor])
                    elif neighbor not in visited:
                        visited.add(neighbor)
                        path_index[neighbor] = len(path)
                        path.append(neighbor)
                        pending.append(iter(self.nodes.get(neighbor, ())))
                        break
                else:
                    # Neighbours exhausted: backtrack
                    pending.pop()
                    del path_index[path.pop()]

        detection_duration = (time.perf_counter() - start_time) * 1000

        result = CycleDetectionResult(
            has_cycle=bool(cycles),
            cycles=cycles,
            graph_size=len(self.nodes),
            edge_count=sum(len(deps) for deps in self.nodes.values()),
            detection_duration_ms=detection_duration,
            algorithm_used="dfs",
        )

        # Cache the result
        if use_cache:
            self._cycle_cache[cache_key] = result

        # Update metrics: exponential moving average of the detection time
        if self.enable_metrics:
            alpha = 0.1
            self._metrics["avg_detection_time_ms"] = (
                alpha * detection_duration
                + (1 - alpha) * self._metrics["avg_detection_time_ms"]
            )

        return result

    def topological_sort(self) -> Optional[list[str]]:
        """Return a topological ordering of the graph, or None if it has a cycle.

        Uses Kahn's algorithm over the stored edges, so each node appears
        before the nodes it depends on. An empty graph yields ``[]``.
        """
        # Check cache first
        if self._topology_cache:
            cached_order, graph_hash, timestamp = self._topology_cache
            if (
                self._get_graph_hash() == graph_hash
                and time.time() - timestamp < self.cache_ttl
            ):
                return cached_order

        # If cycles exist, no topological sort is possible
        if self.find_cycles().has_cycle:
            return None

        # Get all unique nodes in the graph
        all_nodes = set(self.nodes)
        for deps in self.nodes.values():
            all_nodes.update(deps)

        if not all_nodes:
            return []

        # Kahn's algorithm
        in_degree = dict.fromkeys(all_nodes, 0)
        for deps in self.nodes.values():
            for dep in deps:
                in_degree[dep] += 1

        # Start with nodes that have no incoming edges
        queue = deque(name for name, degree in in_degree.items() if degree == 0)
        order: list[str] = []

        while queue:
            current = queue.popleft()
            order.append(current)

            # Remove edges from this node
            for neighbor in self.nodes.get(current, ()):
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    queue.append(neighbor)

        # If not all nodes were processed, there was a cycle
        if len(order) != len(all_nodes):
            return None

        # Cache the result
        self._topology_cache = (order, self._get_graph_hash(), time.time())
        return order

    def _get_graph_hash(self) -> str:
        """Return a cheap fingerprint of the graph state.

        Built from the node count, the edge count and the number of
        ``add_dependency`` calls; it is not a content hash.
        """
        edge_count = sum(len(deps) for deps in self.nodes.values())
        return f"{len(self.nodes)}:{edge_count}:{self._operation_count}"

    def _is_cache_valid(self, result: CycleDetectionResult) -> bool:
        """Return True if ``result`` is younger than ``cache_ttl`` seconds."""
        return (time.time() - result.detection_time.timestamp()) < self.cache_ttl

    def _invalidate_caches(self) -> None:
        """Drop all cached cycle and topology results."""
        self._cycle_cache.clear()
        self._topology_cache = None

    def get_metrics(self) -> dict[str, Any]:
        """Return the metric counters plus current graph and cache sizes."""
        return {
            **self._metrics,
            "node_count": len(self.nodes),
            "edge_count": sum(len(deps) for deps in self.nodes.values()),
            "cache_size": len(self._cycle_cache),
            "operation_count": self._operation_count,
        }

    async def health_check(self) -> dict[str, Any]:
        """Return a status snapshot of the graph.

        ``last_cleanup`` is the epoch time of the most recent cleanup recorded
        in the metrics, or None if none has been recorded.
        """
        async with self._lock:
            return {
                "status": "healthy",
                "node_count": len(self.nodes),
                "memory_usage_percent": len(self.nodes) / self.max_nodes * 100,
                "cache_hit_rate": (
                    (
                        self._metrics["cache_hits"]
                        / max(
                            1,
                            self._metrics["cache_hits"] + self._metrics["cache_misses"],
                        )
                    )
                    * 100
                    if self.enable_metrics
                    else 0
                ),
                "last_cleanup": self._metrics.get("last_cleanup"),
                "needs_cleanup": len(self.nodes)
                >= self.max_nodes * self.cleanup_threshold,
            }
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
class ResourceWaitGraph:
|
|
588
|
+
"""Enhanced wait-for graph for resource-based deadlock detection"""
|
|
589
|
+
|
|
590
|
+
def __init__(
|
|
591
|
+
self,
|
|
592
|
+
max_resources: int = 5000,
|
|
593
|
+
max_processes: int = 5000,
|
|
594
|
+
cleanup_interval: float = 300.0,
|
|
595
|
+
enable_timeouts: bool = True,
|
|
596
|
+
):
|
|
597
|
+
self.resources: dict[str, ResourceNode] = {}
|
|
598
|
+
self.processes: dict[str, ProcessNode] = {}
|
|
599
|
+
|
|
600
|
+
# Configuration
|
|
601
|
+
self.max_resources = max_resources
|
|
602
|
+
self.max_processes = max_processes
|
|
603
|
+
self.cleanup_interval = cleanup_interval
|
|
604
|
+
self.enable_timeouts = enable_timeouts
|
|
605
|
+
|
|
606
|
+
# Thread safety
|
|
607
|
+
self._lock = asyncio.Lock()
|
|
608
|
+
|
|
609
|
+
# Caching and optimization
|
|
610
|
+
self._wait_graph_cache: Optional[DependencyGraph] = None
|
|
611
|
+
self._cache_invalidated = True
|
|
612
|
+
self._last_cleanup = datetime.now(timezone.utc)
|
|
613
|
+
|
|
614
|
+
# Metrics
|
|
615
|
+
self._metrics: dict[str, int] = {
|
|
616
|
+
"resource_acquisitions": 0,
|
|
617
|
+
"resource_releases": 0,
|
|
618
|
+
"deadlock_detections": 0,
|
|
619
|
+
"timeouts": 0,
|
|
620
|
+
"preemptions": 0,
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
async def add_resource(
    self,
    resource_id: str,
    resource_type: str = "generic",
    max_holders: int = 1,
    priority: int = 0,
) -> bool:
    """Register a resource in the graph.

    Args:
        resource_id: Identifier of the resource; must be non-empty.
        resource_type: Free-form type label stored on the node.
        max_holders: Capacity stored on the node.
        priority: Priority stored on the node.

    Returns:
        True if a new resource node was created, False if the identifier
        was already registered.

    Raises:
        ValueError: If ``resource_id`` is empty.
    """
    if not resource_id:
        raise ValueError("Resource ID cannot be empty")

    async with self._lock:
        # At capacity: run cleanup first; the resource is still added afterwards.
        at_capacity = len(self.resources) >= self.max_resources
        if at_capacity:
            await self._cleanup_old_resources_internal()

        if resource_id in self.resources:
            return False

        new_node = ResourceNode(
            resource_id=resource_id,
            resource_type=resource_type,
            max_holders=max_holders,
            priority=priority,
        )
        self.resources[resource_id] = new_node
        self._cache_invalidated = True
        return True
|
|
648
|
+
|
|
649
|
+
async def add_process(
    self,
    process_id: str,
    process_name: str = "",
    priority: int = 0,
    timeout: Optional[float] = None,
) -> bool:
    """Register a process in the graph.

    Args:
        process_id: Identifier of the process; must be non-empty.
        process_name: Display name; falls back to ``process_id`` when empty.
        priority: Priority stored on the node.
        timeout: Timeout in seconds stored on the node, or None.

    Returns:
        True if a new process node was created, False if the identifier
        was already registered.

    Raises:
        ValueError: If ``process_id`` is empty.
    """
    if not process_id:
        raise ValueError("Process ID cannot be empty")

    async with self._lock:
        # At capacity: run cleanup first; the process is still added afterwards.
        at_capacity = len(self.processes) >= self.max_processes
        if at_capacity:
            await self._cleanup_old_processes_internal()

        if process_id in self.processes:
            return False

        display_name = process_name or process_id
        new_node = ProcessNode(
            process_id=process_id,
            process_name=display_name,
            priority=priority,
            timeout=timeout,
        )
        self.processes[process_id] = new_node
        self._cache_invalidated = True
        return True
|
|
674
|
+
|
|
675
|
+
async def acquire_resource(
    self,
    process_id: str,
    resource_id: str,
    count: int = 1,
    timeout: Optional[float] = None,
) -> bool:
    """Try to acquire ``resource_id`` on behalf of ``process_id``.

    Takes the graph lock and delegates to ``_acquire_resource_internal``
    with the same arguments.

    Returns:
        True if the resource was acquired, False if the process was
        registered as a waiter instead.
    """
    async with self._lock:
        acquired = await self._acquire_resource_internal(
            process_id, resource_id, count, timeout
        )
    return acquired
|
|
687
|
+
|
|
688
|
+
async def _acquire_resource_internal(
    self,
    process_id: str,
    resource_id: str,
    count: int = 1,
    timeout: Optional[float] = None,
) -> bool:
    """Acquire a resource without taking the lock (the caller must hold it).

    Unknown resources and processes are created on the fly: the resource
    as a ``"generic"`` one with default capacity, the process with
    ``timeout`` as its timeout. ``timeout`` is only used when the process
    node is created here.

    Args:
        process_id: Process requesting the resource.
        resource_id: Resource being requested.
        count: Number of holder slots that must be available; only used
            for the capacity check.
        timeout: Timeout in seconds for a newly created process node.

    Returns:
        True if the process now holds the resource, False if it was
        recorded as waiting.

    Raises:
        ValueError: If ``count`` is not positive.
    """
    if count <= 0:
        raise ValueError("Count must be positive")

    # Ensure resource and process exist
    resource = self.resources.get(resource_id)
    if resource is None:
        resource = ResourceNode(resource_id=resource_id, resource_type="generic")
        self.resources[resource_id] = resource
        self._cache_invalidated = True

    process = self.processes.get(process_id)
    if process is None:
        process = ProcessNode(
            process_id=process_id, process_name=process_id, timeout=timeout
        )
        self.processes[process_id] = process
        self._cache_invalidated = True

    # A process waits when capacity is insufficient or when it is already
    # registered as a waiter on this resource.
    must_wait = not resource.can_acquire(count) or process_id in resource.waiters
    if must_wait:
        resource.waiters.add(process_id)
        process.waiting_for.add(resource_id)
        if process.blocked_at is None:
            process.blocked_at = datetime.now(timezone.utc)

        self._cache_invalidated = True
        return False

    # Successful acquisition: holders are tracked by plain process ID
    resource.holders.add(process_id)
    resource.acquired_at = datetime.now(timezone.utc)
    resource.access_count += 1
    resource.update_access()

    process.holding.add(resource_id)
    process.waiting_for.discard(resource_id)
    process.update_activity()

    # Clear blocked status once nothing else is being waited for
    if not process.waiting_for:
        process.blocked_at = None

    self._cache_invalidated = True
    self._metrics["resource_acquisitions"] += 1
    return True
|
|
744
|
+
|
|
745
|
+
async def release_resource(
    self, process_id: str, resource_id: str, count: int = 1
) -> bool:
    """Release ``resource_id`` on behalf of ``process_id``.

    Args:
        process_id: Identifier of the releasing process.
        resource_id: Identifier of the resource being released.
        count: Must be positive; it is validated but not otherwise used here.

    Returns:
        ``False`` when the resource or the process is unknown to the graph,
        ``True`` otherwise (also when the process was not a holder).

    Raises:
        ValueError: If ``count`` is zero or negative.
    """
    if count <= 0:
        raise ValueError("Count must be positive")

    async with self._lock:
        known = resource_id in self.resources and process_id in self.processes
        if not known:
            return False

        node = self.resources[resource_id]
        owner = self.processes[process_id]

        # Drop the ownership link on both sides, then call each node's
        # update hook.
        node.holders.discard(process_id)
        owner.holding.discard(resource_id)
        owner.update_activity()
        node.update_access()

        # The lock is already held here, so use the internal waiter pass.
        await self._process_waiters_internal(resource_id)

        self._cache_invalidated = True
        self._metrics["resource_releases"] += 1
        return True
|
|
771
|
+
|
|
772
|
+
async def _process_waiters_internal(self, resource_id: str) -> None:
    """Hand a resource to its queued waiters (internal, takes no lock).

    Waiters are served highest ``priority`` first; ties go to the earliest
    ``blocked_at``, and waiters without a ``blocked_at`` sort last. Serving
    stops as soon as ``can_acquire(1)`` reports no capacity.

    Args:
        resource_id: Resource whose waiter set should be drained.
    """
    resource = self.resources.get(resource_id)
    if resource is None or not resource.waiters:
        return

    never_blocked = datetime.max.replace(tzinfo=timezone.utc)

    def service_order(pid):
        # Unknown ids fall back to a blank ProcessNode so they still sort.
        node = self.processes.get(pid, ProcessNode("", ""))
        return (-node.priority, node.blocked_at or never_blocked)

    # sorted() returns a fresh list, so removing from the set below is safe.
    for waiter_id in sorted(resource.waiters, key=service_order):
        if not resource.can_acquire(1):
            break
        # The id must leave the waiter set before it is handed the resource.
        resource.waiters.remove(waiter_id)
        await self._acquire_resource_internal(waiter_id, resource_id)
|
|
798
|
+
|
|
799
|
+
async def detect_deadlock(self) -> CycleDetectionResult:
    """Run one deadlock check over the process wait-for graph.

    While holding the graph lock this optionally expires timed-out
    processes, rebuilds the cached wait-for graph if it was invalidated (or
    never built), and then asks that graph for cycles.

    Returns:
        Whatever ``find_cycles()`` reports for the wait-for graph.
    """
    async with self._lock:
        if self.enable_timeouts:
            await self._handle_timeouts_internal()

        stale = self._cache_invalidated or self._wait_graph_cache is None
        if stale:
            graph = DependencyGraph()
            self._wait_graph_cache = graph

            # A waiter depends on every *other* process holding the resource.
            wait_edges = (
                (waiter, holder)
                for resource in self.resources.values()
                for waiter in resource.waiters
                for holder in resource.holders
                if waiter != holder
            )
            for waiter, holder in wait_edges:
                await graph.add_dependency(waiter, holder)

            self._cache_invalidated = False

        outcome = self._wait_graph_cache.find_cycles()
        self._metrics["deadlock_detections"] += 1
        return outcome
|
|
825
|
+
|
|
826
|
+
async def _handle_timeouts_internal(self) -> None:
    """Expire every process that reports itself as timed out (internal).

    Ids are collected before any process is expired, then each one is
    passed to ``_timeout_process_internal`` and counted in the ``timeouts``
    metric.
    """
    expired_ids = [
        node.process_id for node in self.processes.values() if node.is_timed_out()
    ]

    for expired_id in expired_ids:
        await self._timeout_process_internal(expired_id)
        self._metrics["timeouts"] += 1
|
|
837
|
+
|
|
838
|
+
async def _timeout_process_internal(self, process_id: str) -> None:
    """Cancel every pending wait of a timed-out process (internal).

    The process is removed from the waiter set of each resource it was
    waiting on, its own ``waiting_for`` set is emptied and ``blocked_at`` is
    reset. Resources it already holds are left untouched.

    Waiter sets are the input of the cached wait-for graph, so when any wait
    was cancelled the cache is marked stale; otherwise the next
    ``detect_deadlock`` could reuse a graph that still contains those waits.

    Args:
        process_id: Id of the process to expire; unknown ids are ignored.
    """
    process = self.processes.get(process_id)
    if process is None:
        return

    had_pending_waits = bool(process.waiting_for)

    # Iterate over a snapshot; the live set is cleared just below.
    for resource_id in list(process.waiting_for):
        resource = self.resources.get(resource_id)
        if resource is not None:
            resource.waiters.discard(process_id)

    process.waiting_for.clear()
    process.blocked_at = None

    if had_pending_waits:
        # The wait-for edges just changed; force a rebuild on next detection.
        self._cache_invalidated = True

    logger.warning(f"Process {process_id} timed out after waiting")
|
|
854
|
+
|
|
855
|
+
async def _cleanup_old_resources_internal(self) -> int:
    """Evict stale, unused resources (internal, takes no lock).

    A resource is eligible when it is free, has no waiters, and was last
    accessed more than ``cleanup_interval`` seconds ago. At most
    ``len(self.resources) // 4`` eligible entries are removed per call, so a
    graph tracking fewer than four resources is never trimmed.

    Returns:
        Number of resources removed.
    """
    now = datetime.now(timezone.utc)
    max_idle = timedelta(seconds=self.cleanup_interval)
    quota = len(self.resources) // 4  # cap each pass at 25% of the table

    stale_ids = []
    for rid, resource in self.resources.items():
        if not resource.is_free():
            continue
        if len(resource.waiters) != 0:
            continue
        if now - resource.last_accessed > max_idle:
            stale_ids.append(rid)

    evicted = stale_ids[:quota]
    for rid in evicted:
        del self.resources[rid]

    self._last_cleanup = now
    return len(evicted)
|
|
877
|
+
|
|
878
|
+
async def _cleanup_old_processes_internal(self) -> int:
    """Evict idle processes that hold and await nothing (internal).

    A process is eligible when ``holding`` and ``waiting_for`` are both
    empty and its ``last_activity`` is older than ``cleanup_interval``
    seconds. At most ``len(self.processes) // 4`` eligible entries are
    removed per call.

    Returns:
        Number of processes removed.
    """
    now = datetime.now(timezone.utc)
    max_idle = timedelta(seconds=self.cleanup_interval)
    quota = len(self.processes) // 4  # cap each pass at 25% of the table

    idle_ids = []
    for pid, process in self.processes.items():
        if len(process.holding) != 0 or len(process.waiting_for) != 0:
            continue
        if now - process.last_activity > max_idle:
            idle_ids.append(pid)

    evicted = idle_ids[:quota]
    for pid in evicted:
        del self.processes[pid]

    return len(evicted)
|
|
899
|
+
|
|
900
|
+
def get_blocked_processes(self) -> list[ProcessNode]:
    """Return the tracked processes whose ``is_blocked()`` is truthy."""
    blocked = []
    for node in self.processes.values():
        if node.is_blocked():
            blocked.append(node)
    return blocked
|
|
903
|
+
|
|
904
|
+
def get_resource_holders(self, resource_id: str) -> set[str]:
    """Return a copy of the holder set of ``resource_id``.

    An unknown resource yields an empty set. The result is a copy, so the
    caller may mutate it without affecting the graph.
    """
    node = self.resources.get(resource_id)
    if node is None:
        return set()
    return node.holders.copy()
|
|
909
|
+
|
|
910
|
+
def get_resource_waiters(self, resource_id: str) -> set[str]:
    """Return a copy of the waiter set of ``resource_id``.

    An unknown resource yields an empty set. The result is a copy, so the
    caller may mutate it without affecting the graph.
    """
    node = self.resources.get(resource_id)
    if node is None:
        return set()
    return node.waiters.copy()
|
|
915
|
+
|
|
916
|
+
def get_resource_stats(self) -> dict[str, Any]:
    """Summarise resource usage across the graph.

    Returns:
        A dict with the number of tracked, free and utilized resources, the
        summed holder and waiter counts, the utilized fraction
        (``average_utilization``; the divisor is clamped to 1 so an empty
        graph yields 0), and the number of blocked processes.
    """
    nodes = list(self.resources.values())
    total = len(nodes)

    free = 0
    holder_total = 0
    waiter_total = 0
    for node in nodes:
        if node.is_free():
            free += 1
        holder_total += len(node.holders)
        waiter_total += len(node.waiters)

    in_use = total - free
    return {
        "total_resources": total,
        "free_resources": free,
        "utilized_resources": in_use,
        "total_holders": holder_total,
        "total_waiters": waiter_total,
        "average_utilization": in_use / max(1, total),
        "blocked_processes": len(self.get_blocked_processes()),
    }
|
|
933
|
+
|
|
934
|
+
def get_metrics(self) -> dict[str, Any]:
    """Return the counters merged with the current resource statistics.

    Keys from ``get_resource_stats()`` override same-named counters, and
    ``total_processes`` is written last.
    """
    merged = dict(self._metrics)
    merged.update(self.get_resource_stats())
    merged["total_processes"] = len(self.processes)
    return merged
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
class DeadlockDetector:
|
|
944
|
+
"""Production-grade deadlock detection with comprehensive monitoring and resolution"""
|
|
945
|
+
|
|
946
|
+
def __init__(
    self,
    agent: Any,
    detection_interval: float = 1.0,
    max_cycles: int = 100,
    resolution_strategy: DeadlockResolutionStrategy = DeadlockResolutionStrategy.LOG_ONLY,
    enable_metrics: bool = True,
    enable_health_monitoring: bool = True,
    max_resolution_attempts: int = 3,
):
    """Configure the detector; background tasks are only created by ``start()``.

    Args:
        agent: Owning agent. A truthy agent is kept as a weak proxy; a falsy
            one is stored as ``None``.
        detection_interval: Seconds between detection cycles.
        max_cycles: Stored on the instance as given.
        resolution_strategy: Strategy applied when a deadlock is found.
        enable_metrics: Forwarded to the dependency graph and used to gate
            detection-time averaging.
        enable_health_monitoring: Whether ``start()`` also launches the
            health monitoring task.
        max_resolution_attempts: Upper bound on resolution retries per
            detected deadlock.
    """
    # Weak proxy, so the detector does not keep its agent alive.
    if agent:
        self.agent = weakref.proxy(agent)
    else:
        self.agent = None

    # Plain configuration.
    self.detection_interval = detection_interval
    self.max_cycles = max_cycles
    self.resolution_strategy = resolution_strategy
    self.enable_metrics = enable_metrics
    self.enable_health_monitoring = enable_health_monitoring
    self.max_resolution_attempts = max_resolution_attempts

    # The two graphs that are searched for cycles.
    self._dependency_graph = DependencyGraph(enable_metrics=enable_metrics)
    self._resource_graph = ResourceWaitGraph()

    # Task handles and synchronisation primitives.
    self._lock = asyncio.Lock()
    self._detection_task: Optional[asyncio.Task] = None
    self._health_task: Optional[asyncio.Task] = None
    self._shutdown_event = asyncio.Event()

    # Detection state and bounded histories.
    self._cycle_count = 0
    self._last_cycle: Optional[list[str]] = None
    self._detection_history: deque = deque(maxlen=1000)
    self._resolution_history: deque = deque(maxlen=100)

    # Counters reported by the detector.
    self._metrics: dict[str, Union[int, float]] = dict(
        total_detections=0,
        deadlocks_found=0,
        deadlocks_resolved=0,
        detection_errors=0,
        resolution_failures=0,
        avg_detection_time_ms=0.0,
        uptime_seconds=0.0,
        last_error="",  # type: ignore
    )

    # User-supplied hooks.
    self._resolution_callbacks: list[Callable[[list[str]], bool]] = []
    self._notification_callbacks: list[Callable[[str, dict[str, Any]], None]] = []

    # Health bookkeeping.
    self._health_status = "initializing"
    self._last_successful_detection = datetime.now(timezone.utc)
    self._start_time = datetime.now(timezone.utc)
|
|
1000
|
+
|
|
1001
|
+
async def start(self) -> bool:
    """Launch the detection task and, if enabled, the health monitor.

    Returns:
        ``True`` once the tasks are scheduled and listeners notified;
        ``False`` if a detection task is already running or start-up raised
        (the error text is then stored under ``last_error``).
    """
    try:
        async with self._lock:
            current = self._detection_task
            if current and not current.done():
                logger.warning("Deadlock detector already running")
                return False

            self._shutdown_event.clear()
            self._health_status = "starting"

            self._detection_task = asyncio.create_task(self._detection_loop())
            if self.enable_health_monitoring:
                self._health_task = asyncio.create_task(
                    self._health_monitoring_loop()
                )

            self._health_status = "running"
            self._start_time = datetime.now(timezone.utc)

            strategy_name = self.resolution_strategy.name
            logger.info(f"Deadlock detector started with strategy: {strategy_name}")
            await self._notify(
                "deadlock_detector_started", {"strategy": strategy_name}
            )
            return True

    except Exception as exc:
        self._health_status = "error"
        self._metrics["last_error"] = str(exc)  # type: ignore
        logger.error(f"Failed to start deadlock detector: {exc}")
        return False
|
|
1039
|
+
|
|
1040
|
+
async def stop(self, timeout: float = 10.0) -> bool:
    """Signal shutdown, cancel the background tasks and wait for them.

    Args:
        timeout: Seconds to wait for the cancelled tasks to finish.

    Returns:
        ``True`` when shutdown completed (also if the wait timed out, which
        is only logged); ``False`` if an unexpected error occurred.
    """
    try:
        async with self._lock:
            self._health_status = "stopping"
            self._shutdown_event.set()

            running = [
                task for task in (self._detection_task, self._health_task) if task
            ]
            for task in running:
                task.cancel()

            if running:
                try:
                    # return_exceptions keeps each task's CancelledError from
                    # propagating out of gather.
                    await asyncio.wait_for(
                        asyncio.gather(*running, return_exceptions=True),
                        timeout=timeout,
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        "Some tasks did not stop gracefully within timeout"
                    )

            self._detection_task = None
            self._health_task = None
            self._health_status = "stopped"

            logger.info("Deadlock detector stopped")
            await self._notify("deadlock_detector_stopped", {})
            return True

    except Exception as exc:
        self._health_status = "error"
        logger.error(f"Error stopping deadlock detector: {exc}")
        return False
|
|
1080
|
+
|
|
1081
|
+
async def _detection_loop(self) -> None:
    """Run detection cycles until shutdown is requested.

    Each iteration waits up to ``detection_interval`` seconds (waking early
    on shutdown) and then performs one detection cycle. A failing cycle is
    counted and followed by a capped backoff sleep; after five consecutive
    failures the loop sets the health status to ``"error"`` and exits.
    Cancellation is logged and swallowed.
    """
    failures = 0
    failure_limit = 5

    try:
        while not self._shutdown_event.is_set():
            try:
                await asyncio.wait_for(
                    self._shutdown_event.wait(), timeout=self.detection_interval
                )
            except asyncio.TimeoutError:
                pass  # interval elapsed without a shutdown request
            else:
                if self._shutdown_event.is_set():
                    break

            cycle_started = time.perf_counter()

            try:
                await self._perform_detection_cycle()

                elapsed_ms = (time.perf_counter() - cycle_started) * 1000
                self._update_detection_metrics(elapsed_ms)

                # A clean cycle resets the failure streak.
                failures = 0
                self._last_successful_detection = datetime.now(timezone.utc)

            except Exception as detection_error:
                failures += 1
                self._metrics["detection_errors"] += 1
                self._metrics["last_error"] = str(detection_error)  # type: ignore

                logger.error(f"Detection cycle error: {detection_error}")

                if failures >= failure_limit:
                    logger.critical(
                        f"Too many consecutive errors ({failures}), stopping detection"
                    )
                    self._health_status = "error"
                    break

                # NOTE(review): `*` and `%` bind equally, so the original
                # `0.5 * time.time() % 1` is `(0.5 * t) % 1` and the factor
                # spans [0.5, 1.5) -- confirm that range is intended.
                jitter = 0.5 + (0.5 * time.time()) % 1
                backoff = self.detection_interval * (2**failures) * jitter
                await asyncio.sleep(min(backoff, 60.0))

    except asyncio.CancelledError:
        logger.info("Detection loop cancelled")
    except Exception as exc:
        logger.critical(f"Unexpected error in detection loop: {exc}")
        self._health_status = "error"
        self._metrics["last_error"] = str(exc)  # type: ignore
|
|
1143
|
+
|
|
1144
|
+
async def _perform_detection_cycle(self) -> None:
    """Check both graphs once and record the outcome.

    The state dependency graph is checked first, then the resource wait
    graph; every result with a cycle is handed to
    ``_handle_deadlock_detection``. A summary row is appended to the
    detection history afterwards.
    """
    self._metrics["total_detections"] += 1

    state_outcome = self._dependency_graph.find_cycles()
    if state_outcome.has_cycle:
        await self._handle_deadlock_detection(state_outcome, "dependency_graph")

    resource_outcome = await self._resource_graph.detect_deadlock()
    if resource_outcome.has_cycle:
        await self._handle_deadlock_detection(resource_outcome, "resource_graph")

    summary = {"timestamp": datetime.now(timezone.utc)}
    summary["state_cycles"] = len(state_outcome.cycles)
    summary["resource_cycles"] = len(resource_outcome.cycles)
    summary["total_cycles"] = len(state_outcome.cycles) + len(
        resource_outcome.cycles
    )
    self._detection_history.append(summary)
|
|
1167
|
+
|
|
1168
|
+
async def _handle_deadlock_detection(
    self, result: CycleDetectionResult, source: str
) -> None:
    """Record a detected deadlock, notify listeners and try to resolve it.

    Resolution is attempted up to ``max_resolution_attempts`` times with a
    growing pause between attempts. Each attempt tries the registered
    resolution callbacks in order and falls back to the configured strategy
    if none reports success. The outcome is appended to the resolution
    history.

    Args:
        result: Detection result; its critical cycle becomes ``_last_cycle``.
        source: Label of the check that found the cycle.

    Raises:
        DeadlockError: If nothing resolved the deadlock and the strategy is
            ``RAISE_EXCEPTION``.
    """
    self._cycle_count += 1
    self._last_cycle = result.get_critical_cycle()
    self._metrics["deadlocks_found"] += 1

    detection_id = str(uuid.uuid4())

    logger.error(
        f"Deadlock detected from {source} (ID: {detection_id}): "
        f"cycle_count={self._cycle_count}, "
        f"cycle={self._last_cycle}, "
        f"total_cycles={len(result.cycles)}"
    )

    event_payload = {
        "detection_id": detection_id,
        "source": source,
        "cycle": self._last_cycle,
        "total_cycles": len(result.cycles),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    await self._notify("deadlock_detected", event_payload)

    attempts = 0
    resolved = False

    while not resolved and attempts < self.max_resolution_attempts:
        attempts += 1

        try:
            # Custom resolvers get the first chance.
            for resolver in self._resolution_callbacks:
                try:
                    if not self._last_cycle:
                        continue
                    if await self._run_callback_safely(resolver, self._last_cycle):
                        resolved = True
                        self._metrics["deadlocks_resolved"] += 1
                        logger.info(
                            f"Deadlock {detection_id} resolved by custom callback (attempt {attempts})"
                        )
                        break
                except Exception as e:
                    logger.error(f"Resolution callback failed: {e}")

            # Fall back to the configured strategy.
            if not resolved and self._last_cycle:
                resolved = await self._apply_resolution_strategy(
                    self._last_cycle, detection_id
                )

            if resolved:
                break

        except Exception as e:
            logger.error(f"Resolution attempt {attempts} failed: {e}")

        # Pause a little longer before each further attempt.
        if attempts < self.max_resolution_attempts:
            await asyncio.sleep(0.1 * attempts)

    outcome = {
        "detection_id": detection_id,
        "cycle": self._last_cycle,
        "resolved": resolved,
        "attempts": attempts,
        "strategy": self.resolution_strategy.name,
        "timestamp": datetime.now(timezone.utc),
    }
    self._resolution_history.append(outcome)

    if resolved:
        return

    self._metrics["resolution_failures"] += 1

    if self.resolution_strategy == DeadlockResolutionStrategy.RAISE_EXCEPTION:
        # A missing or empty cycle is reported as an empty list.
        raise DeadlockError(self._last_cycle or [], detection_id)
|
|
1257
|
+
|
|
1258
|
+
# Add the missing method alias for backward compatibility
|
|
1259
|
+
async def _handle_deadlock(
    self, result: CycleDetectionResult, source: str = "test"
) -> None:
    """Backward-compatible alias that forwards to ``_handle_deadlock_detection``.

    Args:
        result: Detection result to handle.
        source: Label for the origin of the detection; defaults to ``"test"``.
    """
    outcome = await self._handle_deadlock_detection(result, source)
    return outcome
|
|
1264
|
+
|
|
1265
|
+
async def _run_callback_safely(self, callback: Callable, cycle: list[str]) -> bool:
    """Invoke a resolution callback without blocking the event loop.

    Coroutine functions are awaited with a 5-second timeout. Any other
    callable runs in the default executor (no timeout is applied to that
    call). If such a callable returns a coroutine -- for example a lambda
    wrapping an ``async def`` -- the coroutine is awaited with the same
    timeout; previously the un-awaited coroutine object was returned and,
    being truthy, counted as a successful resolution.

    Args:
        callback: Sync or async callable that receives the cycle.
        cycle: Ids forming the detected cycle.

    Returns:
        The callback's result, or ``False`` if an awaited call timed out.
    """
    timeout = 5.0
    try:
        if asyncio.iscoroutinefunction(callback):
            return await asyncio.wait_for(callback(cycle), timeout=timeout)

        # get_running_loop() is the supported accessor inside a coroutine;
        # get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        outcome = await loop.run_in_executor(None, callback, cycle)
        if asyncio.iscoroutine(outcome):
            outcome = await asyncio.wait_for(outcome, timeout=timeout)
        return outcome
    except asyncio.TimeoutError:
        logger.warning("Resolution callback timed out")
        return False
|
|
1277
|
+
|
|
1278
|
+
async def _apply_resolution_strategy(
    self, cycle: list[str], detection_id: str
) -> bool:
    """Dispatch to the handler of the configured resolution strategy.

    Args:
        cycle: Ids forming the detected cycle.
        detection_id: Identifier of this detection, passed to the handlers.

    Returns:
        ``True`` for ``LOG_ONLY`` (logging alone counts as resolved), the
        handler's result for the kill/preempt strategies, and ``False`` for
        any other strategy or when a handler raises.
    """
    try:
        strategy = self.resolution_strategy

        if strategy == DeadlockResolutionStrategy.LOG_ONLY:
            return True  # nothing to do beyond the logging already done

        if strategy == DeadlockResolutionStrategy.KILL_YOUNGEST:
            return await self._kill_youngest_process(cycle, detection_id)

        if strategy == DeadlockResolutionStrategy.KILL_OLDEST:
            return await self._kill_oldest_process(cycle, detection_id)

        if strategy == DeadlockResolutionStrategy.KILL_LOWEST_PRIORITY:
            return await self._kill_lowest_priority_process(cycle, detection_id)

        if strategy == DeadlockResolutionStrategy.PREEMPT_RESOURCES:
            return await self._preempt_resources(cycle, detection_id)

        return False

    except Exception as exc:
        logger.error(
            f"Resolution strategy {self.resolution_strategy.name} failed: {exc}"
        )
        return False
|
|
1310
|
+
|
|
1311
|
+
async def _kill_youngest_process(self, cycle: list[str], detection_id: str) -> bool:
    """Terminate the cycle member with the smallest ``age_seconds()``.

    Args:
        cycle: Ids forming the detected cycle; ids unknown to the resource
            graph are ignored.
        detection_id: Identifier embedded in the termination reason.

    Returns:
        ``True`` after a victim was handed to ``_terminate_process``;
        ``False`` if no cycle member is a known process or an error occurred.
    """
    try:
        known = self._resource_graph.processes
        candidates = [pid for pid in cycle if pid in known]
        if not candidates:
            return False

        def age_of(pid):
            return known[pid].age_seconds()

        victim = min(candidates, key=age_of)

        await self._terminate_process(victim, f"deadlock_resolution_{detection_id}")
        logger.info(
            f"Killed youngest process {victim} to resolve deadlock {detection_id}"
        )
        return True

    except Exception as exc:
        logger.error(f"Failed to kill youngest process: {exc}")
        return False
|
|
1336
|
+
|
|
1337
|
+
async def _kill_oldest_process(self, cycle: list[str], detection_id: str) -> bool:
    """Terminate the cycle member with the largest ``age_seconds()``.

    Args:
        cycle: Ids forming the detected cycle; ids unknown to the resource
            graph are ignored.
        detection_id: Identifier embedded in the termination reason.

    Returns:
        ``True`` after a victim was handed to ``_terminate_process``;
        ``False`` if no cycle member is a known process or an error occurred.
    """
    try:
        known = self._resource_graph.processes
        candidates = [pid for pid in cycle if pid in known]
        if not candidates:
            return False

        def age_of(pid):
            return known[pid].age_seconds()

        victim = max(candidates, key=age_of)

        await self._terminate_process(victim, f"deadlock_resolution_{detection_id}")
        logger.info(
            f"Killed oldest process {victim} to resolve deadlock {detection_id}"
        )
        return True

    except Exception as exc:
        logger.error(f"Failed to kill oldest process: {exc}")
        return False
|
|
1362
|
+
|
|
1363
|
+
async def _kill_lowest_priority_process(
    self, cycle: list[str], detection_id: str
) -> bool:
    """Terminate the cycle member with the smallest ``priority`` value.

    Args:
        cycle: Ids forming the detected cycle; ids unknown to the resource
            graph are ignored.
        detection_id: Identifier embedded in the termination reason.

    Returns:
        ``True`` after a victim was handed to ``_terminate_process``;
        ``False`` if no cycle member is a known process or an error occurred.
    """
    try:
        known = self._resource_graph.processes
        candidates = [pid for pid in cycle if pid in known]
        if not candidates:
            return False

        def priority_of(pid):
            return known[pid].priority

        victim = min(candidates, key=priority_of)

        await self._terminate_process(victim, f"deadlock_resolution_{detection_id}")
        logger.info(
            f"Killed lowest priority process {victim} to resolve deadlock {detection_id}"
        )
        return True

    except Exception as exc:
        logger.error(f"Failed to kill lowest priority process: {exc}")
        return False
|
|
1390
|
+
|
|
1391
|
+
async def _preempt_resources(self, cycle: list[str], detection_id: str) -> bool:
    """Release everything held by the cycle member holding the most resources.

    Args:
        cycle: Ids forming the detected cycle; ids unknown to the resource
            graph are ignored.
        detection_id: Identifier used in the log message.

    Returns:
        ``True`` once the victim's resources were released and the
        ``preemptions`` counter bumped; ``False`` if no cycle member is a
        known process or an error occurred.
    """
    try:
        graph = self._resource_graph
        candidates = [pid for pid in cycle if pid in graph.processes]
        if not candidates:
            return False

        def held_count(pid):
            return len(graph.processes[pid].holding)

        victim = max(candidates, key=held_count)

        # Snapshot first: releasing mutates the victim's holding set.
        seized = list(graph.processes[victim].holding)
        for resource_id in seized:
            await graph.release_resource(victim, resource_id)

        logger.info(
            f"Preempted {len(seized)} resources from process "
            f"{victim} to resolve deadlock {detection_id}"
        )
        graph._metrics["preemptions"] += 1
        return True

    except Exception as exc:
        logger.error(f"Failed to preempt resources: {exc}")
        return False
|
|
1423
|
+
|
|
1424
|
+
async def _terminate_process(self, process_id: str, reason: str) -> None:
    """Remove a process from both graphs, freeing whatever it holds.

    For a process known to the resource graph, every held resource is
    released, the process is dropped from the waiter set of every resource
    it was waiting on, and its entry is deleted. The id is then removed from
    the state dependency graph whether or not the resource graph knew it.
    Failures are logged and not re-raised.

    The waiter sets and the process table are edited directly here, so the
    resource graph's wait-for cache is marked stale; otherwise a later
    ``detect_deadlock`` could reuse a graph that still contains the
    terminated process.

    Args:
        process_id: Id of the process to terminate.
        reason: Text included in the log message.
    """
    try:
        graph = self._resource_graph
        process = graph.processes.get(process_id)

        if process is not None:
            # Snapshot: release_resource mutates process.holding.
            for resource_id in list(process.holding):
                await graph.release_resource(process_id, resource_id)

            for resource_id in list(process.waiting_for):
                resource = graph.resources.get(resource_id)
                if resource is not None:
                    resource.waiters.discard(process_id)

            del graph.processes[process_id]

            # Wait-for edges changed behind the graph's own API.
            graph._cache_invalidated = True

        await self._dependency_graph.remove_node(process_id)

        logger.info(f"Terminated process {process_id}, reason: {reason}")

    except Exception as exc:
        logger.error(f"Failed to terminate process {process_id}: {exc}")
|
|
1451
|
+
|
|
1452
|
+
async def _health_monitoring_loop(self) -> None:
    """Run health checks every 30 seconds until shutdown is requested.

    The wait between checks ends early when the shutdown event is set, in
    which case the loop exits without another check. Cancellation and
    unexpected errors are logged and end the loop.
    """
    try:
        while not self._shutdown_event.is_set():
            try:
                await asyncio.wait_for(self._shutdown_event.wait(), timeout=30.0)
            except asyncio.TimeoutError:
                pass  # the 30 s elapsed without a shutdown request
            else:
                if self._shutdown_event.is_set():
                    break

            await self._perform_health_checks()

    except asyncio.CancelledError:
        logger.info("Health monitoring loop cancelled")
    except Exception as exc:
        logger.error(f"Health monitoring error: {exc}")
|
|
1470
|
+
|
|
1471
|
+
async def _perform_health_checks(self) -> None:
    """Check detector health, refresh uptime and log a periodic summary.

    The status becomes ``"degraded"`` when the last successful detection is
    older than ten detection intervals. ``uptime_seconds`` is refreshed on
    every call. A summary line is logged on the first call and then at most
    once per five minutes. Any failure is logged and not re-raised.

    The summary used to be gated on ``int(time.time()) % 300 == 0``, which
    only fires if a check happens to land on an exact epoch-second multiple
    of 300; the elapsed time since the last summary is tracked instead.
    """
    summary_interval = 300.0  # seconds between summary log lines

    try:
        now = datetime.now(timezone.utc)

        idle_seconds = (now - self._last_successful_detection).total_seconds()
        if idle_seconds > self.detection_interval * 10:
            self._health_status = "degraded"
            logger.warning(
                f"No successful detection in {idle_seconds:.1f} seconds"
            )

        dep_health = await self._dependency_graph.health_check()
        resource_health = self._resource_graph.get_metrics()

        self._metrics["uptime_seconds"] = (now - self._start_time).total_seconds()

        # Monotonic clock: unaffected by wall-clock adjustments. The
        # attribute is created lazily on the first summary.
        tick = time.monotonic()
        last_summary = getattr(self, "_last_health_log", None)
        if last_summary is None or tick - last_summary >= summary_interval:
            logger.info(
                f"Health check: status={self._health_status}, "
                f"dep_nodes={dep_health['node_count']}, "
                f"resources={resource_health['total_resources']}, "
                f"blocked_processes={resource_health['blocked_processes']}"
            )
            self._last_health_log = tick

    except Exception as exc:
        logger.error(f"Health check failed: {exc}")
|
|
1506
|
+
|
|
1507
|
+
async def _notify(self, event: str, data: dict[str, Any]) -> None:
    """Deliver an event to every registered notification callback.

    Coroutine functions are awaited; other callables are called directly. A
    failing callback is logged and does not stop delivery to the rest.

    Args:
        event: Event name.
        data: Event payload passed through unchanged.
    """
    for listener in self._notification_callbacks:
        try:
            if not asyncio.iscoroutinefunction(listener):
                listener(event, data)
            else:
                await listener(event, data)
        except Exception as exc:
            logger.error(f"Notification callback failed for event {event}: {exc}")
|
|
1517
|
+
|
|
1518
|
+
def _update_detection_metrics(self, duration_ms: float) -> None:
    """Fold one cycle duration into the running average detection time.

    The average is an exponential moving average with a weight of 0.1 on
    the new sample. Nothing happens when metrics are disabled.

    Args:
        duration_ms: Duration of the latest detection cycle in milliseconds.
    """
    if not self.enable_metrics:
        return

    alpha = 0.1
    previous = self._metrics["avg_detection_time_ms"]
    self._metrics["avg_detection_time_ms"] = (
        alpha * duration_ms + (1 - alpha) * previous
    )
|
|
1527
|
+
|
|
1528
|
+
# Public API methods
|
|
1529
|
+
|
|
1530
|
+
def add_resolution_callback(self, callback: Callable[[list[str]], bool]) -> None:
    """Register a custom deadlock resolver.

    Args:
        callback: Callable that receives the cycle and returns a truthy
            value when it resolved the deadlock. Resolvers are tried in
            registration order.
    """
    resolvers = self._resolution_callbacks
    resolvers.append(callback)
|
|
1533
|
+
|
|
1534
|
+
def add_notification_callback(
    self, callback: Callable[[str, dict[str, Any]], None]
) -> None:
    """Register a listener for detector events.

    Args:
        callback: Sync or async callable taking the event name and its
            payload dict. Listeners are notified in registration order.
    """
    listeners = self._notification_callbacks
    listeners.append(callback)
|
|
1539
|
+
|
|
1540
|
+
async def add_dependency(
|
|
1541
|
+
self, from_state: str, to_state: str, metadata: Optional[dict[str, Any]] = None
|
|
1542
|
+
) -> bool:
|
|
1543
|
+
"""Add a dependency between states"""
|
|
1544
|
+
return await self._dependency_graph.add_dependency(
|
|
1545
|
+
from_state, to_state, metadata
|
|
1546
|
+
)
|
|
1547
|
+
|
|
1548
|
+
async def remove_dependency(self, from_state: str, to_state: str) -> bool:
|
|
1549
|
+
"""Remove a dependency between states"""
|
|
1550
|
+
return await self._dependency_graph.remove_dependency(from_state, to_state)
|
|
1551
|
+
|
|
1552
|
+
async def acquire_resource(
|
|
1553
|
+
self,
|
|
1554
|
+
process_id: str,
|
|
1555
|
+
resource_id: str,
|
|
1556
|
+
process_name: Optional[str] = None,
|
|
1557
|
+
priority: int = 0,
|
|
1558
|
+
timeout: Optional[float] = None,
|
|
1559
|
+
) -> bool:
|
|
1560
|
+
"""Process attempts to acquire a resource"""
|
|
1561
|
+
if process_name:
|
|
1562
|
+
await self._resource_graph.add_process(
|
|
1563
|
+
process_id, process_name, priority, timeout
|
|
1564
|
+
)
|
|
1565
|
+
|
|
1566
|
+
success = await self._resource_graph.acquire_resource(
|
|
1567
|
+
process_id, resource_id, timeout=timeout
|
|
1568
|
+
)
|
|
1569
|
+
|
|
1570
|
+
# Immediate deadlock check after failed acquisition
|
|
1571
|
+
if not success:
|
|
1572
|
+
try:
|
|
1573
|
+
result = await self._resource_graph.detect_deadlock()
|
|
1574
|
+
if result.has_cycle:
|
|
1575
|
+
await self._handle_deadlock_detection(result, "immediate_check")
|
|
1576
|
+
except Exception as e:
|
|
1577
|
+
logger.error(f"Error during immediate deadlock check: {e}")
|
|
1578
|
+
|
|
1579
|
+
return success
|
|
1580
|
+
|
|
1581
|
+
async def release_resource(self, process_id: str, resource_id: str) -> bool:
|
|
1582
|
+
"""Process releases a resource"""
|
|
1583
|
+
return await self._resource_graph.release_resource(process_id, resource_id)
|
|
1584
|
+
|
|
1585
|
+
def get_comprehensive_status(self) -> dict[str, Any]:
|
|
1586
|
+
"""Get comprehensive detector status"""
|
|
1587
|
+
return {
|
|
1588
|
+
# Basic status
|
|
1589
|
+
"active": bool(self._detection_task and not self._detection_task.done()),
|
|
1590
|
+
"health_status": self._health_status,
|
|
1591
|
+
"cycle_count": self._cycle_count,
|
|
1592
|
+
"last_cycle": self._last_cycle,
|
|
1593
|
+
# Missing fields that tests expect
|
|
1594
|
+
"graph_size": len(self._dependency_graph.nodes),
|
|
1595
|
+
"resource_count": len(self._resource_graph.resources),
|
|
1596
|
+
"process_count": len(self._resource_graph.processes),
|
|
1597
|
+
"blocked_processes": len(self._resource_graph.get_blocked_processes()),
|
|
1598
|
+
# Configuration
|
|
1599
|
+
"detection_interval": self.detection_interval,
|
|
1600
|
+
"resolution_strategy": self.resolution_strategy.name,
|
|
1601
|
+
"max_resolution_attempts": self.max_resolution_attempts,
|
|
1602
|
+
# Graph statistics
|
|
1603
|
+
"dependency_graph": self._dependency_graph.get_metrics(),
|
|
1604
|
+
"resource_graph": self._resource_graph.get_metrics(),
|
|
1605
|
+
# Performance metrics
|
|
1606
|
+
"metrics": self._metrics.copy(),
|
|
1607
|
+
# Recent activity
|
|
1608
|
+
"recent_detections": len(
|
|
1609
|
+
[
|
|
1610
|
+
h
|
|
1611
|
+
for h in self._detection_history
|
|
1612
|
+
if (datetime.now(timezone.utc) - h["timestamp"]).total_seconds()
|
|
1613
|
+
< 300
|
|
1614
|
+
]
|
|
1615
|
+
),
|
|
1616
|
+
"recent_resolutions": len(
|
|
1617
|
+
[
|
|
1618
|
+
r
|
|
1619
|
+
for r in self._resolution_history
|
|
1620
|
+
if (datetime.now(timezone.utc) - r["timestamp"]).total_seconds()
|
|
1621
|
+
< 300
|
|
1622
|
+
]
|
|
1623
|
+
),
|
|
1624
|
+
# Health indicators
|
|
1625
|
+
"last_successful_detection": self._last_successful_detection.isoformat(),
|
|
1626
|
+
"time_since_last_detection": (
|
|
1627
|
+
datetime.now(timezone.utc) - self._last_successful_detection
|
|
1628
|
+
).total_seconds(),
|
|
1629
|
+
}
|
|
1630
|
+
|
|
1631
|
+
async def force_detection(self) -> CycleDetectionResult:
    """Run a detection pass over both graphs right now and merge the results.

    Returns:
        A ``CycleDetectionResult`` tagged ``algorithm_used="combined"`` whose
        cycles are the dependency-graph cycles followed by the resource-graph
        cycles, whose graph size is the sum of both sizes, and whose duration
        is the larger of the two durations.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        # Dependency graph is scanned synchronously, resource graph is awaited.
        dependency_scan = self._dependency_graph.find_cycles()
        resource_scan = await self._resource_graph.detect_deadlock()

        merged_cycles = dependency_scan.cycles + resource_scan.cycles
        combined_size = dependency_scan.graph_size + resource_scan.graph_size
        slowest_ms = max(
            dependency_scan.detection_duration_ms,
            resource_scan.detection_duration_ms,
        )

        return CycleDetectionResult(
            has_cycle=len(merged_cycles) > 0,
            cycles=merged_cycles,
            graph_size=combined_size,
            detection_duration_ms=slowest_ms,
            algorithm_used="combined",
        )
    except Exception as e:
        logger.error(f"Force detection failed: {e}")
        raise
|
|
1655
|
+
|
|
1656
|
+
async def export_state(self) -> dict[str, Any]:
|
|
1657
|
+
"""Export current state for debugging/analysis"""
|
|
1658
|
+
return {
|
|
1659
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
1660
|
+
"status": self.get_comprehensive_status(),
|
|
1661
|
+
"dependency_graph": self._dependency_graph.get_metrics(),
|
|
1662
|
+
"resource_graph": {
|
|
1663
|
+
"processes": {
|
|
1664
|
+
pid: {
|
|
1665
|
+
"name": proc.process_name,
|
|
1666
|
+
"holding": list(proc.holding),
|
|
1667
|
+
"waiting_for": list(proc.waiting_for),
|
|
1668
|
+
"priority": proc.priority,
|
|
1669
|
+
"blocked_duration": proc.blocked_duration_seconds(),
|
|
1670
|
+
}
|
|
1671
|
+
for pid, proc in self._resource_graph.processes.items()
|
|
1672
|
+
},
|
|
1673
|
+
"resources": {
|
|
1674
|
+
rid: {
|
|
1675
|
+
"type": res.resource_type,
|
|
1676
|
+
"holders": list(res.holders),
|
|
1677
|
+
"waiters": list(res.waiters),
|
|
1678
|
+
"access_count": res.access_count,
|
|
1679
|
+
}
|
|
1680
|
+
for rid, res in self._resource_graph.resources.items()
|
|
1681
|
+
},
|
|
1682
|
+
},
|
|
1683
|
+
"detection_history": list(self._detection_history)[
|
|
1684
|
+
-10:
|
|
1685
|
+
], # Last 10 detections
|
|
1686
|
+
"resolution_history": list(self._resolution_history)[
|
|
1687
|
+
-10:
|
|
1688
|
+
], # Last 10 resolutions
|
|
1689
|
+
}
|
|
1690
|
+
|
|
1691
|
+
def get_status(self) -> dict[str, Any]:
|
|
1692
|
+
"""Get comprehensive detector status (alias for backward compatibility)"""
|
|
1693
|
+
return self.get_comprehensive_status()
|
|
1694
|
+
|
|
1695
|
+
def get_dependency_graph(self) -> dict[str, set[str]]:
|
|
1696
|
+
"""Get current dependency graph"""
|
|
1697
|
+
return dict(self._dependency_graph.nodes)
|
|
1698
|
+
|
|
1699
|
+
def get_wait_graph(self) -> dict[str, dict[str, Any]]:
|
|
1700
|
+
"""Get current wait-for graph with enhanced information"""
|
|
1701
|
+
graph = {}
|
|
1702
|
+
|
|
1703
|
+
for process_id, process in self._resource_graph.processes.items():
|
|
1704
|
+
graph[process_id] = {
|
|
1705
|
+
"name": process.process_name,
|
|
1706
|
+
"holding": list(process.holding),
|
|
1707
|
+
"waiting_for": list(process.waiting_for),
|
|
1708
|
+
"blocked": process.is_blocked(),
|
|
1709
|
+
"blocked_duration_seconds": process.blocked_duration_seconds(),
|
|
1710
|
+
"age_seconds": process.age_seconds(),
|
|
1711
|
+
"priority": process.priority,
|
|
1712
|
+
"last_activity": process.last_activity.isoformat(),
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
return graph
|
|
1716
|
+
|
|
1717
|
+
def find_potential_deadlocks(self) -> list[tuple[str, str]]:
|
|
1718
|
+
"""Find potential deadlock situations before they occur"""
|
|
1719
|
+
potential = []
|
|
1720
|
+
|
|
1721
|
+
# Check for circular wait conditions
|
|
1722
|
+
for p1_id, p1 in self._resource_graph.processes.items():
|
|
1723
|
+
for p2_id, p2 in self._resource_graph.processes.items():
|
|
1724
|
+
if p1_id == p2_id:
|
|
1725
|
+
continue
|
|
1726
|
+
|
|
1727
|
+
# Check if P1 holds what P2 wants and vice versa
|
|
1728
|
+
p1_holds_p2_wants = bool(p1.holding & p2.waiting_for)
|
|
1729
|
+
p2_holds_p1_wants = bool(p2.holding & p1.waiting_for)
|
|
1730
|
+
|
|
1731
|
+
if p1_holds_p2_wants and p2_holds_p1_wants:
|
|
1732
|
+
potential.append((p1_id, p2_id))
|
|
1733
|
+
|
|
1734
|
+
return potential
|
|
1735
|
+
|
|
1736
|
+
def get_metrics(self) -> dict[str, Any]:
|
|
1737
|
+
"""Get comprehensive metrics"""
|
|
1738
|
+
return {
|
|
1739
|
+
**self._metrics,
|
|
1740
|
+
"detection_history_length": len(self._detection_history),
|
|
1741
|
+
"active_processes": len(self._resource_graph.processes),
|
|
1742
|
+
"active_resources": len(self._resource_graph.resources),
|
|
1743
|
+
"blocked_processes": len(self._resource_graph.get_blocked_processes()),
|
|
1744
|
+
}
|
|
1745
|
+
|
|
1746
|
+
# Context manager support
|
|
1747
|
+
async def __aenter__(self) -> "DeadlockDetector":
|
|
1748
|
+
"""Async context manager entry"""
|
|
1749
|
+
await self.start()
|
|
1750
|
+
return self
|
|
1751
|
+
|
|
1752
|
+
async def __aexit__(
|
|
1753
|
+
self,
|
|
1754
|
+
exc_type: Optional[type],
|
|
1755
|
+
exc_val: Optional[BaseException],
|
|
1756
|
+
exc_tb: Optional[object],
|
|
1757
|
+
) -> None:
|
|
1758
|
+
"""Async context manager exit"""
|
|
1759
|
+
await self.stop()
|