kailash 0.6.5__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +35 -4
- kailash/adapters/__init__.py +5 -0
- kailash/adapters/mcp_platform_adapter.py +273 -0
- kailash/channels/__init__.py +21 -0
- kailash/channels/api_channel.py +409 -0
- kailash/channels/base.py +271 -0
- kailash/channels/cli_channel.py +661 -0
- kailash/channels/event_router.py +496 -0
- kailash/channels/mcp_channel.py +648 -0
- kailash/channels/session.py +423 -0
- kailash/mcp_server/discovery.py +1 -1
- kailash/middleware/core/agent_ui.py +5 -0
- kailash/middleware/mcp/enhanced_server.py +22 -16
- kailash/nexus/__init__.py +21 -0
- kailash/nexus/factory.py +413 -0
- kailash/nexus/gateway.py +545 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/iterative_llm_agent.py +988 -17
- kailash/nodes/ai/llm_agent.py +29 -9
- kailash/nodes/api/__init__.py +2 -2
- kailash/nodes/api/monitoring.py +1 -1
- kailash/nodes/base_async.py +54 -14
- kailash/nodes/code/async_python.py +1 -1
- kailash/nodes/data/bulk_operations.py +939 -0
- kailash/nodes/data/query_builder.py +373 -0
- kailash/nodes/data/query_cache.py +512 -0
- kailash/nodes/monitoring/__init__.py +10 -0
- kailash/nodes/monitoring/deadlock_detector.py +964 -0
- kailash/nodes/monitoring/performance_anomaly.py +1078 -0
- kailash/nodes/monitoring/race_condition_detector.py +1151 -0
- kailash/nodes/monitoring/transaction_metrics.py +790 -0
- kailash/nodes/monitoring/transaction_monitor.py +931 -0
- kailash/nodes/system/__init__.py +17 -0
- kailash/nodes/system/command_parser.py +820 -0
- kailash/nodes/transaction/__init__.py +48 -0
- kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
- kailash/nodes/transaction/saga_coordinator.py +652 -0
- kailash/nodes/transaction/saga_state_storage.py +411 -0
- kailash/nodes/transaction/saga_step.py +467 -0
- kailash/nodes/transaction/transaction_context.py +756 -0
- kailash/nodes/transaction/two_phase_commit.py +978 -0
- kailash/nodes/transform/processors.py +17 -1
- kailash/nodes/validation/__init__.py +21 -0
- kailash/nodes/validation/test_executor.py +532 -0
- kailash/nodes/validation/validation_nodes.py +447 -0
- kailash/resources/factory.py +1 -1
- kailash/runtime/async_local.py +84 -21
- kailash/runtime/local.py +21 -2
- kailash/runtime/parameter_injector.py +187 -31
- kailash/security.py +16 -1
- kailash/servers/__init__.py +32 -0
- kailash/servers/durable_workflow_server.py +430 -0
- kailash/servers/enterprise_workflow_server.py +466 -0
- kailash/servers/gateway.py +183 -0
- kailash/servers/workflow_server.py +290 -0
- kailash/utils/data_validation.py +192 -0
- kailash/workflow/builder.py +291 -12
- kailash/workflow/validation.py +144 -8
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/METADATA +1 -1
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/RECORD +64 -26
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/WHEEL +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,964 @@
|
|
1
|
+
"""Deadlock detection and resolution node for database operations.
|
2
|
+
|
3
|
+
This module provides comprehensive deadlock detection capabilities with
|
4
|
+
graph-based analysis, automatic resolution strategies, and detailed reporting.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import logging
|
9
|
+
import time
|
10
|
+
from collections import defaultdict, deque
|
11
|
+
from dataclasses import dataclass, field
|
12
|
+
from datetime import UTC, datetime
|
13
|
+
from enum import Enum
|
14
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
15
|
+
|
16
|
+
from kailash.nodes.base import NodeParameter, register_node
|
17
|
+
from kailash.nodes.base_async import AsyncNode
|
18
|
+
from kailash.sdk_exceptions import NodeExecutionError
|
19
|
+
|
20
|
+
logger = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class DeadlockType(Enum):
    """Classification of how a deadlock was identified.

    The detector tags every ``DeadlockDetection`` with one of these so
    downstream consumers know which analysis produced it.
    """

    RESOURCE_LOCK = "resource_lock"  # direct lock-on-resource conflict
    WAIT_FOR_GRAPH = "wait_for_graph"  # cycle found in the wait-for graph
    TIMEOUT_INFERRED = "timeout_inferred"  # inferred from expired wait timeouts
    CIRCULAR_DEPENDENCY = "circular_dependency"  # generic circular dependency
|
30
|
+
|
31
|
+
|
32
|
+
class ResolutionStrategy(Enum):
    """Supported ways of breaking a detected deadlock.

    Used both as a recommendation on ``DeadlockDetection`` and as the
    ``resolution_strategy`` parameter of the resolve operation.
    """

    VICTIM_SELECTION = "victim_selection"  # abort a single chosen victim
    TIMEOUT_ROLLBACK = "timeout_rollback"  # roll back all timed-out transactions
    PRIORITY_BASED = "priority_based"  # victim chosen by lowest priority
    COST_BASED = "cost_based"  # victim chosen by lowest cost
    MANUAL = "manual"  # leave resolution to an operator
|
40
|
+
|
41
|
+
|
42
|
+
@dataclass
class ResourceLock:
    """A single tracked lock held (or requested) on a resource.

    One record exists per transaction/resource pair; the detector keys
    them as ``"<transaction_id>:<resource_id>"``.
    """

    resource_id: str
    lock_type: str  # shared, exclusive, update
    holder_transaction_id: str
    requested_at: float  # epoch seconds when the lock was requested
    granted_at: Optional[float] = None  # epoch seconds when granted, if granted
    timeout: Optional[float] = None  # optional lock timeout in seconds
    metadata: Dict[str, Any] = field(default_factory=dict)  # caller-supplied extras
|
53
|
+
|
54
|
+
|
55
|
+
@dataclass
class TransactionWait:
    """One edge of the wait-for graph: a blocked transaction.

    ``transaction_id`` is blocked until ``waiting_for_transaction_id``
    releases ``resource_id``. Priority and cost feed victim selection.
    """

    transaction_id: str
    waiting_for_transaction_id: str
    resource_id: str
    wait_start_time: float  # epoch seconds when the wait began
    timeout: Optional[float] = None  # optional wait timeout in seconds
    priority: int = 0  # lower priority => better abort candidate
    cost: float = 0.0  # lower cost => better abort candidate
    metadata: Dict[str, Any] = field(default_factory=dict)  # caller-supplied extras
|
67
|
+
|
68
|
+
|
69
|
+
@dataclass
class DeadlockDetection:
    """The result of one deadlock finding.

    Produced by the detection algorithms and later enriched with victim
    candidates and a recommended resolution strategy.
    """

    detection_id: str  # unique id, e.g. "cycle_deadlock_<ts>_<len>"
    deadlock_type: DeadlockType
    involved_transactions: List[str]
    involved_resources: List[str]
    detection_time: float  # epoch seconds at detection
    wait_chain: List[TransactionWait]  # the waits forming the deadlock
    victim_candidates: List[str] = field(default_factory=list)  # filled during analysis
    recommended_strategy: Optional[ResolutionStrategy] = None  # filled during analysis
    metadata: Dict[str, Any] = field(default_factory=dict)  # algorithm-specific details
|
82
|
+
|
83
|
+
|
84
|
+
@register_node()
|
85
|
+
class DeadlockDetectorNode(AsyncNode):
|
86
|
+
"""Node for detecting and resolving database deadlocks.
|
87
|
+
|
88
|
+
This node provides comprehensive deadlock detection including:
|
89
|
+
- Graph-based cycle detection in wait-for graphs
|
90
|
+
- Timeout-based deadlock inference
|
91
|
+
- Victim selection with multiple strategies
|
92
|
+
- Automatic deadlock resolution
|
93
|
+
- Detailed deadlock reporting and analysis
|
94
|
+
- Integration with database transaction monitoring
|
95
|
+
|
96
|
+
Design Purpose:
|
97
|
+
- Prevent and resolve database deadlocks in production
|
98
|
+
- Provide actionable insights for deadlock prevention
|
99
|
+
- Support multiple resolution strategies
|
100
|
+
- Enable proactive deadlock monitoring
|
101
|
+
|
102
|
+
Examples:
|
103
|
+
>>> # Register active transaction locks
|
104
|
+
>>> detector = DeadlockDetectorNode()
|
105
|
+
>>> result = await detector.execute(
|
106
|
+
... operation="register_lock",
|
107
|
+
... transaction_id="txn_123",
|
108
|
+
... resource_id="table_orders",
|
109
|
+
... lock_type="exclusive"
|
110
|
+
... )
|
111
|
+
|
112
|
+
>>> # Register transaction wait
|
113
|
+
>>> result = await detector.execute(
|
114
|
+
... operation="register_wait",
|
115
|
+
... transaction_id="txn_456",
|
116
|
+
... waiting_for_transaction_id="txn_123",
|
117
|
+
... resource_id="table_orders"
|
118
|
+
... )
|
119
|
+
|
120
|
+
>>> # Detect deadlocks
|
121
|
+
>>> result = await detector.execute(
|
122
|
+
... operation="detect_deadlocks",
|
123
|
+
... detection_algorithm="wait_for_graph"
|
124
|
+
... )
|
125
|
+
"""
|
126
|
+
|
127
|
+
def __init__(self, **kwargs):
    """Initialize the deadlock detector node and its bookkeeping state."""
    super().__init__(**kwargs)
    # Lock and wait records, keyed "<txn>:<resource>" and "<txn>:<txn>".
    self._active_locks: Dict[str, ResourceLock] = {}
    self._active_waits: Dict[str, TransactionWait] = {}
    # Adjacency sets: transaction -> transactions it is waiting on.
    self._wait_for_graph: Dict[str, Set[str]] = defaultdict(set)
    # Reverse lookups used for cleanup when locks are released.
    self._transaction_resources: Dict[str, Set[str]] = defaultdict(set)
    self._resource_holders: Dict[str, str] = {}
    # Detection results plus an audit trail of detection runs.
    self._detected_deadlocks: List[DeadlockDetection] = []
    self._detection_history: List[Dict[str, Any]] = []
    # Continuous-monitoring state (driven by start/stop_monitoring ops).
    self._monitoring_active = False
    self._background_tasks: Set[asyncio.Task] = set()
    self.logger.info(f"Initialized DeadlockDetectorNode: {self.id}")
|
140
|
+
|
141
|
+
def get_parameters(self) -> Dict[str, NodeParameter]:
    """Define the parameters this node accepts.

    Returns:
        Mapping of parameter name to its ``NodeParameter`` declaration.
    """
    # (name, type, extra NodeParameter keyword arguments) — kept as a flat
    # spec table so the declarations are easy to scan and extend.
    specs = [
        (
            "operation",
            str,
            {
                "required": True,
                "description": "Operation (register_lock, register_wait, detect_deadlocks, resolve_deadlock, get_status)",
            },
        ),
        (
            "transaction_id",
            str,
            {"required": False, "description": "Transaction identifier"},
        ),
        (
            "resource_id",
            str,
            {
                "required": False,
                "description": "Resource identifier (table, row, etc.)",
            },
        ),
        (
            "lock_type",
            str,
            {
                "required": False,
                "default": "exclusive",
                "description": "Type of lock (shared, exclusive, update)",
            },
        ),
        (
            "waiting_for_transaction_id",
            str,
            {
                "required": False,
                "description": "Transaction ID that this transaction is waiting for",
            },
        ),
        (
            "timeout",
            float,
            {
                "required": False,
                "description": "Timeout for lock or wait in seconds",
            },
        ),
        (
            "priority",
            int,
            {
                "required": False,
                "default": 0,
                "description": "Transaction priority for victim selection",
            },
        ),
        (
            "cost",
            float,
            {
                "required": False,
                "default": 0.0,
                "description": "Transaction cost for victim selection",
            },
        ),
        (
            "detection_algorithm",
            str,
            {
                "required": False,
                "default": "wait_for_graph",
                "description": "Detection algorithm (wait_for_graph, timeout_based, combined)",
            },
        ),
        (
            "resolution_strategy",
            str,
            {
                "required": False,
                "default": "victim_selection",
                "description": "Resolution strategy (victim_selection, timeout_rollback, priority_based, cost_based)",
            },
        ),
        (
            "deadlock_id",
            str,
            {
                "required": False,
                "description": "Deadlock detection ID for resolution",
            },
        ),
        (
            "victim_transaction_id",
            str,
            {
                "required": False,
                "description": "Transaction to abort as deadlock victim",
            },
        ),
        (
            "enable_monitoring",
            bool,
            {
                "required": False,
                "default": False,
                "description": "Enable continuous deadlock monitoring",
            },
        ),
        (
            "monitoring_interval",
            float,
            {
                "required": False,
                "default": 1.0,
                "description": "Monitoring interval in seconds",
            },
        ),
        (
            "metadata",
            dict,
            {
                "required": False,
                "default": {},
                "description": "Additional metadata for the operation",
            },
        ),
    ]
    return {
        name: NodeParameter(name=name, type=param_type, **extra)
        for name, param_type, extra in specs
    }
|
243
|
+
|
244
|
+
def get_output_schema(self) -> Dict[str, NodeParameter]:
    """Define the output schema for this node.

    Every operation returns the same snapshot-shaped payload, so the
    schema is a fixed table of (name, type, description) triples.
    """
    fields = [
        ("deadlocks_detected", list, "List of detected deadlocks"),
        ("deadlock_count", int, "Number of deadlocks detected"),
        ("active_locks", int, "Number of active locks"),
        ("active_waits", int, "Number of active waits"),
        ("resolution_actions", list, "Recommended or taken resolution actions"),
        ("wait_for_graph", dict, "Current wait-for graph structure"),
        ("monitoring_status", str, "Current monitoring status"),
        ("timestamp", str, "ISO timestamp of operation"),
        ("status", str, "Operation status"),
    ]
    return {
        name: NodeParameter(name=name, type=field_type, description=description)
        for name, field_type, description in fields
    }
|
285
|
+
|
286
|
+
async def async_run(self, **kwargs) -> Dict[str, Any]:
    """Execute one deadlock-detection operation.

    Dispatches on the ``operation`` keyword to the matching handler and
    wraps any failure in ``NodeExecutionError``.

    Raises:
        NodeExecutionError: If the operation is unknown or its handler fails.
    """
    operation = kwargs.get("operation")

    # Dispatch table; several external operation names are aliases for
    # the same handler (acquire/release_resource mirror lock handling).
    handlers = {
        "initialize": self._initialize,
        "register_lock": self._register_lock,
        "acquire_resource": self._register_lock,
        "request_resource": self._request_resource,
        "register_wait": self._register_wait,
        "release_lock": self._release_lock,
        "release_resource": self._release_lock,
        "detect_deadlocks": self._detect_deadlocks,
        "resolve_deadlock": self._resolve_deadlock,
        "get_status": self._get_status,
        "start_monitoring": self._start_monitoring,
        "stop_monitoring": self._stop_monitoring,
    }

    try:
        handler = handlers.get(operation)
        if handler is None:
            raise ValueError(f"Unknown operation: {operation}")
        return await handler(**kwargs)
    except Exception as e:
        self.logger.error(f"Deadlock detection operation failed: {str(e)}")
        raise NodeExecutionError(f"Failed to execute deadlock detection: {str(e)}")
|
321
|
+
|
322
|
+
async def _register_lock(self, **kwargs) -> Dict[str, Any]:
    """Record that a transaction now holds a lock on a resource.

    The lock is treated as granted immediately. Returns the standard
    detector-state snapshot (no detection is run here).

    Raises:
        ValueError: If ``transaction_id`` or ``resource_id`` is missing.
    """
    txn_id = kwargs.get("transaction_id")
    res_id = kwargs.get("resource_id")
    lock_kind = kwargs.get("lock_type", "exclusive")

    if not txn_id or not res_id:
        raise ValueError("transaction_id and resource_id are required")

    now = time.time()

    # Locks are keyed "<transaction>:<resource>", so a transaction keeps
    # at most one tracked lock per resource.
    self._active_locks[f"{txn_id}:{res_id}"] = ResourceLock(
        resource_id=res_id,
        lock_type=lock_kind,
        holder_transaction_id=txn_id,
        requested_at=now,
        granted_at=now,  # granted at registration time
        timeout=kwargs.get("timeout"),
        metadata=kwargs.get("metadata", {}),
    )
    self._transaction_resources[txn_id].add(res_id)
    self._resource_holders[res_id] = txn_id

    self.logger.debug(f"Registered lock: {txn_id} -> {res_id} ({lock_kind})")

    return {
        "deadlocks_detected": [],
        "deadlock_count": 0,
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {
            txn: list(waiters) for txn, waiters in self._wait_for_graph.items()
        },
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
367
|
+
|
368
|
+
async def _register_wait(self, **kwargs) -> Dict[str, Any]:
    """Record that one transaction is blocked waiting on another.

    Adds the edge to the wait-for graph and immediately runs cycle
    detection, since a new edge is the only event that can close a cycle.

    Raises:
        ValueError: If either transaction id is missing.
    """
    txn_id = kwargs.get("transaction_id")
    blocker_id = kwargs.get("waiting_for_transaction_id")

    if not txn_id or not blocker_id:
        raise ValueError(
            "transaction_id and waiting_for_transaction_id are required"
        )

    now = time.time()
    wait = TransactionWait(
        transaction_id=txn_id,
        waiting_for_transaction_id=blocker_id,
        resource_id=kwargs.get("resource_id") or "unknown",
        wait_start_time=now,
        timeout=kwargs.get("timeout"),
        priority=kwargs.get("priority", 0),
        cost=kwargs.get("cost", 0.0),
        metadata=kwargs.get("metadata", {}),
    )

    # Waits are keyed "<waiter>:<blocker>"; mirror the edge in the graph.
    self._active_waits[f"{txn_id}:{blocker_id}"] = wait
    self._wait_for_graph[txn_id].add(blocker_id)

    self.logger.debug(f"Registered wait: {txn_id} -> {blocker_id}")

    # Check for an immediate deadlock caused by this new edge.
    cycles = await self._detect_cycles_in_wait_graph()

    return {
        "deadlocks_detected": [self._serialize_deadlock(d) for d in cycles],
        "deadlock_count": len(cycles),
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {
            txn: list(waiters) for txn, waiters in self._wait_for_graph.items()
        },
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
420
|
+
|
421
|
+
async def _release_lock(self, **kwargs) -> Dict[str, Any]:
    """Release lock(s) held by a transaction and drop its waits.

    If ``resource_id`` is supplied only that one lock is released;
    otherwise every lock held by ``transaction_id`` goes. All waits in
    which the transaction participates (as waiter or as blocker) are
    removed and the wait-for graph is pruned of empty entries.

    Raises:
        ValueError: If ``transaction_id`` is missing.
    """
    transaction_id = kwargs.get("transaction_id")
    resource_id = kwargs.get("resource_id")

    if not transaction_id:
        raise ValueError("transaction_id is required")

    def _drop_lock(lock_id: str, res_id: str) -> None:
        # Remove one lock plus the reverse-lookup entries pointing at it.
        del self._active_locks[lock_id]
        self._transaction_resources[transaction_id].discard(res_id)
        if self._resource_holders.get(res_id) == transaction_id:
            del self._resource_holders[res_id]

    if resource_id:
        lock_id = f"{transaction_id}:{resource_id}"
        if lock_id in self._active_locks:
            _drop_lock(lock_id, resource_id)
    else:
        # Snapshot (id, resource) pairs first: _drop_lock mutates the dict.
        held = [
            (lock_id, lock.resource_id)
            for lock_id, lock in self._active_locks.items()
            if lock.holder_transaction_id == transaction_id
        ]
        for lock_id, res_id in held:
            _drop_lock(lock_id, res_id)

    # Remove every wait that involves this transaction on either side.
    stale_waits = [
        wait_id
        for wait_id, wait in self._active_waits.items()
        if transaction_id in (wait.transaction_id, wait.waiting_for_transaction_id)
    ]
    touched = {transaction_id}
    for wait_id in stale_waits:
        wait = self._active_waits.pop(wait_id)
        self._wait_for_graph[wait.transaction_id].discard(
            wait.waiting_for_transaction_id
        )
        touched.add(wait.transaction_id)

    # BUG FIX: prune *every* transaction whose outgoing-wait set became
    # empty — the old code only pruned ``transaction_id`` itself, so other
    # transactions' empty sets leaked in the defaultdict and surfaced as
    # empty lists in every serialized wait_for_graph. Also drop the
    # transaction's (now possibly empty) resource set for the same reason.
    for txn in touched:
        if txn in self._wait_for_graph and not self._wait_for_graph[txn]:
            del self._wait_for_graph[txn]
    if (
        transaction_id in self._transaction_resources
        and not self._transaction_resources[transaction_id]
    ):
        del self._transaction_resources[transaction_id]

    self.logger.debug(f"Released locks for transaction: {transaction_id}")

    return {
        "deadlocks_detected": [],
        "deadlock_count": 0,
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [f"Released locks for {transaction_id}"],
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
488
|
+
|
489
|
+
async def _detect_deadlocks(self, **kwargs) -> Dict[str, Any]:
    """Run deadlock detection using the requested algorithm.

    ``detection_algorithm`` may be ``wait_for_graph``, ``timeout_based``
    or ``combined`` (which runs both). Detected deadlocks are stored,
    logged to the history, and annotated with victim candidates plus a
    recommended resolution strategy.
    """
    algorithm = kwargs.get("detection_algorithm", "wait_for_graph")

    found: List[DeadlockDetection] = []
    if algorithm in ("wait_for_graph", "combined"):
        found += await self._detect_cycles_in_wait_graph()
    if algorithm in ("timeout_based", "combined"):
        found += await self._detect_timeout_deadlocks()

    # Persist results and append an audit-trail entry for this run.
    self._detected_deadlocks.extend(found)
    self._detection_history.append(
        {
            "timestamp": time.time(),
            "algorithm": algorithm,
            "deadlocks_found": len(found),
            "deadlock_ids": [d.detection_id for d in found],
        }
    )

    # Annotate each finding with resolution guidance.
    resolution_actions = []
    for finding in found:
        finding.victim_candidates = self._select_victim_candidates(finding)
        finding.recommended_strategy = self._recommend_resolution_strategy(finding)
        resolution_actions.append(
            {
                "deadlock_id": finding.detection_id,
                "recommended_strategy": finding.recommended_strategy.value,
                "victim_candidates": finding.victim_candidates,
            }
        )

    self.logger.info(
        f"Detected {len(found)} deadlocks using {algorithm} algorithm"
    )

    return {
        "deadlocks_detected": [self._serialize_deadlock(d) for d in found],
        "deadlock_count": len(found),
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": resolution_actions,
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
546
|
+
|
547
|
+
async def _detect_cycles_in_wait_graph(self) -> List[DeadlockDetection]:
    """Detect cycles in the wait-for graph using DFS.

    Returns one ``DeadlockDetection`` per distinct cycle found. Nodes
    already visited by an earlier DFS start are not re-explored.
    """
    deadlocks = []
    visited = set()
    rec_stack = set()

    def dfs_cycle_detection(node: str, path: List[str]) -> Optional[List[str]]:
        """DFS-based cycle detection; returns the cycle path if found."""
        if node in rec_stack:
            # Found cycle - extract it (node is guaranteed to be on path
            # because rec_stack mirrors path at all times, see below).
            cycle_start_idx = path.index(node)
            return path[cycle_start_idx:] + [node]

        if node in visited:
            return None

        visited.add(node)
        rec_stack.add(node)
        path.append(node)

        for neighbor in self._wait_for_graph.get(node, set()):
            cycle = dfs_cycle_detection(neighbor, path)
            if cycle:
                # BUG FIX: unwind bookkeeping on the cycle-return path too.
                # Previously nodes stayed in rec_stack (and path) after a
                # cycle was found; a later DFS start reaching such a stale
                # node crashed with ValueError in path.index() because the
                # node was in rec_stack but not on the fresh path
                # (e.g. graph A<->B plus C->B, starting from A then C).
                rec_stack.remove(node)
                path.pop()
                return cycle

        rec_stack.remove(node)
        path.pop()
        return None

    # Check each unvisited node as a DFS start.
    for transaction_id in list(self._wait_for_graph):
        if transaction_id not in visited:
            cycle = dfs_cycle_detection(transaction_id, [])
            if cycle:
                deadlocks.append(self._create_deadlock_from_cycle(cycle))

    return deadlocks
|
586
|
+
|
587
|
+
async def _detect_timeout_deadlocks(self) -> List[DeadlockDetection]:
    """Infer a deadlock from waits that have exceeded their timeout.

    All expired waits are grouped into a single TIMEOUT_INFERRED
    detection; an empty list is returned when nothing has timed out.
    """
    now = time.time()

    # Waits with a timeout set that has already elapsed.
    expired = [
        wait
        for wait in self._active_waits.values()
        if wait.timeout and (now - wait.wait_start_time) > wait.timeout
    ]
    if not expired:
        return []

    return [
        DeadlockDetection(
            detection_id=f"timeout_deadlock_{int(now)}",
            deadlock_type=DeadlockType.TIMEOUT_INFERRED,
            involved_transactions=list({w.transaction_id for w in expired}),
            involved_resources=list({w.resource_id for w in expired}),
            detection_time=now,
            wait_chain=expired,
            metadata={
                "timeout_count": len(expired),
                "max_wait_time": max(now - w.wait_start_time for w in expired),
            },
        )
    ]
|
622
|
+
|
623
|
+
def _create_deadlock_from_cycle(self, cycle: List[str]) -> DeadlockDetection:
    """Build a DeadlockDetection record from a wait-for-graph cycle.

    ``cycle`` is a node path whose last element repeats the first,
    e.g. ``["t1", "t2", "t1"]``.
    """
    now = time.time()

    # Walk consecutive edges of the cycle and collect the matching wait
    # records; edges without a registered wait are simply skipped.
    chain: List[TransactionWait] = []
    resources: Set[str] = set()
    for src, dst in zip(cycle, cycle[1:]):
        wait = self._active_waits.get(f"{src}:{dst}")
        if wait is not None:
            chain.append(wait)
            resources.add(wait.resource_id)

    return DeadlockDetection(
        detection_id=f"cycle_deadlock_{int(now)}_{len(cycle)}",
        deadlock_type=DeadlockType.WAIT_FOR_GRAPH,
        involved_transactions=cycle[:-1],  # last element duplicates the first
        involved_resources=list(resources),
        detection_time=now,
        wait_chain=chain,
        metadata={"cycle_length": len(cycle) - 1, "cycle_path": " -> ".join(cycle)},
    )
|
652
|
+
|
653
|
+
def _select_victim_candidates(self, deadlock: DeadlockDetection) -> List[str]:
    """Choose which transactions could be aborted to break ``deadlock``.

    Candidates are taken in order of preference: lowest priority, then
    lowest cost, then latest wait start, with the first involved
    transaction as a last resort. Duplicates are removed.
    """
    chain = deadlock.wait_chain
    candidates: List[str] = []

    # Priority-based selection (lower priority = better victim).
    if chain:
        lowest = min(w.priority for w in chain)
        candidates += [w.transaction_id for w in chain if w.priority == lowest]

    # NOTE(review): whenever the wait chain is non-empty, the priority
    # pass above always yields candidates, so the cost- and wait-time-
    # based passes below are unreachable — confirm whether they were
    # meant to apply only when priorities are undifferentiated.

    # Cost-based selection (lower cost = better victim).
    if chain and not candidates:
        cheapest = min(w.cost for w in chain)
        candidates += [w.transaction_id for w in chain if w.cost == cheapest]

    # Default: the transaction that has waited the least (latest start).
    if chain and not candidates:
        newest = max(w.wait_start_time for w in chain)
        candidates += [
            w.transaction_id for w in chain if w.wait_start_time == newest
        ]

    # Fallback: first transaction in the list.
    if not candidates and deadlock.involved_transactions:
        candidates.append(deadlock.involved_transactions[0])

    return list(set(candidates))  # Remove duplicates
|
684
|
+
|
685
|
+
def _recommend_resolution_strategy(
    self, deadlock: DeadlockDetection
) -> ResolutionStrategy:
    """Pick a resolution strategy based on the evidence in ``deadlock``.

    Timeout-inferred deadlocks get TIMEOUT_ROLLBACK; otherwise prefer
    priority, then cost, falling back to plain victim selection.
    """
    if deadlock.deadlock_type == DeadlockType.TIMEOUT_INFERRED:
        return ResolutionStrategy.TIMEOUT_ROLLBACK

    chain = deadlock.wait_chain
    if chain:
        # Non-default priorities take precedence over cost information.
        if any(w.priority != 0 for w in chain):
            return ResolutionStrategy.PRIORITY_BASED
        if any(w.cost != 0.0 for w in chain):
            return ResolutionStrategy.COST_BASED

    return ResolutionStrategy.VICTIM_SELECTION
|
704
|
+
|
705
|
+
async def _resolve_deadlock(self, **kwargs) -> Dict[str, Any]:
    """Resolve a detected deadlock.

    Keyword Args:
        deadlock_id: ID of a previously detected deadlock (required).
        victim_transaction_id: Explicit victim; when omitted, the first
            victim candidate (or first involved transaction) is chosen.
        resolution_strategy: One of "victim_selection", "priority_based",
            "cost_based", or "timeout_rollback" (default "victim_selection").

    Returns:
        A status snapshot dict whose "resolution_actions" lists the
        abort/rollback actions taken.

    Raises:
        ValueError: If deadlock_id is missing or does not match any
            recorded deadlock.
    """
    deadlock_id = kwargs.get("deadlock_id")
    victim_transaction_id = kwargs.get("victim_transaction_id")
    strategy = kwargs.get("resolution_strategy", "victim_selection")

    if not deadlock_id:
        raise ValueError("deadlock_id is required")

    # Find the deadlock
    deadlock = next(
        (d for d in self._detected_deadlocks if d.detection_id == deadlock_id), None
    )
    if not deadlock:
        raise ValueError(f"Deadlock {deadlock_id} not found")

    resolution_actions = []

    # Determine victim if not specified
    if not victim_transaction_id:
        if deadlock.victim_candidates:
            victim_transaction_id = deadlock.victim_candidates[0]
        else:
            victim_transaction_id = deadlock.involved_transactions[0]

    # Execute resolution strategy
    if strategy in ["victim_selection", "priority_based", "cost_based"]:
        # Abort victim transaction
        await self._release_lock(transaction_id=victim_transaction_id)
        resolution_actions.append(
            {
                "action": "abort_transaction",
                "transaction_id": victim_transaction_id,
                "reason": f"Deadlock victim selected using {strategy} strategy",
            }
        )

    elif strategy == "timeout_rollback":
        # Rollback all transactions involved in timeout deadlock
        for txn_id in deadlock.involved_transactions:
            await self._release_lock(transaction_id=txn_id)
            resolution_actions.append(
                {
                    "action": "timeout_rollback",
                    "transaction_id": txn_id,
                    "reason": "Timeout-based deadlock resolution",
                }
            )

    # Mark deadlock as resolved
    # NOTE(review): an unrecognized strategy string falls through both
    # branches above and still marks the deadlock resolved with no
    # actions taken — confirm this is intended.
    deadlock.metadata["resolved"] = True
    deadlock.metadata["resolution_time"] = time.time()
    deadlock.metadata["resolution_strategy"] = strategy
    deadlock.metadata["victim_transaction"] = victim_transaction_id

    self.logger.info(
        f"Resolved deadlock {deadlock_id} using {strategy} strategy, victim: {victim_transaction_id}"
    )

    # NOTE(review): the response reports an empty deadlock list / zero
    # count even though resolved deadlocks remain in
    # self._detected_deadlocks — verify callers expect this shape.
    return {
        "deadlocks_detected": [],
        "deadlock_count": 0,
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": resolution_actions,
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
775
|
+
|
776
|
+
async def _request_resource(self, **kwargs) -> Dict[str, Any]:
    """Request a resource - simplified version for E2E testing.

    Validates the identifiers and returns the current detector state.
    No lock bookkeeping is performed in this simplified implementation;
    the request is only reflected in the "monitoring_status" field.

    Keyword Args:
        transaction_id: Transaction requesting the resource (required).
        resource_id: Resource being requested (required).
        resource_type: Kind of resource (default "database_table").
        lock_type: Requested lock mode (default "SHARED").

    Returns:
        Status snapshot dict; "monitoring_status" encodes the request as
        "requested_<resource_type>_<lock_type>" in lowercase.

    Raises:
        ValueError: If transaction_id or resource_id is missing.
    """
    transaction_id = kwargs.get("transaction_id")
    resource_id = kwargs.get("resource_id")
    resource_type = kwargs.get("resource_type", "database_table")
    lock_type = kwargs.get("lock_type", "SHARED")

    if not transaction_id or not resource_id:
        raise ValueError("transaction_id and resource_id are required")

    # Return the current state; the request itself is not recorded.
    # (Removed an unused `current_time = time.time()` local.)
    return {
        "deadlocks_detected": [
            self._serialize_deadlock(d) for d in self._detected_deadlocks
        ],
        "deadlock_count": len(self._detected_deadlocks),
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": f"requested_{resource_type}_{lock_type}".lower(),
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
803
|
+
|
804
|
+
async def _initialize(self, **kwargs) -> Dict[str, Any]:
    """Reset detector state and apply any supplied configuration overrides."""
    # Start from a clean slate: no locks, waits, detections, or monitoring.
    self._active_locks.clear()
    self._active_waits.clear()
    self._detected_deadlocks.clear()
    self._monitoring_active = False

    # Copy recognized configuration keys onto the matching private attributes.
    for option in (
        "deadlock_timeout",
        "cycle_detection_enabled",
        "timeout_detection_enabled",
    ):
        if option in kwargs:
            setattr(self, f"_{option}", kwargs[option])

    snapshot = {
        "deadlocks_detected": [
            self._serialize_deadlock(d) for d in self._detected_deadlocks
        ],
        "deadlock_count": len(self._detected_deadlocks),
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "initialized",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
    return snapshot
|
833
|
+
|
834
|
+
async def _get_status(self, **kwargs) -> Dict[str, Any]:
    """Report the detector's current state as a serializable snapshot."""
    serialized = [self._serialize_deadlock(d) for d in self._detected_deadlocks]
    graph = {owner: list(waiters) for owner, waiters in self._wait_for_graph.items()}
    state = "monitoring" if self._monitoring_active else "idle"
    return {
        "deadlocks_detected": serialized,
        "deadlock_count": len(self._detected_deadlocks),
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": graph,
        "monitoring_status": state,
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
849
|
+
|
850
|
+
async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
    """Begin continuous background deadlock monitoring (idempotent)."""
    poll_interval = kwargs.get("monitoring_interval", 1.0)

    # Only spawn the loop when it is not already running.
    if not self._monitoring_active:
        self._monitoring_active = True
        task = asyncio.create_task(self._monitoring_loop(poll_interval))
        self._background_tasks.add(task)
        # Drop the reference once the task finishes so the set stays bounded.
        task.add_done_callback(self._background_tasks.discard)

    return {
        "deadlocks_detected": [],
        "deadlock_count": 0,
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "monitoring",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
871
|
+
|
872
|
+
async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
    """Halt background deadlock monitoring and tear down its tasks."""
    self._monitoring_active = False

    # Request cancellation of every task that has not already finished.
    pending = [task for task in self._background_tasks if not task.done()]
    for task in pending:
        task.cancel()

    # Drain all tasks; return_exceptions swallows CancelledError and
    # any failures raised while the tasks wind down.
    if self._background_tasks:
        await asyncio.gather(*self._background_tasks, return_exceptions=True)

    self._background_tasks.clear()

    return {
        "deadlocks_detected": [],
        "deadlock_count": 0,
        "active_locks": len(self._active_locks),
        "active_waits": len(self._active_waits),
        "resolution_actions": [],
        "wait_for_graph": {k: list(v) for k, v in self._wait_for_graph.items()},
        "monitoring_status": "stopped",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
|
898
|
+
|
899
|
+
async def _monitoring_loop(self, interval: float):
    """Periodically scan for deadlocks while monitoring is active.

    Sleeps `interval` seconds between scans; exits on cancellation and
    logs (but survives) any other exception from a scan.
    """
    while self._monitoring_active:
        try:
            await asyncio.sleep(interval)

            # Run both detection passes and combine their findings.
            found = await self._detect_cycles_in_wait_graph()
            found += await self._detect_timeout_deadlocks()

            if found:
                self.logger.warning(
                    f"Monitoring detected {len(found)} deadlocks"
                )
                # Record the new detections for later inspection/resolution.
                self._detected_deadlocks.extend(found)

            # TODO: Send alerts or take automatic resolution actions

        except asyncio.CancelledError:
            break
        except Exception as e:
            self.logger.error(f"Monitoring loop error: {e}")
|
925
|
+
|
926
|
+
def _serialize_deadlock(self, deadlock: DeadlockDetection) -> Dict[str, Any]:
    """Convert a DeadlockDetection into a JSON-friendly dictionary."""

    def _wait_entry(w) -> Dict[str, Any]:
        # Flatten one wait-chain edge into plain key/value pairs.
        return {
            "transaction_id": w.transaction_id,
            "waiting_for_transaction_id": w.waiting_for_transaction_id,
            "resource_id": w.resource_id,
            "wait_start_time": w.wait_start_time,
            "timeout": w.timeout,
            "priority": w.priority,
            "cost": w.cost,
        }

    strategy = deadlock.recommended_strategy
    return {
        "detection_id": deadlock.detection_id,
        "deadlock_type": deadlock.deadlock_type.value,
        "involved_transactions": deadlock.involved_transactions,
        "involved_resources": deadlock.involved_resources,
        "detection_time": deadlock.detection_time,
        "wait_chain": [_wait_entry(w) for w in deadlock.wait_chain],
        "victim_candidates": deadlock.victim_candidates,
        "recommended_strategy": strategy.value if strategy else None,
        "metadata": deadlock.metadata,
    }
|
954
|
+
|
955
|
+
def run(self, **kwargs) -> Dict[str, Any]:
    """Synchronous wrapper for compatibility.

    Delegates to async_run() via asyncio.run(). Must not be invoked
    from within an already-running event loop (asyncio.run would raise
    RuntimeError); call async_run() directly in async contexts.
    """
    # asyncio is already imported at module level (used by the
    # monitoring methods), so the former function-local import was
    # redundant and has been removed.
    return asyncio.run(self.async_run(**kwargs))
|
960
|
+
|
961
|
+
async def cleanup(self):
    """Cleanup resources when node is destroyed.

    Stops background monitoring, then chains to the parent class's
    cleanup() if one is defined.
    """
    await self._stop_monitoring()
    # Replaces the fragile one-liner
    # `await super().cleanup() if hasattr(super(), "cleanup") else None`
    # with an explicit, readable guard.
    parent_cleanup = getattr(super(), "cleanup", None)
    if parent_cleanup is not None:
        # Assumes the base-class cleanup is a coroutine — TODO confirm.
        await parent_cleanup()
|