kailash 0.6.5__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +35 -4
- kailash/adapters/__init__.py +5 -0
- kailash/adapters/mcp_platform_adapter.py +273 -0
- kailash/channels/__init__.py +21 -0
- kailash/channels/api_channel.py +409 -0
- kailash/channels/base.py +271 -0
- kailash/channels/cli_channel.py +661 -0
- kailash/channels/event_router.py +496 -0
- kailash/channels/mcp_channel.py +648 -0
- kailash/channels/session.py +423 -0
- kailash/mcp_server/discovery.py +1 -1
- kailash/middleware/core/agent_ui.py +5 -0
- kailash/middleware/mcp/enhanced_server.py +22 -16
- kailash/nexus/__init__.py +21 -0
- kailash/nexus/factory.py +413 -0
- kailash/nexus/gateway.py +545 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/iterative_llm_agent.py +988 -17
- kailash/nodes/ai/llm_agent.py +29 -9
- kailash/nodes/api/__init__.py +2 -2
- kailash/nodes/api/monitoring.py +1 -1
- kailash/nodes/base_async.py +54 -14
- kailash/nodes/code/async_python.py +1 -1
- kailash/nodes/data/bulk_operations.py +939 -0
- kailash/nodes/data/query_builder.py +373 -0
- kailash/nodes/data/query_cache.py +512 -0
- kailash/nodes/monitoring/__init__.py +10 -0
- kailash/nodes/monitoring/deadlock_detector.py +964 -0
- kailash/nodes/monitoring/performance_anomaly.py +1078 -0
- kailash/nodes/monitoring/race_condition_detector.py +1151 -0
- kailash/nodes/monitoring/transaction_metrics.py +790 -0
- kailash/nodes/monitoring/transaction_monitor.py +931 -0
- kailash/nodes/system/__init__.py +17 -0
- kailash/nodes/system/command_parser.py +820 -0
- kailash/nodes/transaction/__init__.py +48 -0
- kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
- kailash/nodes/transaction/saga_coordinator.py +652 -0
- kailash/nodes/transaction/saga_state_storage.py +411 -0
- kailash/nodes/transaction/saga_step.py +467 -0
- kailash/nodes/transaction/transaction_context.py +756 -0
- kailash/nodes/transaction/two_phase_commit.py +978 -0
- kailash/nodes/transform/processors.py +17 -1
- kailash/nodes/validation/__init__.py +21 -0
- kailash/nodes/validation/test_executor.py +532 -0
- kailash/nodes/validation/validation_nodes.py +447 -0
- kailash/resources/factory.py +1 -1
- kailash/runtime/async_local.py +84 -21
- kailash/runtime/local.py +21 -2
- kailash/runtime/parameter_injector.py +187 -31
- kailash/security.py +16 -1
- kailash/servers/__init__.py +32 -0
- kailash/servers/durable_workflow_server.py +430 -0
- kailash/servers/enterprise_workflow_server.py +466 -0
- kailash/servers/gateway.py +183 -0
- kailash/servers/workflow_server.py +290 -0
- kailash/utils/data_validation.py +192 -0
- kailash/workflow/builder.py +291 -12
- kailash/workflow/validation.py +144 -8
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/METADATA +1 -1
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/RECORD +64 -26
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/WHEEL +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.5.dist-info → kailash-0.7.0.dist-info}/top_level.txt +0 -0
kailash/nodes/monitoring/race_condition_detector.py (new file)
@@ -0,0 +1,1151 @@
+"""Race condition detection and analysis node for concurrent operations.
+
+This module provides comprehensive race condition detection capabilities with
+concurrent access pattern analysis, timing-based detection, and preventive suggestions.
+"""
+
+import asyncio
+import logging
+import os
+import threading
+import time
+import uuid
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from kailash.nodes.base import NodeParameter, register_node
+from kailash.nodes.base_async import AsyncNode
+from kailash.sdk_exceptions import NodeExecutionError
+
+logger = logging.getLogger(__name__)
+
+
+class RaceConditionType(Enum):
+    """Types of race conditions that can be detected."""
+
+    READ_WRITE_RACE = "read_write_race"
+    WRITE_WRITE_RACE = "write_write_race"
+    CHECK_THEN_ACT = "check_then_act"
+    LOST_UPDATE = "lost_update"
+    DIRTY_READ = "dirty_read"
+    PHANTOM_READ = "phantom_read"
+    TIMING_DEPENDENT = "timing_dependent"
+
+
+class AccessType(Enum):
+    """Types of resource access."""
+
+    READ = "read"
+    WRITE = "write"
+    READ_WRITE = "read_write"
+    DELETE = "delete"
+    CREATE = "create"
+
+
+class PreventionStrategy(Enum):
+    """Race condition prevention strategies."""
+
+    OPTIMISTIC_LOCKING = "optimistic_locking"
+    PESSIMISTIC_LOCKING = "pessimistic_locking"
+    ATOMIC_OPERATIONS = "atomic_operations"
+    SERIALIZATION = "serialization"
+    IMMUTABLE_DATA = "immutable_data"
+    MESSAGE_PASSING = "message_passing"
+    SYNCHRONIZATION = "synchronization"
+
+
+@dataclass
+class ResourceAccess:
+    """Represents a resource access event."""
+
+    access_id: str
+    resource_id: str
+    operation_id: str
+    thread_id: str
+    process_id: str
+    access_type: AccessType
+    start_time: float
+    end_time: Optional[float] = None
+    duration: Optional[float] = None
+    success: bool = True
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ConcurrentOperation:
+    """Represents a concurrent operation with multiple accesses."""
+
+    operation_id: str
+    start_time: float
+    thread_id: str
+    process_id: str
+    end_time: Optional[float] = None
+    accesses: List[ResourceAccess] = field(default_factory=list)
+    total_resources: int = 0
+    conflicting_operations: Set[str] = field(default_factory=set)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class RaceConditionDetection:
+    """Represents a detected race condition."""
+
+    detection_id: str
+    race_type: RaceConditionType
+    involved_operations: List[str]
+    involved_resources: List[str]
+    conflicting_accesses: List[ResourceAccess]
+    detection_time: float
+    confidence_score: float  # 0.0 to 1.0
+    severity: str  # low, medium, high, critical
+    potential_impact: str
+    recommended_prevention: List[PreventionStrategy] = field(default_factory=list)
+    timing_analysis: Dict[str, float] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@register_node()
+class RaceConditionDetectorNode(AsyncNode):
+    """Node for detecting race conditions in concurrent operations.
+
+    This node provides comprehensive race condition detection including:
+    - Concurrent access pattern analysis
+    - Timing-based race condition detection
+    - Read-write conflict identification
+    - Lost update detection
+    - Dirty read detection
+    - Check-then-act race detection
+    - Prevention strategy recommendations
+
+    Design Purpose:
+    - Detect potential race conditions in production systems
+    - Provide actionable insights for race prevention
+    - Support concurrent system troubleshooting
+    - Enable proactive race condition monitoring
+
+    Examples:
+        >>> # Register resource access
+        >>> detector = RaceConditionDetectorNode()
+        >>> result = await detector.execute(
+        ...     operation="register_access",
+        ...     resource_id="user_account_123",
+        ...     operation_id="op_456",
+        ...     thread_id="thread_1",
+        ...     access_type="read",
+        ...     metadata={"query": "SELECT balance FROM accounts"}
+        ... )
+
+        >>> # End resource access
+        >>> result = await detector.execute(
+        ...     operation="end_access",
+        ...     access_id="access_789",
+        ...     success=True
+        ... )
+
+        >>> # Detect race conditions
+        >>> result = await detector.execute(
+        ...     operation="detect_races",
+        ...     detection_window=5.0,
+        ...     min_confidence=0.7
+        ... )
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the race condition detector node."""
+        super().__init__(**kwargs)
+        self._active_accesses: Dict[str, ResourceAccess] = {}
+        self._completed_accesses: List[ResourceAccess] = []
+        self._active_operations: Dict[str, ConcurrentOperation] = {}
+        self._resource_access_history: Dict[str, deque] = defaultdict(
+            lambda: deque(maxlen=1000)
+        )
+        self._detected_races: List[RaceConditionDetection] = []
+        self._monitoring_active = False
+        self._background_tasks: Set[asyncio.Task] = set()
+        self._detection_thresholds = {
+            "min_confidence": 0.5,
+            "timing_threshold": 0.001,  # 1ms
+            "overlap_threshold": 0.5,
+        }
+        self.logger.info(f"Initialized RaceConditionDetectorNode: {self.id}")
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        """Define the parameters this node accepts."""
+        return {
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                required=True,
+                description="Operation (register_access, end_access, register_operation, end_operation, detect_races, get_status)",
+            ),
+            "access_id": NodeParameter(
+                name="access_id",
+                type=str,
+                required=False,
+                description="Unique access identifier",
+            ),
+            "resource_id": NodeParameter(
+                name="resource_id",
+                type=str,
+                required=False,
+                description="Resource being accessed (table, file, object, etc.)",
+            ),
+            "operation_id": NodeParameter(
+                name="operation_id",
+                type=str,
+                required=False,
+                description="Operation identifier grouping multiple accesses",
+            ),
+            "thread_id": NodeParameter(
+                name="thread_id",
+                type=str,
+                required=False,
+                description="Thread identifier",
+            ),
+            "process_id": NodeParameter(
+                name="process_id",
+                type=str,
+                required=False,
+                description="Process identifier",
+            ),
+            "access_type": NodeParameter(
+                name="access_type",
+                type=str,
+                required=False,
+                default="read",
+                description="Type of access (read, write, read_write, delete, create)",
+            ),
+            "success": NodeParameter(
+                name="success",
+                type=bool,
+                required=False,
+                default=True,
+                description="Whether the access was successful",
+            ),
+            "error": NodeParameter(
+                name="error",
+                type=str,
+                required=False,
+                description="Error message if access failed",
+            ),
+            "detection_window": NodeParameter(
+                name="detection_window",
+                type=float,
+                required=False,
+                default=5.0,
+                description="Time window for race detection in seconds",
+            ),
+            "min_confidence": NodeParameter(
+                name="min_confidence",
+                type=float,
+                required=False,
+                default=0.5,
+                description="Minimum confidence score for race detection (0.0-1.0)",
+            ),
+            "resource_filters": NodeParameter(
+                name="resource_filters",
+                type=list,
+                required=False,
+                default=[],
+                description="List of resource patterns to filter detection",
+            ),
+            "timing_threshold": NodeParameter(
+                name="timing_threshold",
+                type=float,
+                required=False,
+                default=0.001,
+                description="Timing threshold for race detection in seconds",
+            ),
+            "enable_monitoring": NodeParameter(
+                name="enable_monitoring",
+                type=bool,
+                required=False,
+                default=False,
+                description="Enable continuous race condition monitoring",
+            ),
+            "monitoring_interval": NodeParameter(
+                name="monitoring_interval",
+                type=float,
+                required=False,
+                default=1.0,
+                description="Monitoring interval in seconds",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata for the operation",
+            ),
+        }
+
+    def get_output_schema(self) -> Dict[str, NodeParameter]:
+        """Define the output schema for this node."""
+        return {
+            "races_detected": NodeParameter(
+                name="races_detected",
+                type=list,
+                description="List of detected race conditions",
+            ),
+            "race_count": NodeParameter(
+                name="race_count", type=int, description="Number of races detected"
+            ),
+            "active_accesses": NodeParameter(
+                name="active_accesses",
+                type=int,
+                description="Number of active accesses",
+            ),
+            "active_operations": NodeParameter(
+                name="active_operations",
+                type=int,
+                description="Number of active operations",
+            ),
+            "prevention_suggestions": NodeParameter(
+                name="prevention_suggestions",
+                type=list,
+                description="Recommended prevention strategies",
+            ),
+            "resource_conflicts": NodeParameter(
+                name="resource_conflicts",
+                type=dict,
+                description="Resource-level conflict analysis",
+            ),
+            "timing_analysis": NodeParameter(
+                name="timing_analysis",
+                type=dict,
+                description="Timing-based analysis results",
+            ),
+            "monitoring_status": NodeParameter(
+                name="monitoring_status",
+                type=str,
+                description="Current monitoring status",
+            ),
+            "timestamp": NodeParameter(
+                name="timestamp", type=str, description="ISO timestamp of operation"
+            ),
+            "status": NodeParameter(
+                name="status", type=str, description="Operation status"
+            ),
+        }
+
+    async def async_run(self, **kwargs) -> Dict[str, Any]:
+        """Execute race condition detection operation."""
+        operation = kwargs.get("operation")
+
+        try:
+            if operation == "register_access":
+                return await self._register_access(**kwargs)
+            elif operation == "end_access":
+                return await self._end_access(**kwargs)
+            elif operation == "register_operation":
+                return await self._register_operation(**kwargs)
+            elif operation == "end_operation":
+                return await self._end_operation(**kwargs)
+            elif operation == "detect_races":
+                return await self._detect_races(**kwargs)
+            elif operation == "get_status":
+                return await self._get_status(**kwargs)
+            elif operation == "start_monitoring":
+                return await self._start_monitoring(**kwargs)
+            elif operation == "stop_monitoring":
+                return await self._stop_monitoring(**kwargs)
+            elif operation == "report_operation":
+                return await self._report_operation(**kwargs)
+            elif operation == "complete_operation":
+                return await self._complete_operation(**kwargs)
+            else:
+                raise ValueError(f"Unknown operation: {operation}")
+
+        except Exception as e:
+            self.logger.error(f"Race condition detection operation failed: {str(e)}")
+            raise NodeExecutionError(f"Failed to execute race detection: {str(e)}")
+
+    async def _register_access(self, **kwargs) -> Dict[str, Any]:
+        """Register a new resource access."""
+        resource_id = kwargs.get("resource_id")
+        operation_id = kwargs.get("operation_id")
+        thread_id = kwargs.get("thread_id", "unknown")
+        process_id = kwargs.get("process_id", "unknown")
+        access_type = AccessType(kwargs.get("access_type", "read"))
+        metadata = kwargs.get("metadata", {})
+
+        if not resource_id:
+            raise ValueError("resource_id is required")
+
+        current_time = time.time()
+        access_id = kwargs.get("access_id") or f"access_{int(current_time * 1000000)}"
+
+        # Create access record
+        access = ResourceAccess(
+            access_id=access_id,
+            resource_id=resource_id,
+            operation_id=operation_id or "unknown",
+            thread_id=thread_id,
+            process_id=process_id,
+            access_type=access_type,
+            start_time=current_time,
+            metadata=metadata,
+        )
+
+        # Register access
+        self._active_accesses[access_id] = access
+        self._resource_access_history[resource_id].append(access)
+
+        # Update operation if specified
+        if operation_id and operation_id in self._active_operations:
+            operation = self._active_operations[operation_id]
+            operation.accesses.append(access)
+            operation.total_resources += 1
+
+        # Check for immediate race conditions
+        races = await self._analyze_concurrent_access(resource_id, access)
+
+        self.logger.debug(
+            f"Registered access {access_id} for resource {resource_id} ({access_type.value})"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _end_access(self, **kwargs) -> Dict[str, Any]:
+        """End a resource access."""
+        access_id = kwargs.get("access_id")
+        success = kwargs.get("success", True)
+        error = kwargs.get("error")
+
+        if not access_id:
+            raise ValueError("access_id is required")
+
+        if access_id not in self._active_accesses:
+            raise ValueError(f"Access {access_id} not found")
+
+        access = self._active_accesses.pop(access_id)
+
+        # Complete access
+        access.end_time = time.time()
+        access.duration = access.end_time - access.start_time
+        access.success = success
+        access.error = error
+
+        # Store completed access
+        self._completed_accesses.append(access)
+
+        # Clean old accesses (keep last hour)
+        cutoff_time = time.time() - 3600
+        self._completed_accesses = [
+            a for a in self._completed_accesses if a.start_time > cutoff_time
+        ]
+
+        self.logger.debug(
+            f"Ended access {access_id} with duration {access.duration:.3f}s, success: {success}"
+        )
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {"access_duration": access.duration},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _register_operation(self, **kwargs) -> Dict[str, Any]:
+        """Register a new concurrent operation."""
+        operation_id = kwargs.get("operation_id")
+        thread_id = kwargs.get("thread_id", "unknown")
+        process_id = kwargs.get("process_id", "unknown")
+        metadata = kwargs.get("metadata", {})
+
+        if not operation_id:
+            raise ValueError("operation_id is required")
+
+        current_time = time.time()
+
+        # Create operation record
+        operation = ConcurrentOperation(
+            operation_id=operation_id,
+            start_time=current_time,
+            thread_id=thread_id,
+            process_id=process_id,
+            metadata=metadata,
+        )
+
+        self._active_operations[operation_id] = operation
+
+        self.logger.debug(f"Registered operation {operation_id}")
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _end_operation(self, **kwargs) -> Dict[str, Any]:
+        """End a concurrent operation."""
+        operation_id = kwargs.get("operation_id")
+
+        if not operation_id:
+            raise ValueError("operation_id is required")
+
+        if operation_id not in self._active_operations:
+            raise ValueError(f"Operation {operation_id} not found")
+
+        operation = self._active_operations.pop(operation_id)
+        operation.end_time = time.time()
+
+        # Analyze operation for race conditions
+        races = await self._analyze_operation_races(operation)
+
+        self.logger.debug(
+            f"Ended operation {operation_id} with {len(operation.accesses)} accesses"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [
+                self._get_prevention_strategies(r) for r in races
+            ],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _detect_races(self, **kwargs) -> Dict[str, Any]:
+        """Detect race conditions in recent accesses."""
+        detection_window = kwargs.get("detection_window", 5.0)
+        min_confidence = kwargs.get("min_confidence", 0.5)
+        resource_filters = kwargs.get("resource_filters", [])
+        timing_threshold = kwargs.get("timing_threshold", 0.001)
+
+        current_time = time.time()
+        window_start = current_time - detection_window
+
+        # Analyze recent accesses for race conditions
+        races = []
+
+        # Group accesses by resource within time window
+        resource_accesses = defaultdict(list)
+        for access in self._completed_accesses:
+            if access.start_time >= window_start:
+                if not resource_filters or any(
+                    f in access.resource_id for f in resource_filters
+                ):
+                    resource_accesses[access.resource_id].append(access)
+
+        # Detect races for each resource
+        for resource_id, accesses in resource_accesses.items():
+            if len(accesses) > 1:
+                resource_races = await self._detect_resource_races(
+                    resource_id, accesses, timing_threshold, min_confidence
+                )
+                races.extend(resource_races)
+
+        # Store detected races
+        self._detected_races.extend(races)
+
+        # Generate prevention suggestions
+        prevention_suggestions = []
+        for race in races:
+            strategies = self._get_prevention_strategies(race)
+            prevention_suggestions.extend(strategies)
+
+        # Analyze resource conflicts
+        resource_conflicts = self._analyze_resource_conflicts(resource_accesses)
+
+        # Generate timing analysis
+        timing_analysis = self._generate_timing_analysis(resource_accesses)
+
+        self.logger.info(
+            f"Detected {len(races)} race conditions in {detection_window}s window"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": list(set(prevention_suggestions)),
+            "resource_conflicts": resource_conflicts,
+            "timing_analysis": timing_analysis,
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _analyze_concurrent_access(
+        self, resource_id: str, new_access: ResourceAccess
+    ) -> List[RaceConditionDetection]:
+        """Analyze for immediate race conditions with new access."""
+        races = []
+        current_time = time.time()
+
+        # Check concurrent accesses to the same resource
+        concurrent_accesses = [
+            access
+            for access in self._active_accesses.values()
+            if (
+                access.resource_id == resource_id
+                and access.access_id != new_access.access_id
+                and access.thread_id != new_access.thread_id
+            )
+        ]
+
+        for concurrent_access in concurrent_accesses:
+            # Check for write-write or read-write conflicts
+            if new_access.access_type in [
+                AccessType.WRITE,
+                AccessType.READ_WRITE,
+                AccessType.DELETE,
+            ] or concurrent_access.access_type in [
+                AccessType.WRITE,
+                AccessType.READ_WRITE,
+                AccessType.DELETE,
+            ]:
+                race = await self._create_race_detection(
+                    [new_access, concurrent_access], current_time
+                )
+                if race:
+                    races.append(race)
+
+        return races
+
+    async def _analyze_operation_races(
+        self, operation: ConcurrentOperation
+    ) -> List[RaceConditionDetection]:
+        """Analyze an operation for race conditions."""
+        races = []
+
+        # Check for check-then-act patterns
+        read_accesses = [
+            a for a in operation.accesses if a.access_type == AccessType.READ
+        ]
+        write_accesses = [
+            a for a in operation.accesses if a.access_type == AccessType.WRITE
+        ]
+
+        for read_access in read_accesses:
+            for write_access in write_accesses:
+                if (
+                    read_access.resource_id == write_access.resource_id
+                    and write_access.start_time > read_access.end_time
+                ):
+                    # Potential check-then-act race
+                    race = RaceConditionDetection(
+                        detection_id=f"race_{int(time.time() * 1000000)}",
+                        race_type=RaceConditionType.CHECK_THEN_ACT,
+                        involved_operations=[operation.operation_id],
+                        involved_resources=[read_access.resource_id],
+                        conflicting_accesses=[read_access, write_access],
+                        detection_time=time.time(),
+                        confidence_score=0.8,
+                        severity="medium",
+                        potential_impact="Data inconsistency from stale reads",
+                        recommended_prevention=[
+                            PreventionStrategy.ATOMIC_OPERATIONS,
+                            PreventionStrategy.OPTIMISTIC_LOCKING,
+                        ],
+                        timing_analysis={
+                            "gap_duration": write_access.start_time
+                            - read_access.end_time
+                        },
+                    )
+                    races.append(race)
+
+        return races
+
+    async def _detect_resource_races(
+        self,
+        resource_id: str,
+        accesses: List[ResourceAccess],
+        timing_threshold: float,
+        min_confidence: float,
+    ) -> List[RaceConditionDetection]:
+        """Detect race conditions for a specific resource."""
+        races = []
+
+        # Sort accesses by start time
+        sorted_accesses = sorted(accesses, key=lambda a: a.start_time)
+
+        # Check for overlapping accesses
+        for i in range(len(sorted_accesses)):
+            for j in range(i + 1, len(sorted_accesses)):
+                access1 = sorted_accesses[i]
+                access2 = sorted_accesses[j]
+
+                # Check if accesses overlap or are very close in time
+                if (
+                    access1.end_time
+                    and access2.start_time <= access1.end_time + timing_threshold
+                ):
+                    race_type = self._determine_race_type(access1, access2)
+                    if race_type:
+                        confidence = self._calculate_confidence(access1, access2)
+                        if confidence >= min_confidence:
+                            race = RaceConditionDetection(
+                                detection_id=f"race_{int(time.time() * 1000000)}_{i}_{j}",
+                                race_type=race_type,
+                                involved_operations=[
+                                    access1.operation_id,
+                                    access2.operation_id,
+                                ],
+                                involved_resources=[resource_id],
+                                conflicting_accesses=[access1, access2],
+                                detection_time=time.time(),
+                                confidence_score=confidence,
+                                severity=self._determine_severity(
+                                    race_type, confidence
+                                ),
+                                potential_impact=self._get_potential_impact(race_type),
+                                recommended_prevention=self._get_recommended_prevention(
+                                    race_type
+                                ),
+                                timing_analysis={
+                                    "overlap_duration": (
+                                        access1.end_time - access2.start_time
+                                        if access1.end_time
+                                        else 0.0
+                                    ),
+                                    "timing_gap": access2.start_time
+                                    - access1.start_time,
+                                },
+                            )
+                            races.append(race)
+
+        return races
+
+    def _determine_race_type(
+        self, access1: ResourceAccess, access2: ResourceAccess
+    ) -> Optional[RaceConditionType]:
+        """Determine the type of race condition between two accesses."""
+        if access1.thread_id == access2.thread_id:
+            return None  # Same thread, no race
+
+        # Write-Write race
+        if access1.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ] and access2.access_type in [AccessType.WRITE, AccessType.DELETE]:
+            return RaceConditionType.WRITE_WRITE_RACE
+
+        # Read-Write race
+        if (
+            access1.access_type == AccessType.READ
+            and access2.access_type in [AccessType.WRITE, AccessType.DELETE]
+        ) or (
+            access1.access_type in [AccessType.WRITE, AccessType.DELETE]
+            and access2.access_type == AccessType.READ
+        ):
+            return RaceConditionType.READ_WRITE_RACE
+
+        # Lost update (both read then write)
+        if (
+            access1.access_type == AccessType.READ_WRITE
+            and access2.access_type == AccessType.READ_WRITE
+        ):
+            return RaceConditionType.LOST_UPDATE
+
+        return RaceConditionType.TIMING_DEPENDENT
+
+    def _calculate_confidence(
+        self, access1: ResourceAccess, access2: ResourceAccess
+    ) -> float:
+        """Calculate confidence score for race condition detection."""
+        confidence = 0.5  # Base confidence
+
+        # Increase confidence for write conflicts
+        if access1.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ] or access2.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ]:
+            confidence += 0.3
+
+        # Increase confidence for closer timing
+        if access1.end_time:
+            timing_gap = abs(access2.start_time - access1.start_time)
+            if timing_gap < 0.001:  # < 1ms
+                confidence += 0.2
+            elif timing_gap < 0.01:  # < 10ms
+                confidence += 0.1
+
+        # Increase confidence for different processes
+        if access1.process_id != access2.process_id:
+            confidence += 0.1
+
+        return min(confidence, 1.0)
+
+    def _determine_severity(
+        self, race_type: RaceConditionType, confidence: float
+    ) -> str:
+        """Determine severity of race condition."""
+        if race_type in [
+            RaceConditionType.WRITE_WRITE_RACE,
+            RaceConditionType.LOST_UPDATE,
+        ]:
+            return "critical" if confidence > 0.8 else "high"
+        elif race_type == RaceConditionType.READ_WRITE_RACE:
+            return "high" if confidence > 0.7 else "medium"
+        else:
+            return "medium" if confidence > 0.6 else "low"
+
+    def _get_potential_impact(self, race_type: RaceConditionType) -> str:
+        """Get potential impact description for race type."""
+        impact_map = {
+            RaceConditionType.WRITE_WRITE_RACE: "Data corruption, lost writes, inconsistent state",
+            RaceConditionType.READ_WRITE_RACE: "Stale data reads, inconsistent views",
+            RaceConditionType.LOST_UPDATE: "Lost updates, data inconsistency",
+            RaceConditionType.CHECK_THEN_ACT: "Logic errors, invalid state transitions",
+            RaceConditionType.DIRTY_READ: "Reading uncommitted data, inconsistent views",
+            RaceConditionType.PHANTOM_READ: "Inconsistent query results",
+            RaceConditionType.TIMING_DEPENDENT: "Unpredictable behavior, intermittent bugs",
+        }
+        return impact_map.get(race_type, "Unknown impact")
+
+    def _get_recommended_prevention(
+        self, race_type: RaceConditionType
+    ) -> List[PreventionStrategy]:
+        """Get recommended prevention strategies for race type."""
+        prevention_map = {
+            RaceConditionType.WRITE_WRITE_RACE: [
+                PreventionStrategy.PESSIMISTIC_LOCKING,
+                PreventionStrategy.ATOMIC_OPERATIONS,
+            ],
+            RaceConditionType.READ_WRITE_RACE: [
+                PreventionStrategy.OPTIMISTIC_LOCKING,
+                PreventionStrategy.IMMUTABLE_DATA,
+            ],
+            RaceConditionType.LOST_UPDATE: [
+                PreventionStrategy.OPTIMISTIC_LOCKING,
+                PreventionStrategy.ATOMIC_OPERATIONS,
+            ],
+            RaceConditionType.CHECK_THEN_ACT: [
+                PreventionStrategy.ATOMIC_OPERATIONS,
+                PreventionStrategy.PESSIMISTIC_LOCKING,
+            ],
+            RaceConditionType.TIMING_DEPENDENT: [
+                PreventionStrategy.SYNCHRONIZATION,
+                PreventionStrategy.MESSAGE_PASSING,
+            ],
+        }
+        return prevention_map.get(race_type, [PreventionStrategy.SYNCHRONIZATION])
+
+    def _get_prevention_strategies(self, race: RaceConditionDetection) -> List[str]:
+        """Get prevention strategy names for a race condition."""
+        return [strategy.value for strategy in race.recommended_prevention]
+
+    def _analyze_resource_conflicts(
+        self, resource_accesses: Dict[str, List[ResourceAccess]]
+    ) -> Dict[str, Any]:
+        """Analyze conflicts per resource."""
+        conflicts = {}
+
+        for resource_id, accesses in resource_accesses.items():
+            write_count = sum(
+                1
+                for a in accesses
+                if a.access_type in [AccessType.WRITE, AccessType.DELETE]
+            )
+            read_count = sum(1 for a in accesses if a.access_type == AccessType.READ)
+            unique_threads = len(set(a.thread_id for a in accesses))
+
+            conflicts[resource_id] = {
+                "total_accesses": len(accesses),
+                "write_accesses": write_count,
+                "read_accesses": read_count,
+                "concurrent_threads": unique_threads,
+                "conflict_potential": (
+                    "high"
+                    if write_count > 1 and unique_threads > 1
+                    else "medium" if write_count > 0 and unique_threads > 1 else "low"
+                ),
+            }
+
+        return conflicts
+
+    def _generate_timing_analysis(
+        self, resource_accesses: Dict[str, List[ResourceAccess]]
+    ) -> Dict[str, Any]:
+        """Generate timing analysis for race detection."""
+        analysis = {}
+
+        for resource_id, accesses in resource_accesses.items():
+            if len(accesses) > 1:
+                durations = [a.duration for a in accesses if a.duration]
+                start_times = [a.start_time for a in accesses]
+
+                analysis[resource_id] = {
+                    "access_count": len(accesses),
+                    "avg_duration": sum(durations) / len(durations) if durations else 0,
+                    "max_duration": max(durations) if durations else 0,
+                    "time_span": max(start_times) - min(start_times),
+                    "concurrency_level": len(accesses),
+                }
+
+        return analysis
+
+    async def _create_race_detection(
+        self, accesses: List[ResourceAccess], detection_time: float
+    ) -> Optional[RaceConditionDetection]:
+        """Create a race condition detection from conflicting accesses."""
+        if len(accesses) < 2:
+            return None
+
+        race_type = self._determine_race_type(accesses[0], accesses[1])
+        if not race_type:
+            return None
+
+        confidence = self._calculate_confidence(accesses[0], accesses[1])
+
+        return RaceConditionDetection(
+            detection_id=f"race_{int(detection_time * 1000000)}",
+            race_type=race_type,
+            involved_operations=list(set(a.operation_id for a in accesses)),
+            involved_resources=list(set(a.resource_id for a in accesses)),
+            conflicting_accesses=accesses,
+            detection_time=detection_time,
+            confidence_score=confidence,
+            severity=self._determine_severity(race_type, confidence),
+            potential_impact=self._get_potential_impact(race_type),
+            recommended_prevention=self._get_recommended_prevention(race_type),
+        )
+
+    async def _get_status(self, **kwargs) -> Dict[str, Any]:
+        """Get current race detector status."""
+        return {
+            "races_detected": [self._serialize_race(r) for r in self._detected_races],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
+        """Start continuous race condition monitoring."""
+        interval = kwargs.get("monitoring_interval", 1.0)
+
+        if not self._monitoring_active:
+            self._monitoring_active = True
+            monitoring_task = asyncio.create_task(self._monitoring_loop(interval))
+            self._background_tasks.add(monitoring_task)
+            monitoring_task.add_done_callback(self._background_tasks.discard)
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
+        """Stop continuous race condition monitoring."""
+        self._monitoring_active = False
+
+        # Cancel background tasks
+        for task in self._background_tasks:
+            if not task.done():
+                task.cancel()
+
+        # Wait for tasks to complete
+        if self._background_tasks:
+            await asyncio.gather(*self._background_tasks, return_exceptions=True)
+
+        self._background_tasks.clear()
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "stopped",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _monitoring_loop(self, interval: float):
+        """Background monitoring loop for continuous race detection."""
+        while self._monitoring_active:
+            try:
+                await asyncio.sleep(interval)
+
+                # Detect races in recent activity
+                races = await self._detect_races(detection_window=interval * 2)
+
+                if races["race_count"] > 0:
+                    self.logger.warning(
+                        f"Monitoring detected {races['race_count']} race conditions"
+                    )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Monitoring loop error: {e}")
+
+    def _serialize_race(self, race: RaceConditionDetection) -> Dict[str, Any]:
+        """Serialize a race condition detection to dictionary."""
+        return {
+            "detection_id": race.detection_id,
+            "race_type": race.race_type.value,
+            "involved_operations": race.involved_operations,
+            "involved_resources": race.involved_resources,
+            "conflicting_accesses": [
+                {
+                    "access_id": a.access_id,
+                    "resource_id": a.resource_id,
+                    "operation_id": a.operation_id,
+                    "thread_id": a.thread_id,
+                    "process_id": a.process_id,
+                    "access_type": a.access_type.value,
+                    "start_time": a.start_time,
+                    "end_time": a.end_time,
+                    "duration": a.duration,
+                    "success": a.success,
+                    "error": a.error,
+                }
+                for a in race.conflicting_accesses
+            ],
+            "detection_time": race.detection_time,
+            "confidence_score": race.confidence_score,
+            "severity": race.severity,
+            "potential_impact": race.potential_impact,
+            "recommended_prevention": [p.value for p in race.recommended_prevention],
+            "timing_analysis": race.timing_analysis,
+            "metadata": race.metadata,
+        }
+
+    async def _report_operation(self, **kwargs) -> Dict[str, Any]:
+        """Report an operation with resource access for race detection."""
+        operation_id = kwargs.get("operation_id", str(uuid.uuid4()))
+        resource_id = kwargs.get("resource_id")
+        access_type = kwargs.get("access_type", "read")
+        thread_id = kwargs.get("thread_id", str(threading.get_ident()))
+        process_id = kwargs.get("process_id", str(os.getpid()))
+        metadata = kwargs.get("metadata", {})
+
+        # Register the operation
+        register_result = await self._register_operation(
+            operation_id=operation_id,
+            operation_type="reported_operation",
+            metadata=metadata,
+        )
+
+        # Register resource access if specified
+        if resource_id:
+            access_result = await self._register_access(
+                resource_id=resource_id,
+                operation_id=operation_id,
+                thread_id=thread_id,
+                process_id=process_id,
+                access_type=access_type,
+                metadata=metadata,
+            )
+
+        return {
+            "operation_id": operation_id,
+            "resource_id": resource_id,
+            "access_type": access_type,
+            "thread_id": thread_id,
+            "process_id": process_id,
+            "detection_status": "reported",
+            "races_detected": [
+                self._serialize_race(race) for race in self._detected_races
+            ],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(
+                self._active_accesses
+            ),  # Fixed to use correct data structure
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "active",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _complete_operation(self, **kwargs) -> Dict[str, Any]:
+        """Complete an operation and perform final race detection analysis."""
+        operation_id = kwargs.get("operation_id")
+        resource_id = kwargs.get("resource_id")
+        success = kwargs.get("success", True)
+
+        # If operation_id is provided, complete that specific operation
+        if operation_id:
+            if operation_id in self._active_operations:
+                operation = self._active_operations.pop(operation_id)
+                # You could add completion logic here
+
+        # Return the current state with race detection results
+        return {
+            "operation_id": operation_id,
+            "resource_id": resource_id,
+            "operation_success": success,
+            "races_detected": [
+                self._serialize_race(race) for race in self._detected_races
+            ],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "operation_completed",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Synchronous wrapper for compatibility."""
+        import asyncio
+
+        return asyncio.run(self.async_run(**kwargs))
+
+    async def cleanup(self):
+        """Cleanup resources when node is destroyed."""
+        await self._stop_monitoring()
+        await super().cleanup() if hasattr(super(), "cleanup") else None
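
To make the new detector concrete, here is a minimal usage sketch pieced together from the docstring examples and the async_run dispatch above. It assumes that AsyncNode.execute awaits async_run and returns its output dictionary (as the class docstring suggests), and that the node is importable from the module path added in this release; the resource, access, and operation IDs are illustrative only.

import asyncio

from kailash.nodes.monitoring.race_condition_detector import RaceConditionDetectorNode


async def main():
    detector = RaceConditionDetectorNode()

    # Two overlapping writes to the same resource from different threads.
    await detector.execute(
        operation="register_access",
        access_id="a1",
        resource_id="user_account_123",
        operation_id="op_1",
        thread_id="thread_1",
        access_type="write",
    )
    await detector.execute(
        operation="register_access",
        access_id="a2",
        resource_id="user_account_123",
        operation_id="op_2",
        thread_id="thread_2",
        access_type="write",
    )
    await detector.execute(operation="end_access", access_id="a1", success=True)
    await detector.execute(operation="end_access", access_id="a2", success=True)

    # Scan the completed accesses of the last 5 seconds for conflicts.
    result = await detector.execute(
        operation="detect_races", detection_window=5.0, min_confidence=0.5
    )
    for race in result["races_detected"]:
        print(race["race_type"], race["severity"], race["recommended_prevention"])


asyncio.run(main())

Because both accesses write the same resource from different thread IDs and overlap in time, _detect_resource_races should classify this as a write_write_race and recommend pessimistic locking or atomic operations.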