kailash 0.6.6__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. kailash/__init__.py +35 -5
  2. kailash/adapters/__init__.py +5 -0
  3. kailash/adapters/mcp_platform_adapter.py +273 -0
  4. kailash/channels/__init__.py +21 -0
  5. kailash/channels/api_channel.py +409 -0
  6. kailash/channels/base.py +271 -0
  7. kailash/channels/cli_channel.py +661 -0
  8. kailash/channels/event_router.py +496 -0
  9. kailash/channels/mcp_channel.py +648 -0
  10. kailash/channels/session.py +423 -0
  11. kailash/mcp_server/discovery.py +1 -1
  12. kailash/middleware/mcp/enhanced_server.py +22 -16
  13. kailash/nexus/__init__.py +21 -0
  14. kailash/nexus/factory.py +413 -0
  15. kailash/nexus/gateway.py +545 -0
  16. kailash/nodes/__init__.py +2 -0
  17. kailash/nodes/ai/iterative_llm_agent.py +988 -17
  18. kailash/nodes/ai/llm_agent.py +29 -9
  19. kailash/nodes/api/__init__.py +2 -2
  20. kailash/nodes/api/monitoring.py +1 -1
  21. kailash/nodes/base_async.py +54 -14
  22. kailash/nodes/code/async_python.py +1 -1
  23. kailash/nodes/data/bulk_operations.py +939 -0
  24. kailash/nodes/data/query_builder.py +373 -0
  25. kailash/nodes/data/query_cache.py +512 -0
  26. kailash/nodes/monitoring/__init__.py +10 -0
  27. kailash/nodes/monitoring/deadlock_detector.py +964 -0
  28. kailash/nodes/monitoring/performance_anomaly.py +1078 -0
  29. kailash/nodes/monitoring/race_condition_detector.py +1151 -0
  30. kailash/nodes/monitoring/transaction_metrics.py +790 -0
  31. kailash/nodes/monitoring/transaction_monitor.py +931 -0
  32. kailash/nodes/system/__init__.py +17 -0
  33. kailash/nodes/system/command_parser.py +820 -0
  34. kailash/nodes/transaction/__init__.py +48 -0
  35. kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
  36. kailash/nodes/transaction/saga_coordinator.py +652 -0
  37. kailash/nodes/transaction/saga_state_storage.py +411 -0
  38. kailash/nodes/transaction/saga_step.py +467 -0
  39. kailash/nodes/transaction/transaction_context.py +756 -0
  40. kailash/nodes/transaction/two_phase_commit.py +978 -0
  41. kailash/nodes/transform/processors.py +17 -1
  42. kailash/nodes/validation/__init__.py +21 -0
  43. kailash/nodes/validation/test_executor.py +532 -0
  44. kailash/nodes/validation/validation_nodes.py +447 -0
  45. kailash/resources/factory.py +1 -1
  46. kailash/runtime/async_local.py +84 -21
  47. kailash/runtime/local.py +21 -2
  48. kailash/runtime/parameter_injector.py +187 -31
  49. kailash/security.py +16 -1
  50. kailash/servers/__init__.py +32 -0
  51. kailash/servers/durable_workflow_server.py +430 -0
  52. kailash/servers/enterprise_workflow_server.py +466 -0
  53. kailash/servers/gateway.py +183 -0
  54. kailash/servers/workflow_server.py +290 -0
  55. kailash/utils/data_validation.py +192 -0
  56. kailash/workflow/builder.py +291 -12
  57. kailash/workflow/validation.py +144 -8
  58. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/METADATA +1 -1
  59. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/RECORD +63 -25
  60. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/WHEEL +0 -0
  61. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/entry_points.txt +0 -0
  62. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/licenses/LICENSE +0 -0
  63. {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/top_level.txt +0 -0
kailash/nodes/monitoring/race_condition_detector.py
@@ -0,0 +1,1151 @@
+"""Race condition detection and analysis node for concurrent operations.
+
+This module provides comprehensive race condition detection capabilities with
+concurrent access pattern analysis, timing-based detection, and preventive suggestions.
+"""
+
+import asyncio
+import logging
+import os
+import threading
+import time
+import uuid
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from kailash.nodes.base import NodeParameter, register_node
+from kailash.nodes.base_async import AsyncNode
+from kailash.sdk_exceptions import NodeExecutionError
+
+logger = logging.getLogger(__name__)
+
+
+class RaceConditionType(Enum):
+    """Types of race conditions that can be detected."""
+
+    READ_WRITE_RACE = "read_write_race"
+    WRITE_WRITE_RACE = "write_write_race"
+    CHECK_THEN_ACT = "check_then_act"
+    LOST_UPDATE = "lost_update"
+    DIRTY_READ = "dirty_read"
+    PHANTOM_READ = "phantom_read"
+    TIMING_DEPENDENT = "timing_dependent"
+
+
+class AccessType(Enum):
+    """Types of resource access."""
+
+    READ = "read"
+    WRITE = "write"
+    READ_WRITE = "read_write"
+    DELETE = "delete"
+    CREATE = "create"
+
+
+class PreventionStrategy(Enum):
+    """Race condition prevention strategies."""
+
+    OPTIMISTIC_LOCKING = "optimistic_locking"
+    PESSIMISTIC_LOCKING = "pessimistic_locking"
+    ATOMIC_OPERATIONS = "atomic_operations"
+    SERIALIZATION = "serialization"
+    IMMUTABLE_DATA = "immutable_data"
+    MESSAGE_PASSING = "message_passing"
+    SYNCHRONIZATION = "synchronization"
+
+
+@dataclass
+class ResourceAccess:
+    """Represents a resource access event."""
+
+    access_id: str
+    resource_id: str
+    operation_id: str
+    thread_id: str
+    process_id: str
+    access_type: AccessType
+    start_time: float
+    end_time: Optional[float] = None
+    duration: Optional[float] = None
+    success: bool = True
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ConcurrentOperation:
+    """Represents a concurrent operation with multiple accesses."""
+
+    operation_id: str
+    start_time: float
+    thread_id: str
+    process_id: str
+    end_time: Optional[float] = None
+    accesses: List[ResourceAccess] = field(default_factory=list)
+    total_resources: int = 0
+    conflicting_operations: Set[str] = field(default_factory=set)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class RaceConditionDetection:
+    """Represents a detected race condition."""
+
+    detection_id: str
+    race_type: RaceConditionType
+    involved_operations: List[str]
+    involved_resources: List[str]
+    conflicting_accesses: List[ResourceAccess]
+    detection_time: float
+    confidence_score: float  # 0.0 to 1.0
+    severity: str  # low, medium, high, critical
+    potential_impact: str
+    recommended_prevention: List[PreventionStrategy] = field(default_factory=list)
+    timing_analysis: Dict[str, float] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@register_node()
+class RaceConditionDetectorNode(AsyncNode):
+    """Node for detecting race conditions in concurrent operations.
+
+    This node provides comprehensive race condition detection including:
+    - Concurrent access pattern analysis
+    - Timing-based race condition detection
+    - Read-write conflict identification
+    - Lost update detection
+    - Dirty read detection
+    - Check-then-act race detection
+    - Prevention strategy recommendations
+
+    Design Purpose:
+    - Detect potential race conditions in production systems
+    - Provide actionable insights for race prevention
+    - Support concurrent system troubleshooting
+    - Enable proactive race condition monitoring
+
+    Examples:
+        >>> # Register resource access
+        >>> detector = RaceConditionDetectorNode()
+        >>> result = await detector.execute(
+        ...     operation="register_access",
+        ...     resource_id="user_account_123",
+        ...     operation_id="op_456",
+        ...     thread_id="thread_1",
+        ...     access_type="read",
+        ...     metadata={"query": "SELECT balance FROM accounts"}
+        ... )
+
+        >>> # End resource access
+        >>> result = await detector.execute(
+        ...     operation="end_access",
+        ...     access_id="access_789",
+        ...     success=True
+        ... )
+
+        >>> # Detect race conditions
+        >>> result = await detector.execute(
+        ...     operation="detect_races",
+        ...     detection_window=5.0,
+        ...     min_confidence=0.7
+        ... )
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the race condition detector node."""
+        super().__init__(**kwargs)
+        self._active_accesses: Dict[str, ResourceAccess] = {}
+        self._completed_accesses: List[ResourceAccess] = []
+        self._active_operations: Dict[str, ConcurrentOperation] = {}
+        self._resource_access_history: Dict[str, deque] = defaultdict(
+            lambda: deque(maxlen=1000)
+        )
+        self._detected_races: List[RaceConditionDetection] = []
+        self._monitoring_active = False
+        self._background_tasks: Set[asyncio.Task] = set()
+        self._detection_thresholds = {
+            "min_confidence": 0.5,
+            "timing_threshold": 0.001,  # 1ms
+            "overlap_threshold": 0.5,
+        }
+        self.logger.info(f"Initialized RaceConditionDetectorNode: {self.id}")
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        """Define the parameters this node accepts."""
+        return {
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                required=True,
+                description="Operation (register_access, end_access, register_operation, end_operation, detect_races, get_status)",
+            ),
+            "access_id": NodeParameter(
+                name="access_id",
+                type=str,
+                required=False,
+                description="Unique access identifier",
+            ),
+            "resource_id": NodeParameter(
+                name="resource_id",
+                type=str,
+                required=False,
+                description="Resource being accessed (table, file, object, etc.)",
+            ),
+            "operation_id": NodeParameter(
+                name="operation_id",
+                type=str,
+                required=False,
+                description="Operation identifier grouping multiple accesses",
+            ),
+            "thread_id": NodeParameter(
+                name="thread_id",
+                type=str,
+                required=False,
+                description="Thread identifier",
+            ),
+            "process_id": NodeParameter(
+                name="process_id",
+                type=str,
+                required=False,
+                description="Process identifier",
+            ),
+            "access_type": NodeParameter(
+                name="access_type",
+                type=str,
+                required=False,
+                default="read",
+                description="Type of access (read, write, read_write, delete, create)",
+            ),
+            "success": NodeParameter(
+                name="success",
+                type=bool,
+                required=False,
+                default=True,
+                description="Whether the access was successful",
+            ),
+            "error": NodeParameter(
+                name="error",
+                type=str,
+                required=False,
+                description="Error message if access failed",
+            ),
+            "detection_window": NodeParameter(
+                name="detection_window",
+                type=float,
+                required=False,
+                default=5.0,
+                description="Time window for race detection in seconds",
+            ),
+            "min_confidence": NodeParameter(
+                name="min_confidence",
+                type=float,
+                required=False,
+                default=0.5,
+                description="Minimum confidence score for race detection (0.0-1.0)",
+            ),
+            "resource_filters": NodeParameter(
+                name="resource_filters",
+                type=list,
+                required=False,
+                default=[],
+                description="List of resource patterns to filter detection",
+            ),
+            "timing_threshold": NodeParameter(
+                name="timing_threshold",
+                type=float,
+                required=False,
+                default=0.001,
+                description="Timing threshold for race detection in seconds",
+            ),
+            "enable_monitoring": NodeParameter(
+                name="enable_monitoring",
+                type=bool,
+                required=False,
+                default=False,
+                description="Enable continuous race condition monitoring",
+            ),
+            "monitoring_interval": NodeParameter(
+                name="monitoring_interval",
+                type=float,
+                required=False,
+                default=1.0,
+                description="Monitoring interval in seconds",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata for the operation",
+            ),
+        }
+
+    def get_output_schema(self) -> Dict[str, NodeParameter]:
+        """Define the output schema for this node."""
+        return {
+            "races_detected": NodeParameter(
+                name="races_detected",
+                type=list,
+                description="List of detected race conditions",
+            ),
+            "race_count": NodeParameter(
+                name="race_count", type=int, description="Number of races detected"
+            ),
+            "active_accesses": NodeParameter(
+                name="active_accesses",
+                type=int,
+                description="Number of active accesses",
+            ),
+            "active_operations": NodeParameter(
+                name="active_operations",
+                type=int,
+                description="Number of active operations",
+            ),
+            "prevention_suggestions": NodeParameter(
+                name="prevention_suggestions",
+                type=list,
+                description="Recommended prevention strategies",
+            ),
+            "resource_conflicts": NodeParameter(
+                name="resource_conflicts",
+                type=dict,
+                description="Resource-level conflict analysis",
+            ),
+            "timing_analysis": NodeParameter(
+                name="timing_analysis",
+                type=dict,
+                description="Timing-based analysis results",
+            ),
+            "monitoring_status": NodeParameter(
+                name="monitoring_status",
+                type=str,
+                description="Current monitoring status",
+            ),
+            "timestamp": NodeParameter(
+                name="timestamp", type=str, description="ISO timestamp of operation"
+            ),
+            "status": NodeParameter(
+                name="status", type=str, description="Operation status"
+            ),
+        }
+
+    async def async_run(self, **kwargs) -> Dict[str, Any]:
+        """Execute race condition detection operation."""
+        operation = kwargs.get("operation")
+
+        try:
+            if operation == "register_access":
+                return await self._register_access(**kwargs)
+            elif operation == "end_access":
+                return await self._end_access(**kwargs)
+            elif operation == "register_operation":
+                return await self._register_operation(**kwargs)
+            elif operation == "end_operation":
+                return await self._end_operation(**kwargs)
+            elif operation == "detect_races":
+                return await self._detect_races(**kwargs)
+            elif operation == "get_status":
+                return await self._get_status(**kwargs)
+            elif operation == "start_monitoring":
+                return await self._start_monitoring(**kwargs)
+            elif operation == "stop_monitoring":
+                return await self._stop_monitoring(**kwargs)
+            elif operation == "report_operation":
+                return await self._report_operation(**kwargs)
+            elif operation == "complete_operation":
+                return await self._complete_operation(**kwargs)
+            else:
+                raise ValueError(f"Unknown operation: {operation}")
+
+        except Exception as e:
+            self.logger.error(f"Race condition detection operation failed: {str(e)}")
+            raise NodeExecutionError(f"Failed to execute race detection: {str(e)}")
+
+    async def _register_access(self, **kwargs) -> Dict[str, Any]:
+        """Register a new resource access."""
+        resource_id = kwargs.get("resource_id")
+        operation_id = kwargs.get("operation_id")
+        thread_id = kwargs.get("thread_id", "unknown")
+        process_id = kwargs.get("process_id", "unknown")
+        access_type = AccessType(kwargs.get("access_type", "read"))
+        metadata = kwargs.get("metadata", {})
+
+        if not resource_id:
+            raise ValueError("resource_id is required")
+
+        current_time = time.time()
+        access_id = kwargs.get("access_id") or f"access_{int(current_time * 1000000)}"
+
+        # Create access record
+        access = ResourceAccess(
+            access_id=access_id,
+            resource_id=resource_id,
+            operation_id=operation_id or "unknown",
+            thread_id=thread_id,
+            process_id=process_id,
+            access_type=access_type,
+            start_time=current_time,
+            metadata=metadata,
+        )
+
+        # Register access
+        self._active_accesses[access_id] = access
+        self._resource_access_history[resource_id].append(access)
+
+        # Update operation if specified
+        if operation_id and operation_id in self._active_operations:
+            operation = self._active_operations[operation_id]
+            operation.accesses.append(access)
+            operation.total_resources += 1
+
+        # Check for immediate race conditions
+        races = await self._analyze_concurrent_access(resource_id, access)
+
+        self.logger.debug(
+            f"Registered access {access_id} for resource {resource_id} ({access_type.value})"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _end_access(self, **kwargs) -> Dict[str, Any]:
+        """End a resource access."""
+        access_id = kwargs.get("access_id")
+        success = kwargs.get("success", True)
+        error = kwargs.get("error")
+
+        if not access_id:
+            raise ValueError("access_id is required")
+
+        if access_id not in self._active_accesses:
+            raise ValueError(f"Access {access_id} not found")
+
+        access = self._active_accesses.pop(access_id)
+
+        # Complete access
+        access.end_time = time.time()
+        access.duration = access.end_time - access.start_time
+        access.success = success
+        access.error = error
+
+        # Store completed access
+        self._completed_accesses.append(access)
+
+        # Clean old accesses (keep last hour)
+        cutoff_time = time.time() - 3600
+        self._completed_accesses = [
+            a for a in self._completed_accesses if a.start_time > cutoff_time
+        ]
+
+        self.logger.debug(
+            f"Ended access {access_id} with duration {access.duration:.3f}s, success: {success}"
+        )
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {"access_duration": access.duration},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _register_operation(self, **kwargs) -> Dict[str, Any]:
+        """Register a new concurrent operation."""
+        operation_id = kwargs.get("operation_id")
+        thread_id = kwargs.get("thread_id", "unknown")
+        process_id = kwargs.get("process_id", "unknown")
+        metadata = kwargs.get("metadata", {})
+
+        if not operation_id:
+            raise ValueError("operation_id is required")
+
+        current_time = time.time()
+
+        # Create operation record
+        operation = ConcurrentOperation(
+            operation_id=operation_id,
+            start_time=current_time,
+            thread_id=thread_id,
+            process_id=process_id,
+            metadata=metadata,
+        )
+
+        self._active_operations[operation_id] = operation
+
+        self.logger.debug(f"Registered operation {operation_id}")
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _end_operation(self, **kwargs) -> Dict[str, Any]:
+        """End a concurrent operation."""
+        operation_id = kwargs.get("operation_id")
+
+        if not operation_id:
+            raise ValueError("operation_id is required")
+
+        if operation_id not in self._active_operations:
+            raise ValueError(f"Operation {operation_id} not found")
+
+        operation = self._active_operations.pop(operation_id)
+        operation.end_time = time.time()
+
+        # Analyze operation for race conditions
+        races = await self._analyze_operation_races(operation)
+
+        self.logger.debug(
+            f"Ended operation {operation_id} with {len(operation.accesses)} accesses"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [
+                self._get_prevention_strategies(r) for r in races
+            ],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _detect_races(self, **kwargs) -> Dict[str, Any]:
+        """Detect race conditions in recent accesses."""
+        detection_window = kwargs.get("detection_window", 5.0)
+        min_confidence = kwargs.get("min_confidence", 0.5)
+        resource_filters = kwargs.get("resource_filters", [])
+        timing_threshold = kwargs.get("timing_threshold", 0.001)
+
+        current_time = time.time()
+        window_start = current_time - detection_window
+
+        # Analyze recent accesses for race conditions
+        races = []
+
+        # Group accesses by resource within time window
+        resource_accesses = defaultdict(list)
+        for access in self._completed_accesses:
+            if access.start_time >= window_start:
+                if not resource_filters or any(
+                    f in access.resource_id for f in resource_filters
+                ):
+                    resource_accesses[access.resource_id].append(access)
+
+        # Detect races for each resource
+        for resource_id, accesses in resource_accesses.items():
+            if len(accesses) > 1:
+                resource_races = await self._detect_resource_races(
+                    resource_id, accesses, timing_threshold, min_confidence
+                )
+                races.extend(resource_races)
+
+        # Store detected races
+        self._detected_races.extend(races)
+
+        # Generate prevention suggestions
+        prevention_suggestions = []
+        for race in races:
+            strategies = self._get_prevention_strategies(race)
+            prevention_suggestions.extend(strategies)
+
+        # Analyze resource conflicts
+        resource_conflicts = self._analyze_resource_conflicts(resource_accesses)
+
+        # Generate timing analysis
+        timing_analysis = self._generate_timing_analysis(resource_accesses)
+
+        self.logger.info(
+            f"Detected {len(races)} race conditions in {detection_window}s window"
+        )
+
+        return {
+            "races_detected": [self._serialize_race(r) for r in races],
+            "race_count": len(races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": list(set(prevention_suggestions)),
+            "resource_conflicts": resource_conflicts,
+            "timing_analysis": timing_analysis,
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _analyze_concurrent_access(
+        self, resource_id: str, new_access: ResourceAccess
+    ) -> List[RaceConditionDetection]:
+        """Analyze for immediate race conditions with new access."""
+        races = []
+        current_time = time.time()
+
+        # Check concurrent accesses to the same resource
+        concurrent_accesses = [
+            access
+            for access in self._active_accesses.values()
+            if (
+                access.resource_id == resource_id
+                and access.access_id != new_access.access_id
+                and access.thread_id != new_access.thread_id
+            )
+        ]
+
+        for concurrent_access in concurrent_accesses:
+            # Check for write-write or read-write conflicts
+            if new_access.access_type in [
+                AccessType.WRITE,
+                AccessType.READ_WRITE,
+                AccessType.DELETE,
+            ] or concurrent_access.access_type in [
+                AccessType.WRITE,
+                AccessType.READ_WRITE,
+                AccessType.DELETE,
+            ]:
+                race = await self._create_race_detection(
+                    [new_access, concurrent_access], current_time
+                )
+                if race:
+                    races.append(race)
+
+        return races
+
+    async def _analyze_operation_races(
+        self, operation: ConcurrentOperation
+    ) -> List[RaceConditionDetection]:
+        """Analyze an operation for race conditions."""
+        races = []
+
+        # Check for check-then-act patterns
+        read_accesses = [
+            a for a in operation.accesses if a.access_type == AccessType.READ
+        ]
+        write_accesses = [
+            a for a in operation.accesses if a.access_type == AccessType.WRITE
+        ]
+
+        for read_access in read_accesses:
+            for write_access in write_accesses:
+                if (
+                    read_access.resource_id == write_access.resource_id
+                    and write_access.start_time > read_access.end_time
+                ):
+                    # Potential check-then-act race
+                    race = RaceConditionDetection(
+                        detection_id=f"race_{int(time.time() * 1000000)}",
+                        race_type=RaceConditionType.CHECK_THEN_ACT,
+                        involved_operations=[operation.operation_id],
+                        involved_resources=[read_access.resource_id],
+                        conflicting_accesses=[read_access, write_access],
+                        detection_time=time.time(),
+                        confidence_score=0.8,
+                        severity="medium",
+                        potential_impact="Data inconsistency from stale reads",
+                        recommended_prevention=[
+                            PreventionStrategy.ATOMIC_OPERATIONS,
+                            PreventionStrategy.OPTIMISTIC_LOCKING,
+                        ],
+                        timing_analysis={
+                            "gap_duration": write_access.start_time
+                            - read_access.end_time
+                        },
+                    )
+                    races.append(race)
+
+        return races
+
+    async def _detect_resource_races(
+        self,
+        resource_id: str,
+        accesses: List[ResourceAccess],
+        timing_threshold: float,
+        min_confidence: float,
+    ) -> List[RaceConditionDetection]:
+        """Detect race conditions for a specific resource."""
+        races = []
+
+        # Sort accesses by start time
+        sorted_accesses = sorted(accesses, key=lambda a: a.start_time)
+
+        # Check for overlapping accesses
+        for i in range(len(sorted_accesses)):
+            for j in range(i + 1, len(sorted_accesses)):
+                access1 = sorted_accesses[i]
+                access2 = sorted_accesses[j]
+
+                # Check if accesses overlap or are very close in time
+                if (
+                    access1.end_time
+                    and access2.start_time <= access1.end_time + timing_threshold
+                ):
+                    race_type = self._determine_race_type(access1, access2)
+                    if race_type:
+                        confidence = self._calculate_confidence(access1, access2)
+                        if confidence >= min_confidence:
+                            race = RaceConditionDetection(
+                                detection_id=f"race_{int(time.time() * 1000000)}_{i}_{j}",
+                                race_type=race_type,
+                                involved_operations=[
+                                    access1.operation_id,
+                                    access2.operation_id,
+                                ],
+                                involved_resources=[resource_id],
+                                conflicting_accesses=[access1, access2],
+                                detection_time=time.time(),
+                                confidence_score=confidence,
+                                severity=self._determine_severity(
+                                    race_type, confidence
+                                ),
+                                potential_impact=self._get_potential_impact(race_type),
+                                recommended_prevention=self._get_recommended_prevention(
+                                    race_type
+                                ),
+                                timing_analysis={
+                                    "overlap_duration": (
+                                        access1.end_time - access2.start_time
+                                        if access1.end_time
+                                        else 0.0
+                                    ),
+                                    "timing_gap": access2.start_time
+                                    - access1.start_time,
+                                },
+                            )
+                            races.append(race)
+
+        return races
+
+    def _determine_race_type(
+        self, access1: ResourceAccess, access2: ResourceAccess
+    ) -> Optional[RaceConditionType]:
+        """Determine the type of race condition between two accesses."""
+        if access1.thread_id == access2.thread_id:
+            return None  # Same thread, no race
+
+        # Write-Write race
+        if access1.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ] and access2.access_type in [AccessType.WRITE, AccessType.DELETE]:
+            return RaceConditionType.WRITE_WRITE_RACE
+
+        # Read-Write race
+        if (
+            access1.access_type == AccessType.READ
+            and access2.access_type in [AccessType.WRITE, AccessType.DELETE]
+        ) or (
+            access1.access_type in [AccessType.WRITE, AccessType.DELETE]
+            and access2.access_type == AccessType.READ
+        ):
+            return RaceConditionType.READ_WRITE_RACE
+
+        # Lost update (both read then write)
+        if (
+            access1.access_type == AccessType.READ_WRITE
+            and access2.access_type == AccessType.READ_WRITE
+        ):
+            return RaceConditionType.LOST_UPDATE
+
+        return RaceConditionType.TIMING_DEPENDENT
+
+    def _calculate_confidence(
+        self, access1: ResourceAccess, access2: ResourceAccess
+    ) -> float:
+        """Calculate confidence score for race condition detection."""
+        confidence = 0.5  # Base confidence
+
+        # Increase confidence for write conflicts
+        if access1.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ] or access2.access_type in [
+            AccessType.WRITE,
+            AccessType.DELETE,
+        ]:
+            confidence += 0.3
+
+        # Increase confidence for closer timing
+        if access1.end_time:
+            timing_gap = abs(access2.start_time - access1.start_time)
+            if timing_gap < 0.001:  # < 1ms
+                confidence += 0.2
+            elif timing_gap < 0.01:  # < 10ms
+                confidence += 0.1
+
+        # Increase confidence for different processes
+        if access1.process_id != access2.process_id:
+            confidence += 0.1
+
+        return min(confidence, 1.0)
+
+    def _determine_severity(
+        self, race_type: RaceConditionType, confidence: float
+    ) -> str:
+        """Determine severity of race condition."""
+        if race_type in [
+            RaceConditionType.WRITE_WRITE_RACE,
+            RaceConditionType.LOST_UPDATE,
+        ]:
+            return "critical" if confidence > 0.8 else "high"
+        elif race_type == RaceConditionType.READ_WRITE_RACE:
+            return "high" if confidence > 0.7 else "medium"
+        else:
+            return "medium" if confidence > 0.6 else "low"
+
+    def _get_potential_impact(self, race_type: RaceConditionType) -> str:
+        """Get potential impact description for race type."""
+        impact_map = {
+            RaceConditionType.WRITE_WRITE_RACE: "Data corruption, lost writes, inconsistent state",
+            RaceConditionType.READ_WRITE_RACE: "Stale data reads, inconsistent views",
+            RaceConditionType.LOST_UPDATE: "Lost updates, data inconsistency",
+            RaceConditionType.CHECK_THEN_ACT: "Logic errors, invalid state transitions",
+            RaceConditionType.DIRTY_READ: "Reading uncommitted data, inconsistent views",
+            RaceConditionType.PHANTOM_READ: "Inconsistent query results",
+            RaceConditionType.TIMING_DEPENDENT: "Unpredictable behavior, intermittent bugs",
+        }
+        return impact_map.get(race_type, "Unknown impact")
+
+    def _get_recommended_prevention(
+        self, race_type: RaceConditionType
+    ) -> List[PreventionStrategy]:
+        """Get recommended prevention strategies for race type."""
+        prevention_map = {
+            RaceConditionType.WRITE_WRITE_RACE: [
+                PreventionStrategy.PESSIMISTIC_LOCKING,
+                PreventionStrategy.ATOMIC_OPERATIONS,
+            ],
+            RaceConditionType.READ_WRITE_RACE: [
+                PreventionStrategy.OPTIMISTIC_LOCKING,
+                PreventionStrategy.IMMUTABLE_DATA,
+            ],
+            RaceConditionType.LOST_UPDATE: [
+                PreventionStrategy.OPTIMISTIC_LOCKING,
+                PreventionStrategy.ATOMIC_OPERATIONS,
+            ],
+            RaceConditionType.CHECK_THEN_ACT: [
+                PreventionStrategy.ATOMIC_OPERATIONS,
+                PreventionStrategy.PESSIMISTIC_LOCKING,
+            ],
+            RaceConditionType.TIMING_DEPENDENT: [
+                PreventionStrategy.SYNCHRONIZATION,
+                PreventionStrategy.MESSAGE_PASSING,
+            ],
+        }
+        return prevention_map.get(race_type, [PreventionStrategy.SYNCHRONIZATION])
+
+    def _get_prevention_strategies(self, race: RaceConditionDetection) -> List[str]:
+        """Get prevention strategy names for a race condition."""
+        return [strategy.value for strategy in race.recommended_prevention]
+
+    def _analyze_resource_conflicts(
+        self, resource_accesses: Dict[str, List[ResourceAccess]]
+    ) -> Dict[str, Any]:
+        """Analyze conflicts per resource."""
+        conflicts = {}
+
+        for resource_id, accesses in resource_accesses.items():
+            write_count = sum(
+                1
+                for a in accesses
+                if a.access_type in [AccessType.WRITE, AccessType.DELETE]
+            )
+            read_count = sum(1 for a in accesses if a.access_type == AccessType.READ)
+            unique_threads = len(set(a.thread_id for a in accesses))
+
+            conflicts[resource_id] = {
+                "total_accesses": len(accesses),
+                "write_accesses": write_count,
+                "read_accesses": read_count,
+                "concurrent_threads": unique_threads,
+                "conflict_potential": (
+                    "high"
+                    if write_count > 1 and unique_threads > 1
+                    else "medium" if write_count > 0 and unique_threads > 1 else "low"
+                ),
+            }
+
+        return conflicts
+
+    def _generate_timing_analysis(
+        self, resource_accesses: Dict[str, List[ResourceAccess]]
+    ) -> Dict[str, Any]:
+        """Generate timing analysis for race detection."""
+        analysis = {}
+
+        for resource_id, accesses in resource_accesses.items():
+            if len(accesses) > 1:
+                durations = [a.duration for a in accesses if a.duration]
+                start_times = [a.start_time for a in accesses]
+
+                analysis[resource_id] = {
+                    "access_count": len(accesses),
+                    "avg_duration": sum(durations) / len(durations) if durations else 0,
+                    "max_duration": max(durations) if durations else 0,
+                    "time_span": max(start_times) - min(start_times),
+                    "concurrency_level": len(accesses),
+                }
+
+        return analysis
+
+    async def _create_race_detection(
+        self, accesses: List[ResourceAccess], detection_time: float
+    ) -> Optional[RaceConditionDetection]:
+        """Create a race condition detection from conflicting accesses."""
+        if len(accesses) < 2:
+            return None
+
+        race_type = self._determine_race_type(accesses[0], accesses[1])
+        if not race_type:
+            return None
+
+        confidence = self._calculate_confidence(accesses[0], accesses[1])
+
+        return RaceConditionDetection(
+            detection_id=f"race_{int(detection_time * 1000000)}",
+            race_type=race_type,
+            involved_operations=list(set(a.operation_id for a in accesses)),
+            involved_resources=list(set(a.resource_id for a in accesses)),
+            conflicting_accesses=accesses,
+            detection_time=detection_time,
+            confidence_score=confidence,
+            severity=self._determine_severity(race_type, confidence),
+            potential_impact=self._get_potential_impact(race_type),
+            recommended_prevention=self._get_recommended_prevention(race_type),
+        )
+
+    async def _get_status(self, **kwargs) -> Dict[str, Any]:
+        """Get current race detector status."""
+        return {
+            "races_detected": [self._serialize_race(r) for r in self._detected_races],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
+        """Start continuous race condition monitoring."""
+        interval = kwargs.get("monitoring_interval", 1.0)
+
+        if not self._monitoring_active:
+            self._monitoring_active = True
+            monitoring_task = asyncio.create_task(self._monitoring_loop(interval))
+            self._background_tasks.add(monitoring_task)
+            monitoring_task.add_done_callback(self._background_tasks.discard)
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "monitoring",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
+        """Stop continuous race condition monitoring."""
+        self._monitoring_active = False
+
+        # Cancel background tasks
+        for task in self._background_tasks:
+            if not task.done():
+                task.cancel()
+
+        # Wait for tasks to complete
+        if self._background_tasks:
+            await asyncio.gather(*self._background_tasks, return_exceptions=True)
+
+        self._background_tasks.clear()
+
+        return {
+            "races_detected": [],
+            "race_count": 0,
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "stopped",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _monitoring_loop(self, interval: float):
+        """Background monitoring loop for continuous race detection."""
+        while self._monitoring_active:
+            try:
+                await asyncio.sleep(interval)
+
+                # Detect races in recent activity
+                races = await self._detect_races(detection_window=interval * 2)
+
+                if races["race_count"] > 0:
+                    self.logger.warning(
+                        f"Monitoring detected {races['race_count']} race conditions"
+                    )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Monitoring loop error: {e}")
+
+    def _serialize_race(self, race: RaceConditionDetection) -> Dict[str, Any]:
+        """Serialize a race condition detection to dictionary."""
+        return {
+            "detection_id": race.detection_id,
+            "race_type": race.race_type.value,
+            "involved_operations": race.involved_operations,
+            "involved_resources": race.involved_resources,
+            "conflicting_accesses": [
+                {
+                    "access_id": a.access_id,
+                    "resource_id": a.resource_id,
+                    "operation_id": a.operation_id,
+                    "thread_id": a.thread_id,
+                    "process_id": a.process_id,
+                    "access_type": a.access_type.value,
+                    "start_time": a.start_time,
+                    "end_time": a.end_time,
+                    "duration": a.duration,
+                    "success": a.success,
+                    "error": a.error,
+                }
+                for a in race.conflicting_accesses
+            ],
+            "detection_time": race.detection_time,
+            "confidence_score": race.confidence_score,
+            "severity": race.severity,
+            "potential_impact": race.potential_impact,
+            "recommended_prevention": [p.value for p in race.recommended_prevention],
+            "timing_analysis": race.timing_analysis,
+            "metadata": race.metadata,
+        }
+
+    async def _report_operation(self, **kwargs) -> Dict[str, Any]:
+        """Report an operation with resource access for race detection."""
+        operation_id = kwargs.get("operation_id", str(uuid.uuid4()))
+        resource_id = kwargs.get("resource_id")
+        access_type = kwargs.get("access_type", "read")
+        thread_id = kwargs.get("thread_id", str(threading.get_ident()))
+        process_id = kwargs.get("process_id", str(os.getpid()))
+        metadata = kwargs.get("metadata", {})
+
+        # Register the operation
+        register_result = await self._register_operation(
+            operation_id=operation_id,
+            operation_type="reported_operation",
+            metadata=metadata,
+        )
+
+        # Register resource access if specified
+        if resource_id:
+            access_result = await self._register_access(
+                resource_id=resource_id,
+                operation_id=operation_id,
+                thread_id=thread_id,
+                process_id=process_id,
+                access_type=access_type,
+                metadata=metadata,
+            )
+
+        return {
+            "operation_id": operation_id,
+            "resource_id": resource_id,
+            "access_type": access_type,
+            "thread_id": thread_id,
+            "process_id": process_id,
+            "detection_status": "reported",
+            "races_detected": [
+                self._serialize_race(race) for race in self._detected_races
+            ],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(
+                self._active_accesses
+            ),  # Fixed to use correct data structure
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "active",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    async def _complete_operation(self, **kwargs) -> Dict[str, Any]:
+        """Complete an operation and perform final race detection analysis."""
+        operation_id = kwargs.get("operation_id")
+        resource_id = kwargs.get("resource_id")
+        success = kwargs.get("success", True)
+
+        # If operation_id is provided, complete that specific operation
+        if operation_id:
+            if operation_id in self._active_operations:
+                operation = self._active_operations.pop(operation_id)
+                # You could add completion logic here
+
+        # Return the current state with race detection results
+        return {
+            "operation_id": operation_id,
+            "resource_id": resource_id,
+            "operation_success": success,
+            "races_detected": [
+                self._serialize_race(race) for race in self._detected_races
+            ],
+            "race_count": len(self._detected_races),
+            "active_accesses": len(self._active_accesses),
+            "active_operations": len(self._active_operations),
+            "prevention_suggestions": [],
+            "resource_conflicts": {},
+            "timing_analysis": {},
+            "monitoring_status": "operation_completed",
+            "timestamp": datetime.now(UTC).isoformat(),
+            "status": "success",
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Synchronous wrapper for compatibility."""
+        import asyncio
+
+        return asyncio.run(self.async_run(**kwargs))
+
+    async def cleanup(self):
+        """Cleanup resources when node is destroyed."""
+        await self._stop_monitoring()
+        await super().cleanup() if hasattr(super(), "cleanup") else None
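Usage note: a minimal sketch of how the new RaceConditionDetectorNode might be driven, assembled from the operations and docstring examples in the diff above (register_access, end_access, detect_races). The `await detector.execute(...)` call style follows the class docstring; the specific access/operation IDs, resource name, and the asyncio entry point are illustrative assumptions, not part of the release.

import asyncio

from kailash.nodes.monitoring.race_condition_detector import RaceConditionDetectorNode


async def main():
    detector = RaceConditionDetectorNode()

    # Two concurrent writers on the same resource. thread_id/process_id are
    # caller-supplied labels used for conflict analysis, not real OS threads.
    await detector.execute(
        operation="register_access",
        access_id="access_1",  # hypothetical ID for illustration
        resource_id="user_account_123",
        operation_id="op_1",
        thread_id="thread_1",
        access_type="write",
    )
    result = await detector.execute(
        operation="register_access",
        access_id="access_2",
        resource_id="user_account_123",
        operation_id="op_2",
        thread_id="thread_2",
        access_type="write",
    )
    # Overlapping writes from different threads are flagged on registration.
    print(result["race_count"], result["races_detected"])

    for access_id in ("access_1", "access_2"):
        await detector.execute(
            operation="end_access", access_id=access_id, success=True
        )

    # Retrospective scan over completed accesses in the last 5 seconds.
    report = await detector.execute(
        operation="detect_races", detection_window=5.0, min_confidence=0.7
    )
    print(report["race_count"], report["prevention_suggestions"])


asyncio.run(main())

The same detector also supports register_operation/end_operation to group accesses for check-then-act analysis, and start_monitoring/stop_monitoring to run the detection pass on a background interval, per the async_run dispatch above.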