kailash 0.6.6__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. kailash/__init__.py +35 -5
  2. kailash/access_control.py +64 -46
  3. kailash/adapters/__init__.py +5 -0
  4. kailash/adapters/mcp_platform_adapter.py +273 -0
  5. kailash/api/workflow_api.py +34 -3
  6. kailash/channels/__init__.py +21 -0
  7. kailash/channels/api_channel.py +409 -0
  8. kailash/channels/base.py +271 -0
  9. kailash/channels/cli_channel.py +661 -0
  10. kailash/channels/event_router.py +496 -0
  11. kailash/channels/mcp_channel.py +648 -0
  12. kailash/channels/session.py +423 -0
  13. kailash/mcp_server/discovery.py +57 -18
  14. kailash/middleware/communication/api_gateway.py +23 -3
  15. kailash/middleware/communication/realtime.py +83 -0
  16. kailash/middleware/core/agent_ui.py +1 -1
  17. kailash/middleware/gateway/storage_backends.py +393 -0
  18. kailash/middleware/mcp/enhanced_server.py +22 -16
  19. kailash/nexus/__init__.py +21 -0
  20. kailash/nexus/cli/__init__.py +5 -0
  21. kailash/nexus/cli/__main__.py +6 -0
  22. kailash/nexus/cli/main.py +176 -0
  23. kailash/nexus/factory.py +413 -0
  24. kailash/nexus/gateway.py +545 -0
  25. kailash/nodes/__init__.py +8 -5
  26. kailash/nodes/ai/iterative_llm_agent.py +988 -17
  27. kailash/nodes/ai/llm_agent.py +29 -9
  28. kailash/nodes/api/__init__.py +2 -2
  29. kailash/nodes/api/monitoring.py +1 -1
  30. kailash/nodes/base.py +29 -5
  31. kailash/nodes/base_async.py +54 -14
  32. kailash/nodes/code/async_python.py +1 -1
  33. kailash/nodes/code/python.py +50 -6
  34. kailash/nodes/data/async_sql.py +90 -0
  35. kailash/nodes/data/bulk_operations.py +939 -0
  36. kailash/nodes/data/query_builder.py +373 -0
  37. kailash/nodes/data/query_cache.py +512 -0
  38. kailash/nodes/monitoring/__init__.py +10 -0
  39. kailash/nodes/monitoring/deadlock_detector.py +964 -0
  40. kailash/nodes/monitoring/performance_anomaly.py +1078 -0
  41. kailash/nodes/monitoring/race_condition_detector.py +1151 -0
  42. kailash/nodes/monitoring/transaction_metrics.py +790 -0
  43. kailash/nodes/monitoring/transaction_monitor.py +931 -0
  44. kailash/nodes/security/behavior_analysis.py +414 -0
  45. kailash/nodes/system/__init__.py +17 -0
  46. kailash/nodes/system/command_parser.py +820 -0
  47. kailash/nodes/transaction/__init__.py +48 -0
  48. kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
  49. kailash/nodes/transaction/saga_coordinator.py +652 -0
  50. kailash/nodes/transaction/saga_state_storage.py +411 -0
  51. kailash/nodes/transaction/saga_step.py +467 -0
  52. kailash/nodes/transaction/transaction_context.py +756 -0
  53. kailash/nodes/transaction/two_phase_commit.py +978 -0
  54. kailash/nodes/transform/processors.py +17 -1
  55. kailash/nodes/validation/__init__.py +21 -0
  56. kailash/nodes/validation/test_executor.py +532 -0
  57. kailash/nodes/validation/validation_nodes.py +447 -0
  58. kailash/resources/factory.py +1 -1
  59. kailash/runtime/access_controlled.py +9 -7
  60. kailash/runtime/async_local.py +84 -21
  61. kailash/runtime/local.py +21 -2
  62. kailash/runtime/parameter_injector.py +187 -31
  63. kailash/runtime/runner.py +6 -4
  64. kailash/runtime/testing.py +1 -1
  65. kailash/security.py +22 -3
  66. kailash/servers/__init__.py +32 -0
  67. kailash/servers/durable_workflow_server.py +430 -0
  68. kailash/servers/enterprise_workflow_server.py +522 -0
  69. kailash/servers/gateway.py +183 -0
  70. kailash/servers/workflow_server.py +293 -0
  71. kailash/utils/data_validation.py +192 -0
  72. kailash/workflow/builder.py +382 -15
  73. kailash/workflow/cyclic_runner.py +102 -10
  74. kailash/workflow/validation.py +144 -8
  75. kailash/workflow/visualization.py +99 -27
  76. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/METADATA +3 -2
  77. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/RECORD +81 -40
  78. kailash/workflow/builder_improvements.py +0 -207
  79. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/WHEEL +0 -0
  80. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/entry_points.txt +0 -0
  81. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/licenses/LICENSE +0 -0
  82. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,931 @@
1
+ """Real-time transaction monitoring node with distributed tracing support.
2
+
3
+ This module provides live transaction monitoring capabilities with real-time
4
+ alerting, distributed tracing, and streaming dashboard support.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import time
11
+ import uuid
12
+ from dataclasses import dataclass, field
13
+ from datetime import UTC, datetime
14
+ from enum import Enum
15
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
16
+
17
+ from kailash.nodes.base import NodeParameter, register_node
18
+ from kailash.nodes.base_async import AsyncNode
19
+ from kailash.sdk_exceptions import NodeExecutionError
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class AlertSeverity(Enum):
    """Alert severity levels, ordered from least to most urgent.

    The built-in checks only emit MEDIUM and HIGH (see _check_span_alerts
    and _monitoring_loop); LOW and CRITICAL are available for external
    alert producers.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
31
+
32
+
33
class TracingProtocol(Enum):
    """Supported tracing protocols.

    NOTE(review): this enum mirrors the ``tracing_protocol`` string
    parameter; the visible code does not yet branch on it — presumably
    reserved for exporter integration. Values match the parameter's
    documented choices.
    """

    OPENTELEMETRY = "opentelemetry"
    JAEGER = "jaeger"
    ZIPKIN = "zipkin"
    CUSTOM = "custom"
40
+
41
+
42
@dataclass
class TransactionSpan:
    """Represents a distributed tracing span.

    A span records a single timed operation inside a trace. ``end_time``
    and ``duration`` stay None until the span is finished via the node's
    ``finish_span`` operation.
    """

    span_id: str  # unique id of this span
    trace_id: str  # trace this span belongs to
    parent_span_id: Optional[str] = None  # None for the root span
    operation_name: str = ""
    start_time: float = field(default_factory=time.time)  # epoch seconds
    end_time: Optional[float] = None  # set when the span finishes
    duration: Optional[float] = None  # end_time - start_time, in seconds
    service_name: str = ""
    tags: Dict[str, str] = field(default_factory=dict)  # used for correlation
    logs: List[Dict[str, Any]] = field(default_factory=list)
    baggage: Dict[str, str] = field(default_factory=dict)  # propagated context
    status: str = "ok"  # "ok", or "error" when an error was reported
    error: Optional[str] = None  # error message when status == "error"
59
+
60
+
61
@dataclass
class TransactionTrace:
    """Represents a complete distributed trace.

    Aggregates the spans belonging to one trace plus derived statistics.
    ``total_duration`` and ``critical_path`` are filled in only when the
    root span is finished.
    """

    trace_id: str
    root_span_id: str  # span that opened the trace
    spans: List[TransactionSpan] = field(default_factory=list)
    total_duration: Optional[float] = None  # root span duration, seconds
    service_count: int = 0  # number of distinct service_names across spans
    span_count: int = 0
    error_count: int = 0  # spans finished with an error
    critical_path: List[str] = field(default_factory=list)  # span ids, longest first
73
+
74
+
75
@dataclass
class TransactionAlert:
    """Represents a transaction monitoring alert.

    Produced by threshold/error checks (_check_span_alerts) and the
    background loop (_monitoring_loop), then passed to alert handlers.
    """

    alert_id: str  # unique id (uuid4 string)
    severity: AlertSeverity
    message: str  # human-readable description
    transaction_id: Optional[str] = None
    trace_id: Optional[str] = None
    metric_name: str = ""  # e.g. "span_duration_p99", "span_error"
    metric_value: float = 0.0  # observed value that triggered the alert
    threshold: float = 0.0  # limit that was exceeded
    timestamp: float = field(default_factory=time.time)  # epoch seconds
    tags: Dict[str, str] = field(default_factory=dict)  # copied from the span
    resolved: bool = False  # never set by this module; for external consumers
90
+
91
+
92
+ @register_node()
93
+ class TransactionMonitorNode(AsyncNode):
94
+ """Node for real-time transaction monitoring and distributed tracing.
95
+
96
+ This node provides comprehensive real-time monitoring including:
97
+ - Live transaction tracking and correlation
98
+ - Distributed tracing with OpenTelemetry support
99
+ - Real-time anomaly detection and alerting
100
+ - WebSocket/SSE streaming for live dashboards
101
+ - Transaction correlation across service boundaries
102
+ - Critical path analysis for performance optimization
103
+
104
+ Design Purpose:
105
+ - Enable real-time performance monitoring
106
+ - Support distributed system troubleshooting
107
+ - Provide actionable alerts for SLA violations
108
+ - Facilitate live dashboard visualization
109
+
110
+ Examples:
111
+ >>> # Start monitoring transactions
112
+ >>> monitor = TransactionMonitorNode()
113
+ >>> result = await monitor.execute(
114
+ ... operation="start_monitoring",
115
+ ... trace_sampling_rate=0.1,
116
+ ... alert_thresholds={
117
+ ... "duration": {"p95": 2.0, "p99": 5.0},
118
+ ... "error_rate": {"threshold": 0.01}
119
+ ... }
120
+ ... )
121
+
122
+ >>> # Create distributed trace
123
+ >>> result = await monitor.execute(
124
+ ... operation="create_trace",
125
+ ... trace_id="trace_12345",
126
+ ... root_operation="order_processing",
127
+ ... service_name="order-service"
128
+ ... )
129
+
130
+ >>> # Add span to trace
131
+ >>> result = await monitor.execute(
132
+ ... operation="add_span",
133
+ ... trace_id="trace_12345",
134
+ ... operation_name="validate_payment",
135
+ ... service_name="payment-service",
136
+ ... parent_span_id="span_abc"
137
+ ... )
138
+ """
139
+
140
    def __init__(self, **kwargs):
        """Initialize the transaction monitor node.

        All monitoring state is held in-memory on this instance; nothing is
        persisted. ``**kwargs`` are forwarded to the AsyncNode base class.
        """
        super().__init__(**kwargs)
        # Live traces/spans, keyed by trace_id / span_id respectively.
        self._active_traces: Dict[str, TransactionTrace] = {}
        self._active_spans: Dict[str, TransactionSpan] = {}
        # Flipped by _start_monitoring/_stop_monitoring; polled by _monitoring_loop.
        self._monitoring_active = False
        # Callbacks awaited by _handle_alert; registered externally.
        self._alert_handlers: List[Callable] = []
        self._stream_handlers: List[Callable] = []
        self._metrics_buffer: List[Dict[str, Any]] = []
        # Shape: {"duration": {"p95": 2.0, ...}, ...}; see _check_span_alerts.
        self._alert_thresholds: Dict[str, Dict[str, float]] = {}
        self._trace_sampling_rate = 1.0  # fraction of traces recorded (0.0-1.0)
        # Strong references so background tasks are not garbage-collected.
        self._background_tasks: Set[asyncio.Task] = set()
        self.logger.info(f"Initialized TransactionMonitorNode: {self.id}")
153
+
154
    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define the parameters this node accepts.

        Only ``operation`` is required; the remaining parameters are
        interpreted per-operation (e.g. ``trace_id``/``span_id`` for span
        operations, ``trace_sampling_rate``/``alert_thresholds`` for
        start_monitoring).
        """
        return {
            "operation": NodeParameter(
                name="operation",
                type=str,
                required=True,
                description="Operation (start_monitoring, stop_monitoring, create_trace, add_span, finish_span, get_trace, get_alerts)",
            ),
            # --- transaction-level parameters ---
            "transaction_id": NodeParameter(
                name="transaction_id",
                type=str,
                required=False,
                description="Transaction identifier for monitoring operations",
            ),
            "success": NodeParameter(
                name="success",
                type=bool,
                required=False,
                description="Whether the transaction completed successfully",
            ),
            # --- tracing parameters ---
            "trace_id": NodeParameter(
                name="trace_id",
                type=str,
                required=False,
                description="Distributed trace identifier",
            ),
            "span_id": NodeParameter(
                name="span_id", type=str, required=False, description="Span identifier"
            ),
            "parent_span_id": NodeParameter(
                name="parent_span_id",
                type=str,
                required=False,
                description="Parent span identifier",
            ),
            "operation_name": NodeParameter(
                name="operation_name",
                type=str,
                required=False,
                description="Name of the operation being traced",
            ),
            "service_name": NodeParameter(
                name="service_name",
                type=str,
                required=False,
                description="Name of the service",
            ),
            "tags": NodeParameter(
                name="tags",
                type=dict,
                required=False,
                default={},
                description="Tags for span/trace",
            ),
            "baggage": NodeParameter(
                name="baggage",
                type=dict,
                required=False,
                default={},
                description="Baggage for distributed context",
            ),
            "error": NodeParameter(
                name="error",
                type=str,
                required=False,
                description="Error message if operation failed",
            ),
            # --- monitoring configuration ---
            "trace_sampling_rate": NodeParameter(
                name="trace_sampling_rate",
                type=float,
                required=False,
                default=1.0,
                description="Sampling rate for traces (0.0 to 1.0)",
            ),
            "alert_thresholds": NodeParameter(
                name="alert_thresholds",
                type=dict,
                required=False,
                default={},
                description="Alert thresholds for monitoring",
            ),
            "tracing_protocol": NodeParameter(
                name="tracing_protocol",
                type=str,
                required=False,
                default="opentelemetry",
                description="Tracing protocol (opentelemetry, jaeger, zipkin, custom)",
            ),
            "enable_streaming": NodeParameter(
                name="enable_streaming",
                type=bool,
                required=False,
                default=False,
                description="Enable real-time streaming for dashboards",
            ),
            "stream_endpoint": NodeParameter(
                name="stream_endpoint",
                type=str,
                required=False,
                description="WebSocket/SSE endpoint for streaming",
            ),
            "correlation_window": NodeParameter(
                name="correlation_window",
                type=float,
                required=False,
                default=30.0,
                description="Time window for transaction correlation in seconds",
            ),
        }
264
+
265
    def get_output_schema(self) -> Dict[str, NodeParameter]:
        """Define the output schema for this node.

        Every operation handler returns this same envelope; fields not
        relevant to a given operation are returned empty ({} or []).
        """
        return {
            "monitoring_status": NodeParameter(
                name="monitoring_status",
                type=str,
                description="Current monitoring status",
            ),
            "trace_data": NodeParameter(
                name="trace_data", type=dict, description="Trace information"
            ),
            "span_data": NodeParameter(
                name="span_data", type=dict, description="Span information"
            ),
            "alerts": NodeParameter(
                name="alerts", type=list, description="Active alerts"
            ),
            "metrics": NodeParameter(
                name="metrics", type=dict, description="Real-time metrics"
            ),
            "correlation_id": NodeParameter(
                name="correlation_id",
                type=str,
                description="Correlation ID for tracking",
            ),
            "timestamp": NodeParameter(
                name="timestamp", type=str, description="ISO timestamp of operation"
            ),
            "status": NodeParameter(
                name="status", type=str, description="Operation status"
            ),
        }
297
+
298
+ async def async_run(self, **kwargs) -> Dict[str, Any]:
299
+ """Execute transaction monitoring operation."""
300
+ operation = kwargs.get("operation")
301
+
302
+ try:
303
+ if operation == "start_monitoring":
304
+ return await self._start_monitoring(**kwargs)
305
+ elif operation == "stop_monitoring":
306
+ return await self._stop_monitoring(**kwargs)
307
+ elif operation == "start_transaction":
308
+ return await self._start_transaction(**kwargs)
309
+ elif operation == "complete_transaction":
310
+ return await self._complete_transaction(**kwargs)
311
+ elif operation == "get_monitoring_status":
312
+ return await self._get_monitoring_status(**kwargs)
313
+ elif operation == "create_trace":
314
+ return await self._create_trace(**kwargs)
315
+ elif operation == "add_span":
316
+ return await self._add_span(**kwargs)
317
+ elif operation == "finish_span":
318
+ return await self._finish_span(**kwargs)
319
+ elif operation == "get_trace":
320
+ return await self._get_trace(**kwargs)
321
+ elif operation == "get_alerts":
322
+ return await self._get_alerts(**kwargs)
323
+ elif operation == "correlate_transactions":
324
+ return await self._correlate_transactions(**kwargs)
325
+ else:
326
+ raise ValueError(f"Unknown operation: {operation}")
327
+
328
+ except Exception as e:
329
+ self.logger.error(f"Transaction monitoring operation failed: {str(e)}")
330
+ raise NodeExecutionError(
331
+ f"Failed to execute monitoring operation: {str(e)}"
332
+ )
333
+
334
+ async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
335
+ """Start real-time transaction monitoring."""
336
+ self._trace_sampling_rate = kwargs.get("trace_sampling_rate", 1.0)
337
+ self._alert_thresholds = kwargs.get("alert_thresholds", {})
338
+ enable_streaming = kwargs.get("enable_streaming", False)
339
+ stream_endpoint = kwargs.get("stream_endpoint")
340
+
341
+ # Start background monitoring task
342
+ if not self._monitoring_active:
343
+ self._monitoring_active = True
344
+ monitoring_task = asyncio.create_task(self._monitoring_loop())
345
+ self._background_tasks.add(monitoring_task)
346
+ monitoring_task.add_done_callback(self._background_tasks.discard)
347
+
348
+ # Setup streaming if enabled
349
+ if enable_streaming and stream_endpoint:
350
+ streaming_task = asyncio.create_task(self._setup_streaming(stream_endpoint))
351
+ self._background_tasks.add(streaming_task)
352
+ streaming_task.add_done_callback(self._background_tasks.discard)
353
+
354
+ self.logger.info(
355
+ f"Started transaction monitoring with sampling rate {self._trace_sampling_rate}"
356
+ )
357
+
358
+ return {
359
+ "monitoring_status": "active",
360
+ "trace_data": {},
361
+ "span_data": {},
362
+ "alerts": [],
363
+ "metrics": {"sampling_rate": self._trace_sampling_rate},
364
+ "correlation_id": str(uuid.uuid4()),
365
+ "timestamp": datetime.now(UTC).isoformat(),
366
+ "status": "success",
367
+ }
368
+
369
+ async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
370
+ """Stop real-time transaction monitoring."""
371
+ self._monitoring_active = False
372
+
373
+ # Cancel background tasks
374
+ for task in self._background_tasks:
375
+ if not task.done():
376
+ task.cancel()
377
+
378
+ # Wait for tasks to complete
379
+ if self._background_tasks:
380
+ await asyncio.gather(*self._background_tasks, return_exceptions=True)
381
+
382
+ self._background_tasks.clear()
383
+
384
+ self.logger.info("Stopped transaction monitoring")
385
+
386
+ return {
387
+ "monitoring_status": "stopped",
388
+ "trace_data": {},
389
+ "span_data": {},
390
+ "alerts": [],
391
+ "metrics": {"active_traces": len(self._active_traces)},
392
+ "correlation_id": str(uuid.uuid4()),
393
+ "timestamp": datetime.now(UTC).isoformat(),
394
+ "status": "success",
395
+ }
396
+
397
+ async def _create_trace(self, **kwargs) -> Dict[str, Any]:
398
+ """Create a new distributed trace."""
399
+ trace_id = kwargs.get("trace_id") or str(uuid.uuid4())
400
+ root_operation = kwargs.get("operation_name", "unknown")
401
+ service_name = kwargs.get("service_name", "unknown")
402
+ tags = kwargs.get("tags", {})
403
+
404
+ # Check sampling
405
+ if self._trace_sampling_rate < 1.0:
406
+ import random
407
+
408
+ if random.random() > self._trace_sampling_rate:
409
+ # Skip this trace
410
+ return {
411
+ "monitoring_status": "sampling_skipped",
412
+ "trace_data": {"trace_id": trace_id, "sampled": False},
413
+ "span_data": {},
414
+ "alerts": [],
415
+ "metrics": {},
416
+ "correlation_id": trace_id,
417
+ "timestamp": datetime.now(UTC).isoformat(),
418
+ "status": "success",
419
+ }
420
+
421
+ # Create root span
422
+ root_span_id = str(uuid.uuid4())
423
+ root_span = TransactionSpan(
424
+ span_id=root_span_id,
425
+ trace_id=trace_id,
426
+ operation_name=root_operation,
427
+ service_name=service_name,
428
+ tags=tags,
429
+ )
430
+
431
+ # Create trace
432
+ trace = TransactionTrace(
433
+ trace_id=trace_id,
434
+ root_span_id=root_span_id,
435
+ spans=[root_span],
436
+ span_count=1,
437
+ service_count=1,
438
+ )
439
+
440
+ self._active_traces[trace_id] = trace
441
+ self._active_spans[root_span_id] = root_span
442
+
443
+ self.logger.debug(f"Created trace {trace_id} with root span {root_span_id}")
444
+
445
+ return {
446
+ "monitoring_status": "trace_created",
447
+ "trace_data": {
448
+ "trace_id": trace_id,
449
+ "root_span_id": root_span_id,
450
+ "sampled": True,
451
+ },
452
+ "span_data": {
453
+ "span_id": root_span_id,
454
+ "operation_name": root_operation,
455
+ "service_name": service_name,
456
+ },
457
+ "alerts": [],
458
+ "metrics": {"active_traces": len(self._active_traces)},
459
+ "correlation_id": trace_id,
460
+ "timestamp": datetime.now(UTC).isoformat(),
461
+ "status": "success",
462
+ }
463
+
464
+ async def _add_span(self, **kwargs) -> Dict[str, Any]:
465
+ """Add a new span to an existing trace."""
466
+ trace_id = kwargs.get("trace_id")
467
+ if not trace_id or trace_id not in self._active_traces:
468
+ raise ValueError(f"Trace {trace_id} not found")
469
+
470
+ span_id = kwargs.get("span_id") or str(uuid.uuid4())
471
+ parent_span_id = kwargs.get("parent_span_id")
472
+ operation_name = kwargs.get("operation_name", "unknown")
473
+ service_name = kwargs.get("service_name", "unknown")
474
+ tags = kwargs.get("tags", {})
475
+ baggage = kwargs.get("baggage", {})
476
+
477
+ # Create span
478
+ span = TransactionSpan(
479
+ span_id=span_id,
480
+ trace_id=trace_id,
481
+ parent_span_id=parent_span_id,
482
+ operation_name=operation_name,
483
+ service_name=service_name,
484
+ tags=tags,
485
+ baggage=baggage,
486
+ )
487
+
488
+ # Add to trace
489
+ trace = self._active_traces[trace_id]
490
+ trace.spans.append(span)
491
+ trace.span_count += 1
492
+
493
+ # Update service count
494
+ services = set(s.service_name for s in trace.spans)
495
+ trace.service_count = len(services)
496
+
497
+ self._active_spans[span_id] = span
498
+
499
+ self.logger.debug(f"Added span {span_id} to trace {trace_id}")
500
+
501
+ return {
502
+ "monitoring_status": "span_added",
503
+ "trace_data": {
504
+ "trace_id": trace_id,
505
+ "span_count": trace.span_count,
506
+ "service_count": trace.service_count,
507
+ },
508
+ "span_data": {
509
+ "span_id": span_id,
510
+ "operation_name": operation_name,
511
+ "service_name": service_name,
512
+ "parent_span_id": parent_span_id,
513
+ },
514
+ "alerts": [],
515
+ "metrics": {"active_spans": len(self._active_spans)},
516
+ "correlation_id": trace_id,
517
+ "timestamp": datetime.now(UTC).isoformat(),
518
+ "status": "success",
519
+ }
520
+
521
+ async def _finish_span(self, **kwargs) -> Dict[str, Any]:
522
+ """Finish an active span."""
523
+ span_id = kwargs.get("span_id")
524
+ if not span_id or span_id not in self._active_spans:
525
+ raise ValueError(f"Span {span_id} not found")
526
+
527
+ span = self._active_spans[span_id]
528
+ error = kwargs.get("error")
529
+
530
+ # Complete span
531
+ span.end_time = time.time()
532
+ span.duration = span.end_time - span.start_time
533
+
534
+ if error:
535
+ span.status = "error"
536
+ span.error = error
537
+
538
+ # Update trace error count
539
+ trace = self._active_traces.get(span.trace_id)
540
+ if trace:
541
+ trace.error_count += 1
542
+
543
+ # Check for alerts
544
+ alerts = await self._check_span_alerts(span)
545
+
546
+ # Remove from active spans
547
+ del self._active_spans[span_id]
548
+
549
+ # Check if trace is complete
550
+ trace = self._active_traces.get(span.trace_id)
551
+ if trace and span.span_id == trace.root_span_id:
552
+ # Root span finished, calculate trace duration
553
+ trace.total_duration = span.duration
554
+ trace.critical_path = self._calculate_critical_path(trace)
555
+
556
+ # Move to completed traces (not implemented in this basic version)
557
+ # del self._active_traces[span.trace_id]
558
+
559
+ self.logger.debug(f"Finished span {span_id} with duration {span.duration:.3f}s")
560
+
561
+ return {
562
+ "monitoring_status": "span_finished",
563
+ "trace_data": {
564
+ "trace_id": span.trace_id,
565
+ "total_duration": trace.total_duration if trace else None,
566
+ },
567
+ "span_data": {
568
+ "span_id": span_id,
569
+ "duration": span.duration,
570
+ "status": span.status,
571
+ },
572
+ "alerts": [self._serialize_alert(a) for a in alerts],
573
+ "metrics": {"active_spans": len(self._active_spans)},
574
+ "correlation_id": span.trace_id,
575
+ "timestamp": datetime.now(UTC).isoformat(),
576
+ "status": "success",
577
+ }
578
+
579
+ async def _get_trace(self, **kwargs) -> Dict[str, Any]:
580
+ """Get trace information."""
581
+ trace_id = kwargs.get("trace_id")
582
+ if not trace_id:
583
+ raise ValueError("trace_id is required")
584
+
585
+ trace = self._active_traces.get(trace_id)
586
+ if not trace:
587
+ raise ValueError(f"Trace {trace_id} not found")
588
+
589
+ # Serialize trace data
590
+ trace_data = {
591
+ "trace_id": trace.trace_id,
592
+ "root_span_id": trace.root_span_id,
593
+ "total_duration": trace.total_duration,
594
+ "span_count": trace.span_count,
595
+ "service_count": trace.service_count,
596
+ "error_count": trace.error_count,
597
+ "critical_path": trace.critical_path,
598
+ "spans": [self._serialize_span(s) for s in trace.spans],
599
+ }
600
+
601
+ return {
602
+ "monitoring_status": "trace_retrieved",
603
+ "trace_data": trace_data,
604
+ "span_data": {},
605
+ "alerts": [],
606
+ "metrics": {"span_count": trace.span_count},
607
+ "correlation_id": trace_id,
608
+ "timestamp": datetime.now(UTC).isoformat(),
609
+ "status": "success",
610
+ }
611
+
612
    async def _get_alerts(self, **kwargs) -> Dict[str, Any]:
        """Get active alerts.

        Placeholder implementation: alerts are not persisted anywhere in
        this module (handlers receive them transiently), so this always
        returns an empty list.
        """
        # In a real implementation, this would query an alerts database
        # For now, return empty list
        return {
            "monitoring_status": "alerts_retrieved",
            "trace_data": {},
            "span_data": {},
            "alerts": [],
            "metrics": {"active_alerts": 0},
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }
626
+
627
+ async def _correlate_transactions(self, **kwargs) -> Dict[str, Any]:
628
+ """Correlate transactions across service boundaries."""
629
+ correlation_window = kwargs.get("correlation_window", 30.0)
630
+ current_time = time.time()
631
+
632
+ # Find traces within correlation window
633
+ recent_traces = []
634
+ for trace in self._active_traces.values():
635
+ if any(
636
+ s.start_time >= current_time - correlation_window for s in trace.spans
637
+ ):
638
+ recent_traces.append(trace)
639
+
640
+ # Group by common tags/baggage
641
+ correlations = {}
642
+ for trace in recent_traces:
643
+ for span in trace.spans:
644
+ for tag_key, tag_value in span.tags.items():
645
+ correlation_key = f"{tag_key}:{tag_value}"
646
+ if correlation_key not in correlations:
647
+ correlations[correlation_key] = []
648
+ correlations[correlation_key].append(
649
+ {
650
+ "trace_id": trace.trace_id,
651
+ "span_id": span.span_id,
652
+ "service_name": span.service_name,
653
+ "operation_name": span.operation_name,
654
+ }
655
+ )
656
+
657
+ # Filter correlations with multiple traces
658
+ significant_correlations = {
659
+ k: v
660
+ for k, v in correlations.items()
661
+ if len(set(item["trace_id"] for item in v)) > 1
662
+ }
663
+
664
+ return {
665
+ "monitoring_status": "correlations_found",
666
+ "trace_data": {},
667
+ "span_data": {},
668
+ "alerts": [],
669
+ "metrics": {
670
+ "correlations_found": len(significant_correlations),
671
+ "traces_analyzed": len(recent_traces),
672
+ },
673
+ "correlation_id": str(uuid.uuid4()),
674
+ "timestamp": datetime.now(UTC).isoformat(),
675
+ "status": "success",
676
+ }
677
+
678
+ async def _monitoring_loop(self):
679
+ """Background monitoring loop for real-time alerts."""
680
+ while self._monitoring_active:
681
+ try:
682
+ await asyncio.sleep(1.0) # Check every second
683
+
684
+ # Check for long-running spans
685
+ current_time = time.time()
686
+ for span in self._active_spans.values():
687
+ duration = current_time - span.start_time
688
+ if duration > 10.0: # Alert on spans > 10 seconds
689
+ alert = TransactionAlert(
690
+ alert_id=str(uuid.uuid4()),
691
+ severity=AlertSeverity.MEDIUM,
692
+ message=f"Long-running span detected: {span.operation_name}",
693
+ trace_id=span.trace_id,
694
+ metric_name="span_duration",
695
+ metric_value=duration,
696
+ threshold=10.0,
697
+ tags=span.tags,
698
+ )
699
+ await self._handle_alert(alert)
700
+
701
+ except asyncio.CancelledError:
702
+ break
703
+ except Exception as e:
704
+ self.logger.error(f"Monitoring loop error: {e}")
705
+
706
+ async def _setup_streaming(self, endpoint: str):
707
+ """Setup streaming for real-time dashboard updates."""
708
+ # In a real implementation, this would setup WebSocket/SSE connections
709
+ self.logger.info(f"Would setup streaming to {endpoint}")
710
+ # Placeholder for streaming setup
711
+ pass
712
+
713
+ async def _check_span_alerts(self, span: TransactionSpan) -> List[TransactionAlert]:
714
+ """Check if span triggers any alerts."""
715
+ alerts = []
716
+
717
+ # Check duration thresholds
718
+ if span.duration and "duration" in self._alert_thresholds:
719
+ thresholds = self._alert_thresholds["duration"]
720
+
721
+ for threshold_name, threshold_value in thresholds.items():
722
+ if span.duration > threshold_value:
723
+ alert = TransactionAlert(
724
+ alert_id=str(uuid.uuid4()),
725
+ severity=(
726
+ AlertSeverity.HIGH
727
+ if threshold_name == "p99"
728
+ else AlertSeverity.MEDIUM
729
+ ),
730
+ message=f"Span duration {span.duration:.3f}s exceeds {threshold_name} threshold {threshold_value}s",
731
+ trace_id=span.trace_id,
732
+ metric_name=f"span_duration_{threshold_name}",
733
+ metric_value=span.duration,
734
+ threshold=threshold_value,
735
+ tags=span.tags,
736
+ )
737
+ alerts.append(alert)
738
+
739
+ # Check for errors
740
+ if span.error:
741
+ alert = TransactionAlert(
742
+ alert_id=str(uuid.uuid4()),
743
+ severity=AlertSeverity.HIGH,
744
+ message=f"Span error: {span.error}",
745
+ trace_id=span.trace_id,
746
+ metric_name="span_error",
747
+ metric_value=1.0,
748
+ threshold=0.0,
749
+ tags=span.tags,
750
+ )
751
+ alerts.append(alert)
752
+
753
+ return alerts
754
+
755
+ async def _handle_alert(self, alert: TransactionAlert):
756
+ """Handle a generated alert."""
757
+ # In a real implementation, this would send to alerting systems
758
+ self.logger.warning(f"Alert: {alert.severity.value} - {alert.message}")
759
+
760
+ # Call registered alert handlers
761
+ for handler in self._alert_handlers:
762
+ try:
763
+ await handler(alert)
764
+ except Exception as e:
765
+ self.logger.error(f"Alert handler error: {e}")
766
+
767
+ def _calculate_critical_path(self, trace: TransactionTrace) -> List[str]:
768
+ """Calculate critical path through the trace."""
769
+ # Simple implementation: find longest duration path
770
+ # In a real implementation, this would use graph algorithms
771
+ spans_by_duration = sorted(
772
+ trace.spans, key=lambda s: s.duration or 0, reverse=True
773
+ )
774
+ return [s.span_id for s in spans_by_duration[:3]] # Top 3 spans
775
+
776
+ def _serialize_span(self, span: TransactionSpan) -> Dict[str, Any]:
777
+ """Serialize a span to dictionary."""
778
+ return {
779
+ "span_id": span.span_id,
780
+ "trace_id": span.trace_id,
781
+ "parent_span_id": span.parent_span_id,
782
+ "operation_name": span.operation_name,
783
+ "service_name": span.service_name,
784
+ "start_time": span.start_time,
785
+ "end_time": span.end_time,
786
+ "duration": span.duration,
787
+ "status": span.status,
788
+ "error": span.error,
789
+ "tags": span.tags,
790
+ "baggage": span.baggage,
791
+ }
792
+
793
+ def _serialize_alert(self, alert: TransactionAlert) -> Dict[str, Any]:
794
+ """Serialize an alert to dictionary."""
795
+ return {
796
+ "alert_id": alert.alert_id,
797
+ "severity": alert.severity.value,
798
+ "message": alert.message,
799
+ "transaction_id": alert.transaction_id,
800
+ "trace_id": alert.trace_id,
801
+ "metric_name": alert.metric_name,
802
+ "metric_value": alert.metric_value,
803
+ "threshold": alert.threshold,
804
+ "timestamp": alert.timestamp,
805
+ "tags": alert.tags,
806
+ "resolved": alert.resolved,
807
+ }
808
+
809
+ def run(self, **kwargs) -> Dict[str, Any]:
810
+ """Synchronous wrapper for compatibility."""
811
+ import asyncio
812
+
813
+ return asyncio.run(self.async_run(**kwargs))
814
+
815
+ async def _start_transaction(self, **kwargs) -> Dict[str, Any]:
816
+ """Start monitoring a specific transaction."""
817
+ transaction_id = kwargs.get("transaction_id", str(uuid.uuid4()))
818
+ transaction_type = kwargs.get("transaction_type", "default")
819
+ metadata = kwargs.get("metadata", {})
820
+
821
+ # Create trace for transaction
822
+ trace_id = str(uuid.uuid4())
823
+ span_id = str(uuid.uuid4())
824
+
825
+ # Store transaction info
826
+ self._active_traces[trace_id] = {
827
+ "transaction_id": transaction_id,
828
+ "transaction_type": transaction_type,
829
+ "start_time": time.time(),
830
+ "metadata": metadata,
831
+ "spans": [span_id],
832
+ "status": "active",
833
+ }
834
+
835
+ self.logger.info(f"Started transaction monitoring for {transaction_id}")
836
+
837
+ return {
838
+ "monitoring_status": "transaction_started",
839
+ "trace_data": {"trace_id": trace_id, "transaction_id": transaction_id},
840
+ "span_data": {"span_id": span_id, "operation": "transaction_start"},
841
+ "alerts": [],
842
+ "metrics": {"active_transactions": len(self._active_traces)},
843
+ "correlation_id": transaction_id,
844
+ "timestamp": datetime.now(UTC).isoformat(),
845
+ "status": "success",
846
+ }
847
+
848
+ async def _complete_transaction(self, **kwargs) -> Dict[str, Any]:
849
+ """Complete a transaction and update monitoring status."""
850
+ self.logger.debug(f"Complete transaction called with kwargs: {kwargs}")
851
+ transaction_id = kwargs.get("transaction_id")
852
+ status = kwargs.get("status", "completed")
853
+
854
+ if not transaction_id:
855
+ raise ValueError(
856
+ f"transaction_id is required for complete_transaction. Received kwargs: {kwargs}"
857
+ )
858
+
859
+ # Mark the transaction as completed in active traces
860
+ if transaction_id in self._active_traces:
861
+ trace_data = self._active_traces[transaction_id]
862
+ trace_data["end_time"] = time.time()
863
+ trace_data["status"] = status
864
+ trace_data["duration"] = trace_data["end_time"] - trace_data.get(
865
+ "start_time", 0
866
+ )
867
+
868
+ # Move to completed traces if we track them
869
+ # For now, just mark as completed in place
870
+
871
+ return {
872
+ "monitoring_active": self._monitoring_active,
873
+ "transaction_id": transaction_id,
874
+ "transaction_status": status,
875
+ "monitoring_status": "transaction_completed",
876
+ "trace_data": {
877
+ "trace_id": f"trace_{transaction_id}",
878
+ "transaction_id": transaction_id,
879
+ },
880
+ "span_data": {
881
+ "span_id": f"span_{transaction_id}",
882
+ "operation": "transaction_complete",
883
+ },
884
+ "alerts": [],
885
+ "metrics": {"active_transactions": len(self._active_traces)},
886
+ "correlation_id": transaction_id,
887
+ "timestamp": datetime.now(UTC).isoformat(),
888
+ "status": "success",
889
+ }
890
+
891
+ async def _get_monitoring_status(self, **kwargs) -> Dict[str, Any]:
892
+ """Get current monitoring status and metrics."""
893
+ active_traces_count = len(self._active_traces)
894
+ active_spans_count = sum(
895
+ len(trace_data.get("spans", []))
896
+ for trace_data in self._active_traces.values()
897
+ )
898
+
899
+ # Calculate performance metrics
900
+ current_time = time.time()
901
+ recent_traces = [
902
+ trace
903
+ for trace in self._active_traces.values()
904
+ if current_time - trace.get("start_time", 0) < 300 # Last 5 minutes
905
+ ]
906
+
907
+ status_info = {
908
+ "monitoring_active": self._monitoring_active,
909
+ "total_active_traces": active_traces_count,
910
+ "total_active_spans": active_spans_count,
911
+ "recent_traces_5min": len(recent_traces),
912
+ "sampling_rate": self._trace_sampling_rate,
913
+ "alert_thresholds": self._alert_thresholds,
914
+ "background_tasks": len(self._background_tasks),
915
+ }
916
+
917
+ return {
918
+ "monitoring_status": "active" if self._monitoring_active else "inactive",
919
+ "trace_data": {"active_traces": active_traces_count},
920
+ "span_data": {"active_spans": active_spans_count},
921
+ "alerts": [],
922
+ "metrics": status_info,
923
+ "correlation_id": str(uuid.uuid4()),
924
+ "timestamp": datetime.now(UTC).isoformat(),
925
+ "status": "success",
926
+ }
927
+
928
+ async def cleanup(self):
929
+ """Cleanup resources when node is destroyed."""
930
+ await self._stop_monitoring()
931
+ await super().cleanup() if hasattr(super(), "cleanup") else None