kailash 0.6.6__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +35 -5
- kailash/adapters/__init__.py +5 -0
- kailash/adapters/mcp_platform_adapter.py +273 -0
- kailash/channels/__init__.py +21 -0
- kailash/channels/api_channel.py +409 -0
- kailash/channels/base.py +271 -0
- kailash/channels/cli_channel.py +661 -0
- kailash/channels/event_router.py +496 -0
- kailash/channels/mcp_channel.py +648 -0
- kailash/channels/session.py +423 -0
- kailash/mcp_server/discovery.py +1 -1
- kailash/middleware/mcp/enhanced_server.py +22 -16
- kailash/nexus/__init__.py +21 -0
- kailash/nexus/factory.py +413 -0
- kailash/nexus/gateway.py +545 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/iterative_llm_agent.py +988 -17
- kailash/nodes/ai/llm_agent.py +29 -9
- kailash/nodes/api/__init__.py +2 -2
- kailash/nodes/api/monitoring.py +1 -1
- kailash/nodes/base_async.py +54 -14
- kailash/nodes/code/async_python.py +1 -1
- kailash/nodes/data/bulk_operations.py +939 -0
- kailash/nodes/data/query_builder.py +373 -0
- kailash/nodes/data/query_cache.py +512 -0
- kailash/nodes/monitoring/__init__.py +10 -0
- kailash/nodes/monitoring/deadlock_detector.py +964 -0
- kailash/nodes/monitoring/performance_anomaly.py +1078 -0
- kailash/nodes/monitoring/race_condition_detector.py +1151 -0
- kailash/nodes/monitoring/transaction_metrics.py +790 -0
- kailash/nodes/monitoring/transaction_monitor.py +931 -0
- kailash/nodes/system/__init__.py +17 -0
- kailash/nodes/system/command_parser.py +820 -0
- kailash/nodes/transaction/__init__.py +48 -0
- kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
- kailash/nodes/transaction/saga_coordinator.py +652 -0
- kailash/nodes/transaction/saga_state_storage.py +411 -0
- kailash/nodes/transaction/saga_step.py +467 -0
- kailash/nodes/transaction/transaction_context.py +756 -0
- kailash/nodes/transaction/two_phase_commit.py +978 -0
- kailash/nodes/transform/processors.py +17 -1
- kailash/nodes/validation/__init__.py +21 -0
- kailash/nodes/validation/test_executor.py +532 -0
- kailash/nodes/validation/validation_nodes.py +447 -0
- kailash/resources/factory.py +1 -1
- kailash/runtime/async_local.py +84 -21
- kailash/runtime/local.py +21 -2
- kailash/runtime/parameter_injector.py +187 -31
- kailash/security.py +16 -1
- kailash/servers/__init__.py +32 -0
- kailash/servers/durable_workflow_server.py +430 -0
- kailash/servers/enterprise_workflow_server.py +466 -0
- kailash/servers/gateway.py +183 -0
- kailash/servers/workflow_server.py +290 -0
- kailash/utils/data_validation.py +192 -0
- kailash/workflow/builder.py +291 -12
- kailash/workflow/validation.py +144 -8
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/METADATA +1 -1
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/RECORD +63 -25
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/WHEEL +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/top_level.txt +0 -0
kailash/nodes/monitoring/transaction_monitor.py (new file)
@@ -0,0 +1,931 @@
"""Real-time transaction monitoring node with distributed tracing support.

This module provides live transaction monitoring capabilities with real-time
alerting, distributed tracing, and streaming dashboard support.
"""

import asyncio
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

from kailash.nodes.base import NodeParameter, register_node
from kailash.nodes.base_async import AsyncNode
from kailash.sdk_exceptions import NodeExecutionError

logger = logging.getLogger(__name__)


class AlertSeverity(Enum):
    """Alert severity levels."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class TracingProtocol(Enum):
    """Supported tracing protocols."""

    OPENTELEMETRY = "opentelemetry"
    JAEGER = "jaeger"
    ZIPKIN = "zipkin"
    CUSTOM = "custom"


@dataclass
class TransactionSpan:
    """Represents a distributed tracing span."""

    span_id: str
    trace_id: str
    parent_span_id: Optional[str] = None
    operation_name: str = ""
    start_time: float = field(default_factory=time.time)
    end_time: Optional[float] = None
    duration: Optional[float] = None
    service_name: str = ""
    tags: Dict[str, str] = field(default_factory=dict)
    logs: List[Dict[str, Any]] = field(default_factory=list)
    baggage: Dict[str, str] = field(default_factory=dict)
    status: str = "ok"
    error: Optional[str] = None


@dataclass
class TransactionTrace:
    """Represents a complete distributed trace."""

    trace_id: str
    root_span_id: str
    spans: List[TransactionSpan] = field(default_factory=list)
    total_duration: Optional[float] = None
    service_count: int = 0
    span_count: int = 0
    error_count: int = 0
    critical_path: List[str] = field(default_factory=list)


@dataclass
class TransactionAlert:
    """Represents a transaction monitoring alert."""

    alert_id: str
    severity: AlertSeverity
    message: str
    transaction_id: Optional[str] = None
    trace_id: Optional[str] = None
    metric_name: str = ""
    metric_value: float = 0.0
    threshold: float = 0.0
    timestamp: float = field(default_factory=time.time)
    tags: Dict[str, str] = field(default_factory=dict)
    resolved: bool = False


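# Illustrative sketch (not part of this file): how the dataclasses above nest.
# A TransactionTrace owns its TransactionSpans; the root span's duration becomes
# the trace's total_duration when that span finishes. Values are hypothetical.
#
#     span = TransactionSpan(span_id="s1", trace_id="t1",
#                            operation_name="checkout", service_name="orders")
#     span.end_time = span.start_time + 0.25
#     span.duration = span.end_time - span.start_time   # 0.25s
#     trace = TransactionTrace(trace_id="t1", root_span_id="s1",
#                              spans=[span], span_count=1, service_count=1)
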
@register_node()
class TransactionMonitorNode(AsyncNode):
    """Node for real-time transaction monitoring and distributed tracing.

    This node provides comprehensive real-time monitoring including:
    - Live transaction tracking and correlation
    - Distributed tracing with OpenTelemetry support
    - Real-time anomaly detection and alerting
    - WebSocket/SSE streaming for live dashboards
    - Transaction correlation across service boundaries
    - Critical path analysis for performance optimization

    Design Purpose:
    - Enable real-time performance monitoring
    - Support distributed system troubleshooting
    - Provide actionable alerts for SLA violations
    - Facilitate live dashboard visualization

    Examples:
        >>> # Start monitoring transactions
        >>> monitor = TransactionMonitorNode()
        >>> result = await monitor.execute(
        ...     operation="start_monitoring",
        ...     trace_sampling_rate=0.1,
        ...     alert_thresholds={
        ...         "duration": {"p95": 2.0, "p99": 5.0},
        ...         "error_rate": {"threshold": 0.01}
        ...     }
        ... )

        >>> # Create distributed trace
        >>> result = await monitor.execute(
        ...     operation="create_trace",
        ...     trace_id="trace_12345",
        ...     operation_name="order_processing",
        ...     service_name="order-service"
        ... )

        >>> # Add span to trace
        >>> result = await monitor.execute(
        ...     operation="add_span",
        ...     trace_id="trace_12345",
        ...     operation_name="validate_payment",
        ...     service_name="payment-service",
        ...     parent_span_id="span_abc"
        ... )
    """

    def __init__(self, **kwargs):
        """Initialize the transaction monitor node."""
        super().__init__(**kwargs)
        self._active_traces: Dict[str, TransactionTrace] = {}
        self._active_spans: Dict[str, TransactionSpan] = {}
        self._monitoring_active = False
        self._alert_handlers: List[Callable] = []
        self._stream_handlers: List[Callable] = []
        self._metrics_buffer: List[Dict[str, Any]] = []
        self._alert_thresholds: Dict[str, Dict[str, float]] = {}
        self._trace_sampling_rate = 1.0
        self._background_tasks: Set[asyncio.Task] = set()
        self.logger.info(f"Initialized TransactionMonitorNode: {self.id}")

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define the parameters this node accepts."""
        return {
            "operation": NodeParameter(
                name="operation",
                type=str,
                required=True,
                description="Operation (start_monitoring, stop_monitoring, start_transaction, complete_transaction, get_monitoring_status, create_trace, add_span, finish_span, get_trace, get_alerts, correlate_transactions)",
            ),
            "transaction_id": NodeParameter(
                name="transaction_id",
                type=str,
                required=False,
                description="Transaction identifier for monitoring operations",
            ),
            "success": NodeParameter(
                name="success",
                type=bool,
                required=False,
                description="Whether the transaction completed successfully",
            ),
            "trace_id": NodeParameter(
                name="trace_id",
                type=str,
                required=False,
                description="Distributed trace identifier",
            ),
            "span_id": NodeParameter(
                name="span_id", type=str, required=False, description="Span identifier"
            ),
            "parent_span_id": NodeParameter(
                name="parent_span_id",
                type=str,
                required=False,
                description="Parent span identifier",
            ),
            "operation_name": NodeParameter(
                name="operation_name",
                type=str,
                required=False,
                description="Name of the operation being traced",
            ),
            "service_name": NodeParameter(
                name="service_name",
                type=str,
                required=False,
                description="Name of the service",
            ),
            "tags": NodeParameter(
                name="tags",
                type=dict,
                required=False,
                default={},
                description="Tags for span/trace",
            ),
            "baggage": NodeParameter(
                name="baggage",
                type=dict,
                required=False,
                default={},
                description="Baggage for distributed context",
            ),
            "error": NodeParameter(
                name="error",
                type=str,
                required=False,
                description="Error message if operation failed",
            ),
            "trace_sampling_rate": NodeParameter(
                name="trace_sampling_rate",
                type=float,
                required=False,
                default=1.0,
                description="Sampling rate for traces (0.0 to 1.0)",
            ),
            "alert_thresholds": NodeParameter(
                name="alert_thresholds",
                type=dict,
                required=False,
                default={},
                description="Alert thresholds for monitoring",
            ),
            "tracing_protocol": NodeParameter(
                name="tracing_protocol",
                type=str,
                required=False,
                default="opentelemetry",
                description="Tracing protocol (opentelemetry, jaeger, zipkin, custom)",
            ),
            "enable_streaming": NodeParameter(
                name="enable_streaming",
                type=bool,
                required=False,
                default=False,
                description="Enable real-time streaming for dashboards",
            ),
            "stream_endpoint": NodeParameter(
                name="stream_endpoint",
                type=str,
                required=False,
                description="WebSocket/SSE endpoint for streaming",
            ),
            "correlation_window": NodeParameter(
                name="correlation_window",
                type=float,
                required=False,
                default=30.0,
                description="Time window for transaction correlation in seconds",
            ),
        }

    def get_output_schema(self) -> Dict[str, NodeParameter]:
        """Define the output schema for this node."""
        return {
            "monitoring_status": NodeParameter(
                name="monitoring_status",
                type=str,
                description="Current monitoring status",
            ),
            "trace_data": NodeParameter(
                name="trace_data", type=dict, description="Trace information"
            ),
            "span_data": NodeParameter(
                name="span_data", type=dict, description="Span information"
            ),
            "alerts": NodeParameter(
                name="alerts", type=list, description="Active alerts"
            ),
            "metrics": NodeParameter(
                name="metrics", type=dict, description="Real-time metrics"
            ),
            "correlation_id": NodeParameter(
                name="correlation_id",
                type=str,
                description="Correlation ID for tracking",
            ),
            "timestamp": NodeParameter(
                name="timestamp", type=str, description="ISO timestamp of operation"
            ),
            "status": NodeParameter(
                name="status", type=str, description="Operation status"
            ),
        }

    async def async_run(self, **kwargs) -> Dict[str, Any]:
        """Execute transaction monitoring operation."""
        operation = kwargs.get("operation")

        try:
            if operation == "start_monitoring":
                return await self._start_monitoring(**kwargs)
            elif operation == "stop_monitoring":
                return await self._stop_monitoring(**kwargs)
            elif operation == "start_transaction":
                return await self._start_transaction(**kwargs)
            elif operation == "complete_transaction":
                return await self._complete_transaction(**kwargs)
            elif operation == "get_monitoring_status":
                return await self._get_monitoring_status(**kwargs)
            elif operation == "create_trace":
                return await self._create_trace(**kwargs)
            elif operation == "add_span":
                return await self._add_span(**kwargs)
            elif operation == "finish_span":
                return await self._finish_span(**kwargs)
            elif operation == "get_trace":
                return await self._get_trace(**kwargs)
            elif operation == "get_alerts":
                return await self._get_alerts(**kwargs)
            elif operation == "correlate_transactions":
                return await self._correlate_transactions(**kwargs)
            else:
                raise ValueError(f"Unknown operation: {operation}")

        except Exception as e:
            self.logger.error(f"Transaction monitoring operation failed: {str(e)}")
            raise NodeExecutionError(
                f"Failed to execute monitoring operation: {str(e)}"
            )

    async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
        """Start real-time transaction monitoring."""
        self._trace_sampling_rate = kwargs.get("trace_sampling_rate", 1.0)
        self._alert_thresholds = kwargs.get("alert_thresholds", {})
        enable_streaming = kwargs.get("enable_streaming", False)
        stream_endpoint = kwargs.get("stream_endpoint")

        # Start background monitoring task
        if not self._monitoring_active:
            self._monitoring_active = True
            monitoring_task = asyncio.create_task(self._monitoring_loop())
            self._background_tasks.add(monitoring_task)
            monitoring_task.add_done_callback(self._background_tasks.discard)

        # Setup streaming if enabled
        if enable_streaming and stream_endpoint:
            streaming_task = asyncio.create_task(self._setup_streaming(stream_endpoint))
            self._background_tasks.add(streaming_task)
            streaming_task.add_done_callback(self._background_tasks.discard)

        self.logger.info(
            f"Started transaction monitoring with sampling rate {self._trace_sampling_rate}"
        )

        return {
            "monitoring_status": "active",
            "trace_data": {},
            "span_data": {},
            "alerts": [],
            "metrics": {"sampling_rate": self._trace_sampling_rate},
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
        """Stop real-time transaction monitoring."""
        self._monitoring_active = False

        # Cancel background tasks
        for task in self._background_tasks:
            if not task.done():
                task.cancel()

        # Wait for tasks to complete
        if self._background_tasks:
            await asyncio.gather(*self._background_tasks, return_exceptions=True)

        self._background_tasks.clear()

        self.logger.info("Stopped transaction monitoring")

        return {
            "monitoring_status": "stopped",
            "trace_data": {},
            "span_data": {},
            "alerts": [],
            "metrics": {"active_traces": len(self._active_traces)},
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _create_trace(self, **kwargs) -> Dict[str, Any]:
        """Create a new distributed trace."""
        trace_id = kwargs.get("trace_id") or str(uuid.uuid4())
        root_operation = kwargs.get("operation_name", "unknown")
        service_name = kwargs.get("service_name", "unknown")
        tags = kwargs.get("tags", {})

        # Check sampling
        if self._trace_sampling_rate < 1.0:
            import random

            if random.random() > self._trace_sampling_rate:
                # Skip this trace
                return {
                    "monitoring_status": "sampling_skipped",
                    "trace_data": {"trace_id": trace_id, "sampled": False},
                    "span_data": {},
                    "alerts": [],
                    "metrics": {},
                    "correlation_id": trace_id,
                    "timestamp": datetime.now(UTC).isoformat(),
                    "status": "success",
                }

        # Create root span
        root_span_id = str(uuid.uuid4())
        root_span = TransactionSpan(
            span_id=root_span_id,
            trace_id=trace_id,
            operation_name=root_operation,
            service_name=service_name,
            tags=tags,
        )

        # Create trace
        trace = TransactionTrace(
            trace_id=trace_id,
            root_span_id=root_span_id,
            spans=[root_span],
            span_count=1,
            service_count=1,
        )

        self._active_traces[trace_id] = trace
        self._active_spans[root_span_id] = root_span

        self.logger.debug(f"Created trace {trace_id} with root span {root_span_id}")

        return {
            "monitoring_status": "trace_created",
            "trace_data": {
                "trace_id": trace_id,
                "root_span_id": root_span_id,
                "sampled": True,
            },
            "span_data": {
                "span_id": root_span_id,
                "operation_name": root_operation,
                "service_name": service_name,
            },
            "alerts": [],
            "metrics": {"active_traces": len(self._active_traces)},
            "correlation_id": trace_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

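    # Illustrative note (not part of this file): the sampling above is
    # head-based. The keep/drop decision is made once, when the trace is
    # created, so with trace_sampling_rate r each new trace is recorded with
    # probability r. An unsampled trace is never registered, so a later
    # add_span against its trace_id raises ValueError ("Trace ... not found").
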
    async def _add_span(self, **kwargs) -> Dict[str, Any]:
        """Add a new span to an existing trace."""
        trace_id = kwargs.get("trace_id")
        if not trace_id or trace_id not in self._active_traces:
            raise ValueError(f"Trace {trace_id} not found")

        span_id = kwargs.get("span_id") or str(uuid.uuid4())
        parent_span_id = kwargs.get("parent_span_id")
        operation_name = kwargs.get("operation_name", "unknown")
        service_name = kwargs.get("service_name", "unknown")
        tags = kwargs.get("tags", {})
        baggage = kwargs.get("baggage", {})

        # Create span
        span = TransactionSpan(
            span_id=span_id,
            trace_id=trace_id,
            parent_span_id=parent_span_id,
            operation_name=operation_name,
            service_name=service_name,
            tags=tags,
            baggage=baggage,
        )

        # Add to trace
        trace = self._active_traces[trace_id]
        trace.spans.append(span)
        trace.span_count += 1

        # Update service count
        services = set(s.service_name for s in trace.spans)
        trace.service_count = len(services)

        self._active_spans[span_id] = span

        self.logger.debug(f"Added span {span_id} to trace {trace_id}")

        return {
            "monitoring_status": "span_added",
            "trace_data": {
                "trace_id": trace_id,
                "span_count": trace.span_count,
                "service_count": trace.service_count,
            },
            "span_data": {
                "span_id": span_id,
                "operation_name": operation_name,
                "service_name": service_name,
                "parent_span_id": parent_span_id,
            },
            "alerts": [],
            "metrics": {"active_spans": len(self._active_spans)},
            "correlation_id": trace_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _finish_span(self, **kwargs) -> Dict[str, Any]:
        """Finish an active span."""
        span_id = kwargs.get("span_id")
        if not span_id or span_id not in self._active_spans:
            raise ValueError(f"Span {span_id} not found")

        span = self._active_spans[span_id]
        error = kwargs.get("error")

        # Complete span
        span.end_time = time.time()
        span.duration = span.end_time - span.start_time

        if error:
            span.status = "error"
            span.error = error

            # Update trace error count
            trace = self._active_traces.get(span.trace_id)
            if trace:
                trace.error_count += 1

        # Check for alerts
        alerts = await self._check_span_alerts(span)

        # Remove from active spans
        del self._active_spans[span_id]

        # Check if trace is complete
        trace = self._active_traces.get(span.trace_id)
        if trace and span.span_id == trace.root_span_id:
            # Root span finished, calculate trace duration
            trace.total_duration = span.duration
            trace.critical_path = self._calculate_critical_path(trace)

            # Move to completed traces (not implemented in this basic version)
            # del self._active_traces[span.trace_id]

        self.logger.debug(f"Finished span {span_id} with duration {span.duration:.3f}s")

        return {
            "monitoring_status": "span_finished",
            "trace_data": {
                "trace_id": span.trace_id,
                "total_duration": trace.total_duration if trace else None,
            },
            "span_data": {
                "span_id": span_id,
                "duration": span.duration,
                "status": span.status,
            },
            "alerts": [self._serialize_alert(a) for a in alerts],
            "metrics": {"active_spans": len(self._active_spans)},
            "correlation_id": span.trace_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _get_trace(self, **kwargs) -> Dict[str, Any]:
        """Get trace information."""
        trace_id = kwargs.get("trace_id")
        if not trace_id:
            raise ValueError("trace_id is required")

        trace = self._active_traces.get(trace_id)
        if not trace:
            raise ValueError(f"Trace {trace_id} not found")

        # Serialize trace data
        trace_data = {
            "trace_id": trace.trace_id,
            "root_span_id": trace.root_span_id,
            "total_duration": trace.total_duration,
            "span_count": trace.span_count,
            "service_count": trace.service_count,
            "error_count": trace.error_count,
            "critical_path": trace.critical_path,
            "spans": [self._serialize_span(s) for s in trace.spans],
        }

        return {
            "monitoring_status": "trace_retrieved",
            "trace_data": trace_data,
            "span_data": {},
            "alerts": [],
            "metrics": {"span_count": trace.span_count},
            "correlation_id": trace_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _get_alerts(self, **kwargs) -> Dict[str, Any]:
        """Get active alerts."""
        # In a real implementation, this would query an alerts database
        # For now, return empty list
        return {
            "monitoring_status": "alerts_retrieved",
            "trace_data": {},
            "span_data": {},
            "alerts": [],
            "metrics": {"active_alerts": 0},
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _correlate_transactions(self, **kwargs) -> Dict[str, Any]:
        """Correlate transactions across service boundaries."""
        correlation_window = kwargs.get("correlation_window", 30.0)
        current_time = time.time()

        # Find traces within correlation window
        recent_traces = []
        for trace in self._active_traces.values():
            if any(
                s.start_time >= current_time - correlation_window for s in trace.spans
            ):
                recent_traces.append(trace)

        # Group by common tags/baggage
        correlations = {}
        for trace in recent_traces:
            for span in trace.spans:
                for tag_key, tag_value in span.tags.items():
                    correlation_key = f"{tag_key}:{tag_value}"
                    if correlation_key not in correlations:
                        correlations[correlation_key] = []
                    correlations[correlation_key].append(
                        {
                            "trace_id": trace.trace_id,
                            "span_id": span.span_id,
                            "service_name": span.service_name,
                            "operation_name": span.operation_name,
                        }
                    )

        # Filter correlations with multiple traces
        significant_correlations = {
            k: v
            for k, v in correlations.items()
            if len(set(item["trace_id"] for item in v)) > 1
        }

        return {
            "monitoring_status": "correlations_found",
            "trace_data": {},
            "span_data": {},
            "alerts": [],
            "metrics": {
                "correlations_found": len(significant_correlations),
                "traces_analyzed": len(recent_traces),
            },
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

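    # Illustrative note (not part of this file): correlation keys are
    # "tag:value" strings, so two traces whose spans both carry
    # tags={"order_id": "123"} are grouped under the key "order_id:123" and
    # count as one significant correlation; keys seen in only a single trace
    # are filtered out above. The tag name "order_id" here is hypothetical.
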
    async def _monitoring_loop(self):
        """Background monitoring loop for real-time alerts."""
        while self._monitoring_active:
            try:
                await asyncio.sleep(1.0)  # Check every second

                # Check for long-running spans
                current_time = time.time()
                for span in self._active_spans.values():
                    duration = current_time - span.start_time
                    if duration > 10.0:  # Alert on spans > 10 seconds
                        alert = TransactionAlert(
                            alert_id=str(uuid.uuid4()),
                            severity=AlertSeverity.MEDIUM,
                            message=f"Long-running span detected: {span.operation_name}",
                            trace_id=span.trace_id,
                            metric_name="span_duration",
                            metric_value=duration,
                            threshold=10.0,
                            tags=span.tags,
                        )
                        await self._handle_alert(alert)

            except asyncio.CancelledError:
                break
            except Exception as e:
                self.logger.error(f"Monitoring loop error: {e}")

    async def _setup_streaming(self, endpoint: str):
        """Setup streaming for real-time dashboard updates."""
        # In a real implementation, this would setup WebSocket/SSE connections
        self.logger.info(f"Would setup streaming to {endpoint}")
        # Placeholder for streaming setup
        pass

    async def _check_span_alerts(self, span: TransactionSpan) -> List[TransactionAlert]:
        """Check if span triggers any alerts."""
        alerts = []

        # Check duration thresholds
        if span.duration and "duration" in self._alert_thresholds:
            thresholds = self._alert_thresholds["duration"]

            for threshold_name, threshold_value in thresholds.items():
                if span.duration > threshold_value:
                    alert = TransactionAlert(
                        alert_id=str(uuid.uuid4()),
                        severity=(
                            AlertSeverity.HIGH
                            if threshold_name == "p99"
                            else AlertSeverity.MEDIUM
                        ),
                        message=f"Span duration {span.duration:.3f}s exceeds {threshold_name} threshold {threshold_value}s",
                        trace_id=span.trace_id,
                        metric_name=f"span_duration_{threshold_name}",
                        metric_value=span.duration,
                        threshold=threshold_value,
                        tags=span.tags,
                    )
                    alerts.append(alert)

        # Check for errors
        if span.error:
            alert = TransactionAlert(
                alert_id=str(uuid.uuid4()),
                severity=AlertSeverity.HIGH,
                message=f"Span error: {span.error}",
                trace_id=span.trace_id,
                metric_name="span_error",
                metric_value=1.0,
                threshold=0.0,
                tags=span.tags,
            )
            alerts.append(alert)

        return alerts

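    # Illustrative note (not part of this file): the alert_thresholds shape
    # that _check_span_alerts expects (values here are hypothetical):
    #
    #     {"duration": {"p95": 2.0, "p99": 5.0}}
    #
    # Each named threshold is compared against the single span's duration, so
    # a 6.0s span with the dict above emits two alerts: MEDIUM for "p95" and
    # HIGH for "p99" (only the name "p99" is promoted to HIGH).
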
    async def _handle_alert(self, alert: TransactionAlert):
        """Handle a generated alert."""
        # In a real implementation, this would send to alerting systems
        self.logger.warning(f"Alert: {alert.severity.value} - {alert.message}")

        # Call registered alert handlers
        for handler in self._alert_handlers:
            try:
                await handler(alert)
            except Exception as e:
                self.logger.error(f"Alert handler error: {e}")

    def _calculate_critical_path(self, trace: TransactionTrace) -> List[str]:
        """Calculate critical path through the trace."""
        # Simple implementation: find longest duration path
        # In a real implementation, this would use graph algorithms
        spans_by_duration = sorted(
            trace.spans, key=lambda s: s.duration or 0, reverse=True
        )
        return [s.span_id for s in spans_by_duration[:3]]  # Top 3 spans

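    # Illustrative sketch (not part of this file): _calculate_critical_path
    # above only ranks spans by duration. A graph-based alternative would
    # follow parent_span_id links and pick the root-to-leaf path with the
    # largest summed duration, e.g.:
    #
    #     def longest_path(trace: TransactionTrace) -> List[str]:
    #         children: Dict[Optional[str], List[TransactionSpan]] = {}
    #         for s in trace.spans:
    #             children.setdefault(s.parent_span_id, []).append(s)
    #
    #         def walk(span: TransactionSpan) -> List[TransactionSpan]:
    #             branches = [walk(c) for c in children.get(span.span_id, [])]
    #             best = max(branches, default=[],
    #                        key=lambda p: sum(x.duration or 0 for x in p))
    #             return [span] + best
    #
    #         root = next(s for s in trace.spans if s.span_id == trace.root_span_id)
    #         return [s.span_id for s in walk(root)]
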
    def _serialize_span(self, span: TransactionSpan) -> Dict[str, Any]:
        """Serialize a span to dictionary."""
        return {
            "span_id": span.span_id,
            "trace_id": span.trace_id,
            "parent_span_id": span.parent_span_id,
            "operation_name": span.operation_name,
            "service_name": span.service_name,
            "start_time": span.start_time,
            "end_time": span.end_time,
            "duration": span.duration,
            "status": span.status,
            "error": span.error,
            "tags": span.tags,
            "baggage": span.baggage,
        }

    def _serialize_alert(self, alert: TransactionAlert) -> Dict[str, Any]:
        """Serialize an alert to dictionary."""
        return {
            "alert_id": alert.alert_id,
            "severity": alert.severity.value,
            "message": alert.message,
            "transaction_id": alert.transaction_id,
            "trace_id": alert.trace_id,
            "metric_name": alert.metric_name,
            "metric_value": alert.metric_value,
            "threshold": alert.threshold,
            "timestamp": alert.timestamp,
            "tags": alert.tags,
            "resolved": alert.resolved,
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Synchronous wrapper for compatibility."""
        import asyncio

        return asyncio.run(self.async_run(**kwargs))

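    # Note: asyncio.run() creates and closes a fresh event loop, so this
    # wrapper raises RuntimeError when called from a thread that already has
    # a running loop; async callers should await async_run() (or execute())
    # directly instead.
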
    async def _start_transaction(self, **kwargs) -> Dict[str, Any]:
        """Start monitoring a specific transaction."""
        transaction_id = kwargs.get("transaction_id", str(uuid.uuid4()))
        transaction_type = kwargs.get("transaction_type", "default")
        metadata = kwargs.get("metadata", {})

        # Create trace for transaction
        trace_id = str(uuid.uuid4())
        span_id = str(uuid.uuid4())

        # Store transaction info
        # NOTE: stores a plain dict here, whereas _create_trace stores
        # TransactionTrace objects in the same _active_traces mapping;
        # _get_monitoring_status assumes this dict form.
        self._active_traces[trace_id] = {
            "transaction_id": transaction_id,
            "transaction_type": transaction_type,
            "start_time": time.time(),
            "metadata": metadata,
            "spans": [span_id],
            "status": "active",
        }

        self.logger.info(f"Started transaction monitoring for {transaction_id}")

        return {
            "monitoring_status": "transaction_started",
            "trace_data": {"trace_id": trace_id, "transaction_id": transaction_id},
            "span_data": {"span_id": span_id, "operation": "transaction_start"},
            "alerts": [],
            "metrics": {"active_transactions": len(self._active_traces)},
            "correlation_id": transaction_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _complete_transaction(self, **kwargs) -> Dict[str, Any]:
        """Complete a transaction and update monitoring status."""
        self.logger.debug(f"Complete transaction called with kwargs: {kwargs}")
        transaction_id = kwargs.get("transaction_id")
        status = kwargs.get("status", "completed")

        if not transaction_id:
            raise ValueError(
                f"transaction_id is required for complete_transaction. Received kwargs: {kwargs}"
            )

        # Mark the transaction as completed in active traces
        # NOTE: _start_transaction keys _active_traces by its generated
        # trace_id, not by transaction_id, so this lookup only matches when
        # the caller used the transaction_id as the trace key.
        if transaction_id in self._active_traces:
            trace_data = self._active_traces[transaction_id]
            trace_data["end_time"] = time.time()
            trace_data["status"] = status
            trace_data["duration"] = trace_data["end_time"] - trace_data.get(
                "start_time", 0
            )

            # Move to completed traces if we track them
            # For now, just mark as completed in place

        return {
            "monitoring_active": self._monitoring_active,
            "transaction_id": transaction_id,
            "transaction_status": status,
            "monitoring_status": "transaction_completed",
            "trace_data": {
                "trace_id": f"trace_{transaction_id}",
                "transaction_id": transaction_id,
            },
            "span_data": {
                "span_id": f"span_{transaction_id}",
                "operation": "transaction_complete",
            },
            "alerts": [],
            "metrics": {"active_transactions": len(self._active_traces)},
            "correlation_id": transaction_id,
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _get_monitoring_status(self, **kwargs) -> Dict[str, Any]:
        """Get current monitoring status and metrics."""
        active_traces_count = len(self._active_traces)
        active_spans_count = sum(
            len(trace_data.get("spans", []))
            for trace_data in self._active_traces.values()
        )

        # Calculate performance metrics
        current_time = time.time()
        recent_traces = [
            trace
            for trace in self._active_traces.values()
            if current_time - trace.get("start_time", 0) < 300  # Last 5 minutes
        ]

        status_info = {
            "monitoring_active": self._monitoring_active,
            "total_active_traces": active_traces_count,
            "total_active_spans": active_spans_count,
            "recent_traces_5min": len(recent_traces),
            "sampling_rate": self._trace_sampling_rate,
            "alert_thresholds": self._alert_thresholds,
            "background_tasks": len(self._background_tasks),
        }

        return {
            "monitoring_status": "active" if self._monitoring_active else "inactive",
            "trace_data": {"active_traces": active_traces_count},
            "span_data": {"active_spans": active_spans_count},
            "alerts": [],
            "metrics": status_info,
            "correlation_id": str(uuid.uuid4()),
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def cleanup(self):
        """Cleanup resources when node is destroyed."""
        await self._stop_monitoring()
        if hasattr(super(), "cleanup"):
            await super().cleanup()
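
A minimal end-to-end sketch of the new node's lifecycle, based on the operations defined above. The `execute()` entry point is the one shown in the class docstring; all identifiers and values below are illustrative, not part of the package:

    import asyncio

    from kailash.nodes.monitoring.transaction_monitor import TransactionMonitorNode

    async def main():
        monitor = TransactionMonitorNode()
        await monitor.execute(
            operation="start_monitoring",
            trace_sampling_rate=1.0,
            alert_thresholds={"duration": {"p99": 5.0}},
        )
        # Create a trace; the root span is created implicitly.
        trace = await monitor.execute(
            operation="create_trace",
            operation_name="order_processing",
            service_name="order-service",
        )
        trace_id = trace["trace_data"]["trace_id"]
        # Add and finish a child span.
        span = await monitor.execute(
            operation="add_span",
            trace_id=trace_id,
            operation_name="validate_payment",
            service_name="payment-service",
        )
        await monitor.execute(
            operation="finish_span", span_id=span["span_data"]["span_id"]
        )
        result = await monitor.execute(operation="get_trace", trace_id=trace_id)
        print(result["trace_data"]["span_count"])  # 2: root span + child span
        await monitor.execute(operation="stop_monitoring")

    asyncio.run(main())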