kite-agent 0.1.0 (kite_agent-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kite/__init__.py +46 -0
- kite/ab_testing.py +384 -0
- kite/agent.py +556 -0
- kite/agents/__init__.py +3 -0
- kite/agents/plan_execute.py +191 -0
- kite/agents/react_agent.py +509 -0
- kite/agents/reflective_agent.py +90 -0
- kite/agents/rewoo.py +119 -0
- kite/agents/tot.py +151 -0
- kite/conversation.py +125 -0
- kite/core.py +974 -0
- kite/data_loaders.py +111 -0
- kite/embedding_providers.py +372 -0
- kite/llm_providers.py +1278 -0
- kite/memory/__init__.py +6 -0
- kite/memory/advanced_rag.py +333 -0
- kite/memory/graph_rag.py +719 -0
- kite/memory/session_memory.py +423 -0
- kite/memory/vector_memory.py +579 -0
- kite/monitoring.py +611 -0
- kite/observers.py +107 -0
- kite/optimization/__init__.py +9 -0
- kite/optimization/resource_router.py +80 -0
- kite/persistence.py +42 -0
- kite/pipeline/__init__.py +5 -0
- kite/pipeline/deterministic_pipeline.py +323 -0
- kite/pipeline/reactive_pipeline.py +171 -0
- kite/pipeline_manager.py +15 -0
- kite/routing/__init__.py +6 -0
- kite/routing/aggregator_router.py +325 -0
- kite/routing/llm_router.py +149 -0
- kite/routing/semantic_router.py +228 -0
- kite/safety/__init__.py +6 -0
- kite/safety/circuit_breaker.py +360 -0
- kite/safety/guardrails.py +82 -0
- kite/safety/idempotency_manager.py +304 -0
- kite/safety/kill_switch.py +75 -0
- kite/tool.py +183 -0
- kite/tool_registry.py +87 -0
- kite/tools/__init__.py +21 -0
- kite/tools/code_execution.py +53 -0
- kite/tools/contrib/__init__.py +19 -0
- kite/tools/contrib/calculator.py +26 -0
- kite/tools/contrib/datetime_utils.py +20 -0
- kite/tools/contrib/linkedin.py +428 -0
- kite/tools/contrib/web_search.py +30 -0
- kite/tools/mcp/__init__.py +31 -0
- kite/tools/mcp/database_mcp.py +267 -0
- kite/tools/mcp/gdrive_mcp_server.py +503 -0
- kite/tools/mcp/gmail_mcp_server.py +601 -0
- kite/tools/mcp/postgres_mcp_server.py +490 -0
- kite/tools/mcp/slack_mcp_server.py +538 -0
- kite/tools/mcp/stripe_mcp_server.py +219 -0
- kite/tools/search.py +90 -0
- kite/tools/system_tools.py +54 -0
- kite/tools_manager.py +27 -0
- kite_agent-0.1.0.dist-info/METADATA +621 -0
- kite_agent-0.1.0.dist-info/RECORD +61 -0
- kite_agent-0.1.0.dist-info/WHEEL +5 -0
- kite_agent-0.1.0.dist-info/licenses/LICENSE +21 -0
- kite_agent-0.1.0.dist-info/top_level.txt +1 -0
kite/monitoring.py
ADDED
@@ -0,0 +1,611 @@

"""
Monitoring & Observability System
Complete production monitoring with metrics, tracing, and alerting.
"""

import time
import logging
import threading
from datetime import datetime
from typing import Dict, Any, Optional, Callable, List
from functools import wraps
from collections import defaultdict, deque

try:
    from prometheus_client import (
        Counter, Histogram, Gauge, Info,
        start_http_server, generate_latest
    )
    PROMETHEUS_AVAILABLE = True
except ImportError:
    PROMETHEUS_AVAILABLE = False

# Cost per 1k tokens (input, output), in USD
MODEL_COSTS = {
    "gpt-4": (0.03, 0.06),
    "gpt-3.5-turbo": (0.0005, 0.0015),
    "claude-3-opus": (0.015, 0.075),
    "claude-3-sonnet": (0.003, 0.015),
    "claude-3-haiku": (0.00025, 0.00125),
    "deepseek-r1:14b": (0.0, 0.0),  # Local
    "ollama": (0.0, 0.0),
    "default": (0.0, 0.0)
}

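These per-1k-token rates feed the fallback cost estimate in record_llm_usage further down. A quick worked example of that arithmetic (illustrative only, using the gpt-4 row above):

from kite.monitoring import MODEL_COSTS

prompt_tokens, completion_tokens = 1200, 300
in_rate, out_rate = MODEL_COSTS["gpt-4"]      # USD per 1k prompt / completion tokens
cost = (prompt_tokens / 1000) * in_rate + (completion_tokens / 1000) * out_rate
print(f"${cost:.3f}")                          # $0.054
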
class MetricsCollector:
    """
    Production-grade metrics collection.

    Tracks:
    - Request counts
    - Latencies
    - Error rates
    - Resource usage
    - Custom metrics
    """

    def __init__(self, enable_prometheus: bool = True):
        self.enable_prometheus = enable_prometheus and PROMETHEUS_AVAILABLE

        # In-memory metrics
        self.metrics = defaultdict(lambda: {
            'count': 0,
            'errors': 0,
            'total_latency': 0,
            'min_latency': float('inf'),
            'max_latency': 0,
            'outcomes': defaultdict(int),
            'tokens_in': 0,
            'tokens_out': 0,
            'cost': 0.0
        })

        self.lock = threading.RLock()

        # History for Dashboard
        self.request_history = deque(maxlen=1000)
        self.error_logs = deque(maxlen=1000)
        self.max_history = 1000

        # Prometheus metrics
        if self.enable_prometheus:
            self._init_prometheus()

    def _init_prometheus(self):
        """Initialize Prometheus metrics."""
        # Request metrics
        self.request_counter = Counter(
            'kite_requests_total',
            'Total requests',
            ['component', 'operation']
        )

        self.request_latency = Histogram(
            'kite_request_latency_seconds',
            'Request latency',
            ['component', 'operation']
        )

        self.error_counter = Counter(
            'kite_errors_total',
            'Total errors',
            ['component', 'operation', 'error_type']
        )

        # Resource metrics
        self.active_requests = Gauge(
            'kite_active_requests',
            'Active requests',
            ['component']
        )

        # LLM metrics
        self.llm_tokens = Counter(
            'kite_llm_tokens_total',
            'Total LLM tokens',
            ['provider', 'model', 'type']
        )

        self.llm_cost = Counter(
            'kite_llm_cost_usd',
            'Total LLM cost in USD',
            ['provider', 'model']
        )

        # Memory metrics
        self.memory_operations = Counter(
            'kite_memory_operations_total',
            'Memory operations',
            ['type', 'operation']
        )

        # Circuit breaker metrics
        self.circuit_breaker_state = Gauge(
            'kite_circuit_breaker_state',
            'Circuit breaker state (0=closed, 1=open)',
            ['component']
        )

        # Info
        self.info = Info(
            'kite',
            'Framework information'
        )
        self.info.info({
            'version': '1.0.0',
            'python_version': '3.11'
        })

    def record_request(self, component: str, operation: str,
                       latency: float, success: bool = True,
                       error_type: Optional[str] = None):
        """Record a component request with timing and status."""
        key = f"{component}.{operation}"
        with self.lock:
            # Update metrics
            data = self.metrics[key]
            data['count'] += 1
            data['total_latency'] += latency
            data['min_latency'] = min(data['min_latency'], latency)
            data['max_latency'] = max(data['max_latency'], latency)

            if not success:
                data['errors'] += 1

            # Update history (deques are bounded by maxlen)
            self.request_history.append({
                'timestamp': datetime.now(),
                'component': component,
                'operation': operation,
                'latency': latency,
                'success': success,
                'error_type': error_type
            })

            if not success and error_type:
                self.error_logs.append({
                    'timestamp': datetime.now(),
                    'component': component,
                    'error': error_type
                })

        # Prometheus
        if self.enable_prometheus:
            self.request_counter.labels(
                component=component,
                operation=operation
            ).inc()

            self.request_latency.labels(
                component=component,
                operation=operation
            ).observe(latency)

            if not success:
                self.error_counter.labels(
                    component=component,
                    operation=operation,
                    error_type=error_type or 'unknown'
                ).inc()

    def record_outcome(self, component: str, outcome_type: str):
        """Record a domain-specific outcome for a component."""
        with self.lock:
            self.metrics[component]['outcomes'][outcome_type] += 1

    def record_llm_usage(self, provider: str, model: str,
                         prompt_tokens: int, completion_tokens: int,
                         cost: float = 0):
        """Record LLM usage."""
        if self.enable_prometheus:
            self.llm_tokens.labels(
                provider=provider,
                model=model,
                type='prompt'
            ).inc(prompt_tokens)

            self.llm_tokens.labels(
                provider=provider,
                model=model,
                type='completion'
            ).inc(completion_tokens)

            if cost > 0:
                self.llm_cost.labels(
                    provider=provider,
                    model=model
                ).inc(cost)

        # In-memory tracking
        key = f"llm_usage.{model}"
        with self.lock:
            data = self.metrics[key]
            data['count'] += 1
            data['tokens_in'] += prompt_tokens
            data['tokens_out'] += completion_tokens

            # Calculate cost if not provided
            if cost == 0:
                rates = MODEL_COSTS.get(model, MODEL_COSTS.get(model.split(':')[0], MODEL_COSTS['default']))
                estimated_cost = (prompt_tokens / 1000 * rates[0]) + (completion_tokens / 1000 * rates[1])
                data['cost'] += estimated_cost
            else:
                data['cost'] += cost

    def record_memory_operation(self, mem_type: str, operation: str):
        """Record memory operation."""
        if self.enable_prometheus:
            self.memory_operations.labels(
                type=mem_type,
                operation=operation
            ).inc()

    def set_circuit_breaker_state(self, component: str, is_open: bool):
        """Set circuit breaker state."""
        if self.enable_prometheus:
            self.circuit_breaker_state.labels(
                component=component
            ).set(1 if is_open else 0)

    def get_metrics(self) -> Dict:
        """Get current metrics snapshot."""
        with self.lock:
            return dict(self.metrics)

    def start_server(self, port: int = 9090):
        """Start Prometheus metrics server."""
        if self.enable_prometheus:
            start_http_server(port)
            logging.info(f"Metrics server started on port {port}")

    def get_history(self) -> list:
        """Get recent request history."""
        with self.lock:
            return list(self.request_history)

    def get_error_logs(self) -> list:
        """Get recent error logs."""
        with self.lock:
            return list(self.error_logs)

    def get_summary(self) -> Dict:
        """Get high-level system summary."""
        history = self.get_history()
        total_requests = len(self.request_history)
        total_errors = len(self.error_logs)

        if not history:
            return {
                "status": "idle",
                "success_rate": 1.0,
                "avg_latency": 0.0,
                "total_requests": total_requests,
                "total_errors": total_errors
            }

        recent = history[-100:]
        success_rate = sum(1 for r in recent if r['success']) / len(recent) if recent else 0
        avg_latency = sum(r['latency'] for r in recent) / len(recent) if recent else 0

        return {
            "status": "healthy" if success_rate > 0.9 else "degraded",
            "success_rate": success_rate,
            "avg_latency": avg_latency,
            "total_requests": total_requests,
            "total_errors": total_errors
        }

    def get_detailed_report(self) -> str:
        """Generate a human-readable detailed report of all metrics."""
        with self.lock:
            if not self.metrics:
                return "No metrics recorded."

            report = []
            report.append("\n" + "="*60)
            report.append(" 📊 KITE SYSTEM PERFORMANCE REPORT")
            report.append("="*60)

            # 1. Summary metrics
            summary = self.get_summary()
            report.append(f"Status: {summary['status'].upper()}")
            report.append(f"Total Calls: {summary['total_requests']}")
            report.append(f"Success Rate: {summary['success_rate']:.1%}")
            # report.append(f"Avg Latency: {summary['avg_latency']:.3f}s")
            report.append("-" * 60)

            # 2. Per-component breakdown
            report.append(f"{'Component':<20} | {'Calls':<6} | {'Errors':<6} | {'Avg Latency':<10} | {'Outcomes'}")
            report.append("-" * 60)

            for key, data in sorted(self.metrics.items()):
                avg_l = data['total_latency'] / data['count'] if data['count'] > 0 else 0
                outcomes_str = ", ".join([f"{k}:{v}" for k, v in data['outcomes'].items()])

                # Special formatting for LLM usage
                if key.startswith("llm_usage"):
                    report.append(f"{key:<30} | {data['count']:<6} | {data['tokens_in']} in / {data['tokens_out']} out | ${data['cost']:.4f}")
                else:
                    report.append(f"{key:<20} | {data['count']:<6} | {data['errors']:<6} | {avg_l:<10.3f}s | {outcomes_str}")

            report.append("="*60 + "\n")
            return "\n".join(report)

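A minimal usage sketch for the collector, assuming the module is importable as kite.monitoring; with enable_prometheus=False it relies purely on the in-memory counters shown above:

from kite.monitoring import MetricsCollector

collector = MetricsCollector(enable_prometheus=False)   # in-memory only
collector.record_request('agent', 'run', latency=0.42, success=True)
collector.record_llm_usage('openai', 'gpt-4', prompt_tokens=1200, completion_tokens=300)
print(collector.get_summary())          # {'status': 'healthy', 'success_rate': 1.0, ...}
print(collector.get_detailed_report())  # formatted table, incl. ~$0.054 estimated LLM cost
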
class Tracer:
    """
    Distributed tracing for agent operations.
    Tracks operation flow and timing.
    """

    def __init__(self):
        self.traces = []
        self.current_trace = None
        self.lock = threading.Lock()

    def start_trace(self, trace_id: str, operation: str, metadata: Dict = None):
        """Start a new trace."""
        trace = {
            'trace_id': trace_id,
            'operation': operation,
            'start_time': time.time(),
            'metadata': metadata or {},
            'spans': []
        }

        with self.lock:
            self.traces.append(trace)
            self.current_trace = trace

        return trace

    def add_span(self, name: str, metadata: Dict = None):
        """Add a span to the current trace."""
        if not self.current_trace:
            return

        span = {
            'name': name,
            'start_time': time.time(),
            'metadata': metadata or {}
        }

        with self.lock:
            self.current_trace['spans'].append(span)

        return span

    def end_span(self, span: Dict):
        """End a span."""
        span['end_time'] = time.time()
        span['duration'] = span['end_time'] - span['start_time']

    def end_trace(self):
        """End the current trace."""
        if self.current_trace:
            self.current_trace['end_time'] = time.time()
            self.current_trace['duration'] = (
                self.current_trace['end_time'] -
                self.current_trace['start_time']
            )
            self.current_trace = None

    def get_traces(self, limit: int = 100) -> list:
        """Get recent traces."""
        with self.lock:
            return self.traces[-limit:]

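A short sketch of the trace/span lifecycle; traces and spans are plain dicts, so durations land in a 'duration' key once end_span and end_trace run:

from kite.monitoring import Tracer
import time

tracer = Tracer()
tracer.start_trace('req-001', 'agent_run', metadata={'user': 'demo'})
span = tracer.add_span('tool_call')
time.sleep(0.05)                      # stand-in for real work
tracer.end_span(span)
tracer.end_trace()

trace = tracer.get_traces(limit=1)[0]
print(trace['duration'], [s['name'] for s in trace['spans']])
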
def monitor(component: str, operation: str):
    """
    Decorator to monitor function execution.

    Usage:
        @monitor('llm', 'chat')
        def chat(messages):
            return llm.chat(messages)
    """
    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            success = True
            error_type = None

            try:
                result = func(*args, **kwargs)
                return result
            except Exception as e:
                success = False
                error_type = type(e).__name__
                raise
            finally:
                latency = time.time() - start_time

                # Get metrics collector from args if available
                if args and hasattr(args[0], 'metrics'):
                    metrics = args[0].metrics
                    metrics.record_request(
                        component, operation, latency,
                        success, error_type
                    )

        return wrapper
    return decorator

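The decorator only reports when the wrapped function's first argument (normally self) carries a metrics attribute; a sketch of that wiring with a hypothetical SearchTool class and the global collector from get_metrics():

from kite.monitoring import get_metrics, monitor

class SearchTool:                      # hypothetical class, for illustration only
    def __init__(self):
        self.metrics = get_metrics()   # @monitor picks this up via args[0].metrics

    @monitor('tool', 'search')
    def search(self, query: str) -> str:
        return f"results for {query}"

tool = SearchTool()
tool.search("kite agents")
print(tool.metrics.get_metrics()['tool.search']['count'])   # 1
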
class HealthCheck:
    """
    Health check system for services.
    """

    def __init__(self):
        self.checks = {}
        self.lock = threading.Lock()

    def register(self, name: str, check_func: Callable):
        """Register a health check."""
        with self.lock:
            self.checks[name] = check_func

    def run_checks(self) -> Dict:
        """Run all health checks."""
        results = {}

        for name, check_func in self.checks.items():
            try:
                result = check_func()
                results[name] = {
                    'status': 'healthy' if result else 'unhealthy',
                    'success': result
                }
            except Exception as e:
                results[name] = {
                    'status': 'error',
                    'error': str(e)
                }

        # Overall status
        all_healthy = all(
            r['status'] == 'healthy'
            for r in results.values()
        )

        return {
            'status': 'healthy' if all_healthy else 'unhealthy',
            'checks': results
        }

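Checks are plain callables returning a truthy value, and exceptions are reported as 'error' rather than raised; a small sketch with made-up checks:

from kite.monitoring import HealthCheck

health = HealthCheck()
health.register('llm', lambda: True)              # healthy
health.register('vector_store', lambda: False)    # unhealthy
health.register('db', lambda: 1 / 0)              # raises -> reported as 'error'

report = health.run_checks()
print(report['status'])                # 'unhealthy' (not all checks passed)
print(report['checks']['db'])          # {'status': 'error', 'error': 'division by zero'}
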
class AlertManager:
    """
    Alert manager for threshold-based alerts.
    """

    def __init__(self, metrics: MetricsCollector):
        self.metrics = metrics
        self.alerts = []
        self.thresholds = {}

    def set_threshold(self, metric: str, threshold: float,
                      comparison: str = '>'):
        """Set alert threshold."""
        self.thresholds[metric] = {
            'threshold': threshold,
            'comparison': comparison
        }

    def check_alerts(self) -> list:
        """Check for threshold violations."""
        alerts = []
        metrics_data = self.metrics.get_metrics()

        for metric, config in self.thresholds.items():
            if metric in metrics_data:
                # Thresholds are evaluated against the request count for the
                # metric key; only the '>' comparison is currently supported.
                value = metrics_data[metric]['count']
                threshold = config['threshold']

                if config['comparison'] == '>' and value > threshold:
                    alerts.append({
                        'metric': metric,
                        'value': value,
                        'threshold': threshold,
                        'message': f"{metric} exceeded threshold: {value} > {threshold}"
                    })

        return alerts

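Threshold keys follow the '<component>.<operation>' naming used by record_request; a sketch with an arbitrary count threshold:

from kite.monitoring import MetricsCollector, AlertManager

metrics = MetricsCollector(enable_prometheus=False)
alerts = AlertManager(metrics)
alerts.set_threshold('llm.chat', threshold=2)      # alert once more than 2 calls recorded

for _ in range(3):
    metrics.record_request('llm', 'chat', latency=0.2)

for alert in alerts.check_alerts():
    print(alert['message'])       # "llm.chat exceeded threshold: 3 > 2"
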
# Global instances
_metrics = None
_tracer = None
_health = None
_alerts = None


def get_metrics() -> MetricsCollector:
    """Get global metrics collector."""
    global _metrics
    if _metrics is None:
        _metrics = MetricsCollector()
    return _metrics


def get_tracer() -> Tracer:
    """Get global tracer."""
    global _tracer
    if _tracer is None:
        _tracer = Tracer()
    return _tracer


def get_health_check() -> HealthCheck:
    """Get global health check."""
    global _health
    if _health is None:
        _health = HealthCheck()
    return _health


def get_alert_manager() -> AlertManager:
    """Get global alert manager."""
    global _alerts
    if _alerts is None:
        _alerts = AlertManager(get_metrics())
    return _alerts

if __name__ == "__main__":
    # Example usage
    print("Monitoring System Example\n")

    metrics = get_metrics()
    tracer = get_tracer()
    health = get_health_check()

    # Record some metrics
    metrics.record_request('llm', 'chat', 0.5, success=True)
    metrics.record_request('llm', 'chat', 0.3, success=True)
    metrics.record_request('llm', 'chat', 1.2, success=False, error_type='Timeout')

    # Start a trace
    trace = tracer.start_trace('trace-1', 'agent_run')
    span = tracer.add_span('llm_call')
    time.sleep(0.1)
    tracer.end_span(span)
    tracer.end_trace()

    # Health check
    health.register('llm', lambda: True)
    health.register('memory', lambda: True)

    print("Metrics:", metrics.get_metrics())
    print("\nTraces:", len(tracer.get_traces()))
    print("\nHealth:", health.run_checks())

    print("\n[OK] Monitoring system working")