kairo-code 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- kairo/backend/api/agents.py +337 -16
- kairo/backend/app.py +84 -4
- kairo/backend/config.py +4 -2
- kairo/backend/models/agent.py +216 -2
- kairo/backend/models/api_key.py +4 -1
- kairo/backend/models/task.py +31 -0
- kairo/backend/models/user_provider_key.py +26 -0
- kairo/backend/schemas/agent.py +249 -2
- kairo/backend/schemas/api_key.py +3 -0
- kairo/backend/services/agent/__init__.py +52 -0
- kairo/backend/services/agent/agent_alerts_evaluation_service.py +224 -0
- kairo/backend/services/agent/agent_alerts_service.py +201 -0
- kairo/backend/services/agent/agent_commands_service.py +142 -0
- kairo/backend/services/agent/agent_crud_service.py +150 -0
- kairo/backend/services/agent/agent_events_service.py +103 -0
- kairo/backend/services/agent/agent_heartbeat_service.py +207 -0
- kairo/backend/services/agent/agent_metrics_rollup_service.py +248 -0
- kairo/backend/services/agent/agent_metrics_service.py +259 -0
- kairo/backend/services/agent/agent_service.py +315 -0
- kairo/backend/services/agent/agent_setup_service.py +180 -0
- kairo/backend/services/agent/constants.py +28 -0
- kairo/backend/services/agent_service.py +18 -102
- kairo/backend/services/api_key_service.py +23 -3
- kairo/backend/services/byok_service.py +204 -0
- kairo/backend/services/chat_service.py +398 -63
- kairo/backend/services/deep_search_service.py +159 -0
- kairo/backend/services/email_service.py +418 -19
- kairo/backend/services/few_shot_service.py +223 -0
- kairo/backend/services/post_processor.py +261 -0
- kairo/backend/services/rag_service.py +150 -0
- kairo/backend/services/task_service.py +119 -0
- kairo/backend/tests/__init__.py +1 -0
- kairo/backend/tests/e2e/__init__.py +1 -0
- kairo/backend/tests/e2e/agents/__init__.py +1 -0
- kairo/backend/tests/e2e/agents/conftest.py +389 -0
- kairo/backend/tests/e2e/agents/test_agent_alerts.py +802 -0
- kairo/backend/tests/e2e/agents/test_agent_commands.py +456 -0
- kairo/backend/tests/e2e/agents/test_agent_crud.py +455 -0
- kairo/backend/tests/e2e/agents/test_agent_events.py +415 -0
- kairo/backend/tests/e2e/agents/test_agent_heartbeat.py +520 -0
- kairo/backend/tests/e2e/agents/test_agent_metrics.py +587 -0
- kairo/backend/tests/e2e/agents/test_agent_setup.py +349 -0
- kairo/migrations/versions/010_agent_dashboard.py +246 -0
- {kairo_code-0.1.0.dist-info → kairo_code-0.2.0.dist-info}/METADATA +1 -1
- {kairo_code-0.1.0.dist-info → kairo_code-0.2.0.dist-info}/RECORD +50 -16
- {kairo_code-0.1.0.dist-info → kairo_code-0.2.0.dist-info}/top_level.txt +1 -0
- kairo_migrations/env.py +92 -0
- kairo_migrations/versions/001_add_agent_dashboard_extensions.py +450 -0
- {kairo_code-0.1.0.dist-info → kairo_code-0.2.0.dist-info}/WHEEL +0 -0
- {kairo_code-0.1.0.dist-info → kairo_code-0.2.0.dist-info}/entry_points.txt +0 -0
kairo/backend/services/agent/agent_metrics_rollup_service.py (new file)
@@ -0,0 +1,248 @@
+"""
+Agent metrics rollup and maintenance service.
+
+Handles metrics rollups (1m -> 1h -> daily) and cleanup operations.
+These are background job operations that run periodically.
+"""
+
+import logging
+from datetime import datetime, timedelta, date, UTC
+
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from backend.models.agent import AgentMetrics1m, AgentMetrics1h, AgentMetricsDaily
+from backend.services.agent.constants import (
+    DEFAULT_METRICS_1M_RETENTION_DAYS,
+    DEFAULT_METRICS_1H_RETENTION_DAYS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class AgentMetricsRollupService:
+    """Service for metrics rollup and maintenance operations."""
+
+    def __init__(self, db: AsyncSession):
+        self.db = db
+
+    async def rollup_1m_to_1h(self) -> int:
+        """Rollup 1-minute metrics into hourly buckets. Run every hour."""
+        now = datetime.now(UTC)
+        hour_start = (now - timedelta(hours=2)).replace(minute=0, second=0, microsecond=0)
+        hour_end = hour_start + timedelta(hours=1)
+
+        agent_ids = await self._get_agents_with_1m_data(hour_start, hour_end)
+
+        count = 0
+        for agent_id in agent_ids:
+            if await self._hourly_bucket_exists(agent_id, hour_start):
+                continue
+
+            row = await self._aggregate_1m_data(agent_id, hour_start, hour_end)
+
+            if row.request_count and row.request_count > 0:
+                self._create_hourly_bucket(agent_id, hour_start, row)
+                count += 1
+
+        if count > 0:
+            await self.db.commit()
+            logger.info("Rolled up %d hourly metric buckets", count)
+
+        return count
+
+    async def rollup_1h_to_daily(self) -> int:
+        """Rollup hourly metrics into daily buckets. Run once per day."""
+        now = datetime.now(UTC)
+        yesterday = (now - timedelta(days=1)).date()
+        day_start = datetime.combine(yesterday, datetime.min.time()).replace(tzinfo=UTC)
+        day_end = day_start + timedelta(days=1)
+
+        agent_ids = await self._get_agents_with_hourly_data(day_start, day_end)
+
+        count = 0
+        for agent_id in agent_ids:
+            if await self._daily_bucket_exists(agent_id, yesterday):
+                continue
+
+            row = await self._aggregate_1h_data(agent_id, day_start, day_end)
+
+            if row.request_count and row.request_count > 0:
+                self._create_daily_bucket(agent_id, yesterday, row)
+                count += 1
+
+        if count > 0:
+            await self.db.commit()
+            logger.info("Rolled up %d daily metric buckets", count)
+
+        return count
+
+    async def cleanup_old_metrics(
+        self,
+        retention_days_1m: int = DEFAULT_METRICS_1M_RETENTION_DAYS,
+        retention_days_1h: int = DEFAULT_METRICS_1H_RETENTION_DAYS
+    ) -> tuple[int, int]:
+        """Clean up old metrics based on retention policy."""
+        now = datetime.now(UTC)
+
+        deleted_1m = await self._cleanup_metrics_table(
+            AgentMetrics1m,
+            now - timedelta(days=retention_days_1m)
+        )
+
+        deleted_1h = await self._cleanup_metrics_table(
+            AgentMetrics1h,
+            now - timedelta(days=retention_days_1h)
+        )
+
+        if deleted_1m > 0 or deleted_1h > 0:
+            await self.db.commit()
+            logger.info("Cleaned up metrics: %d 1m records, %d 1h records", deleted_1m, deleted_1h)
+
+        return deleted_1m, deleted_1h
+
+    # ─── Private Helpers ───────────────────────────────────────────────────────
+
+    async def _get_agents_with_1m_data(
+        self,
+        start: datetime,
+        end: datetime
+    ) -> list[str]:
+        """Get list of agent IDs with 1m data in the time range."""
+        stmt = (
+            select(AgentMetrics1m.agent_id)
+            .where(AgentMetrics1m.bucket_time >= start)
+            .where(AgentMetrics1m.bucket_time < end)
+            .distinct()
+        )
+        result = await self.db.execute(stmt)
+        return [row[0] for row in result.all()]
+
+    async def _get_agents_with_hourly_data(
+        self,
+        start: datetime,
+        end: datetime
+    ) -> list[str]:
+        """Get list of agent IDs with hourly data."""
+        stmt = (
+            select(AgentMetrics1h.agent_id)
+            .where(AgentMetrics1h.bucket_time >= start)
+            .where(AgentMetrics1h.bucket_time < end)
+            .distinct()
+        )
+        result = await self.db.execute(stmt)
+        return [row[0] for row in result.all()]
+
+    async def _hourly_bucket_exists(self, agent_id: str, hour_start: datetime) -> bool:
+        """Check if hourly bucket already exists."""
+        stmt = select(AgentMetrics1h).where(
+            AgentMetrics1h.agent_id == agent_id,
+            AgentMetrics1h.bucket_time == hour_start
+        )
+        result = await self.db.execute(stmt)
+        return result.scalar_one_or_none() is not None
+
+    async def _daily_bucket_exists(self, agent_id: str, day: date) -> bool:
+        """Check if daily bucket already exists."""
+        stmt = select(AgentMetricsDaily).where(
+            AgentMetricsDaily.agent_id == agent_id,
+            AgentMetricsDaily.date == day
+        )
+        result = await self.db.execute(stmt)
+        return result.scalar_one_or_none() is not None
+
+    async def _aggregate_1m_data(self, agent_id: str, start: datetime, end: datetime):
+        """Aggregate 1-minute data for a time range."""
+        stmt = (
+            select(
+                func.sum(AgentMetrics1m.request_count).label("request_count"),
+                func.sum(AgentMetrics1m.error_count).label("error_count"),
+                func.sum(AgentMetrics1m.timeout_count).label("timeout_count"),
+                func.sum(AgentMetrics1m.input_tokens).label("input_tokens"),
+                func.sum(AgentMetrics1m.output_tokens).label("output_tokens"),
+                func.sum(AgentMetrics1m.total_latency_ms).label("total_latency_ms"),
+                func.min(AgentMetrics1m.min_latency_ms).label("min_latency_ms"),
+                func.max(AgentMetrics1m.max_latency_ms).label("max_latency_ms"),
+                func.sum(AgentMetrics1m.tool_calls).label("tool_calls"),
+                func.sum(AgentMetrics1m.tool_errors).label("tool_errors"),
+            )
+            .where(AgentMetrics1m.agent_id == agent_id)
+            .where(AgentMetrics1m.bucket_time >= start)
+            .where(AgentMetrics1m.bucket_time < end)
+        )
+        result = await self.db.execute(stmt)
+        return result.one()
+
+    async def _aggregate_1h_data(self, agent_id: str, start: datetime, end: datetime):
+        """Aggregate hourly data for a time range."""
+        stmt = (
+            select(
+                func.sum(AgentMetrics1h.request_count).label("request_count"),
+                func.sum(AgentMetrics1h.error_count).label("error_count"),
+                func.sum(AgentMetrics1h.timeout_count).label("timeout_count"),
+                func.sum(AgentMetrics1h.input_tokens).label("input_tokens"),
+                func.sum(AgentMetrics1h.output_tokens).label("output_tokens"),
+                func.sum(AgentMetrics1h.total_latency_ms).label("total_latency_ms"),
+                func.min(AgentMetrics1h.min_latency_ms).label("min_latency_ms"),
+                func.max(AgentMetrics1h.max_latency_ms).label("max_latency_ms"),
+                func.sum(AgentMetrics1h.tool_calls).label("tool_calls"),
+                func.sum(AgentMetrics1h.tool_errors).label("tool_errors"),
+            )
+            .where(AgentMetrics1h.agent_id == agent_id)
+            .where(AgentMetrics1h.bucket_time >= start)
+            .where(AgentMetrics1h.bucket_time < end)
+        )
+        result = await self.db.execute(stmt)
+        return result.one()
+
+    def _create_hourly_bucket(self, agent_id: str, hour_start: datetime, row) -> None:
+        """Create an hourly metrics bucket."""
+        hourly = AgentMetrics1h(
+            agent_id=agent_id,
+            bucket_time=hour_start,
+            request_count=row.request_count or 0,
+            error_count=row.error_count or 0,
+            timeout_count=row.timeout_count or 0,
+            input_tokens=row.input_tokens or 0,
+            output_tokens=row.output_tokens or 0,
+            total_latency_ms=row.total_latency_ms or 0,
+            min_latency_ms=row.min_latency_ms,
+            max_latency_ms=row.max_latency_ms,
+            tool_calls=row.tool_calls or 0,
+            tool_errors=row.tool_errors or 0,
+        )
+        self.db.add(hourly)
+
+    def _create_daily_bucket(self, agent_id: str, day: date, row) -> None:
+        """Create a daily metrics bucket."""
+        avg_latency = None
+        if row.request_count and row.request_count > 0 and row.total_latency_ms:
+            avg_latency = int(row.total_latency_ms / row.request_count)
+
+        daily = AgentMetricsDaily(
+            agent_id=agent_id,
+            date=day,
+            request_count=row.request_count or 0,
+            error_count=row.error_count or 0,
+            timeout_count=row.timeout_count or 0,
+            input_tokens=row.input_tokens or 0,
+            output_tokens=row.output_tokens or 0,
+            total_latency_ms=row.total_latency_ms or 0,
+            min_latency_ms=row.min_latency_ms,
+            max_latency_ms=row.max_latency_ms,
+            avg_latency_ms=avg_latency,
+            tool_calls=row.tool_calls or 0,
+            tool_errors=row.tool_errors or 0,
+        )
+        self.db.add(daily)
+
+    async def _cleanup_metrics_table(self, model, cutoff: datetime) -> int:
+        """Delete old records from a metrics table."""
+        stmt = select(model).where(model.bucket_time < cutoff)
+        result = await self.db.execute(stmt)
+        records = list(result.scalars().all())
+
+        for record in records:
+            await self.db.delete(record)
+
+        return len(records)
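Neither hunk shows how these jobs are actually scheduled; that wiring presumably lives in the app.py changes, which aren't included here. As a rough sketch of the cadence the docstrings describe (hourly for the 1m -> 1h rollup, daily for the 1h -> daily rollup plus cleanup), assuming a SQLAlchemy async_sessionmaker named SessionLocal and a placeholder DSN, neither of which is from the package:

import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from backend.services.agent.agent_metrics_rollup_service import AgentMetricsRollupService

# Placeholder DSN; the real engine/session setup lives in the package's config.
engine = create_async_engine("postgresql+asyncpg://localhost/kairo")
SessionLocal = async_sessionmaker(engine, expire_on_commit=False)


async def hourly_rollup_loop() -> None:
    # 1m -> 1h rollup, once per hour per the docstring.
    while True:
        async with SessionLocal() as db:
            await AgentMetricsRollupService(db).rollup_1m_to_1h()
        await asyncio.sleep(3600)


async def daily_rollup_loop() -> None:
    # 1h -> daily rollup plus retention cleanup, once per day.
    while True:
        async with SessionLocal() as db:
            svc = AgentMetricsRollupService(db)
            await svc.rollup_1h_to_daily()
            await svc.cleanup_old_metrics()
        await asyncio.sleep(86400)


async def main() -> None:
    await asyncio.gather(hourly_rollup_loop(), daily_rollup_loop())


if __name__ == "__main__":
    asyncio.run(main())

Note that rollup_1m_to_1h looks two hours back and processes exactly one hour-long window per run, so a missed run means a permanently skipped hourly bucket under this scheduling model.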
kairo/backend/services/agent/agent_metrics_service.py (new file)
@@ -0,0 +1,259 @@
+"""
+Agent metrics service.
+
+Handles metrics queries and telemetry processing.
+For rollup and cleanup operations, see agent_metrics_rollup_service.py.
+"""
+
+import logging
+from datetime import datetime, timedelta, UTC
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from backend.models.agent import Agent, AgentMetrics1m
+from backend.schemas.agent import TelemetryBatchRequest
+from backend.services.agent.constants import TIME_RANGE_MAP
+from backend.services.agent.agent_metrics_rollup_service import AgentMetricsRollupService
+
+logger = logging.getLogger(__name__)
+
+
+class AgentMetricsService:
+    """Service for agent metrics queries and telemetry processing."""
+
+    def __init__(self, db: AsyncSession):
+        self.db = db
+        self._rollup = AgentMetricsRollupService(db)
+
+    async def get_metrics(
+        self,
+        agent_id: str,
+        range_str: str = "24h",
+        granularity: str = "auto"
+    ) -> dict:
+        """
+        Get agent metrics for a time range.
+
+        Args:
+            agent_id: The agent ID
+            range_str: Time range (1h, 6h, 24h, 7d, 30d)
+            granularity: Data granularity (auto, 1m, 1h, 1d)
+
+        Returns:
+            Dictionary with metrics data and summary
+        """
+        hours = TIME_RANGE_MAP.get(range_str, 24)
+        delta = timedelta(hours=hours)
+        cutoff = datetime.now(UTC) - delta
+
+        if granularity == "auto":
+            granularity = self._determine_granularity(delta)
+
+        metrics = await self._query_metrics(agent_id, cutoff)
+        return self._build_metrics_response(agent_id, range_str, granularity, metrics)
+
+    def _determine_granularity(self, delta: timedelta) -> str:
+        """Determine the appropriate granularity for a time range."""
+        if delta <= timedelta(hours=6):
+            return "1m"
+        elif delta <= timedelta(days=7):
+            return "1h"
+        return "1d"
+
+    async def _query_metrics(
+        self,
+        agent_id: str,
+        cutoff: datetime
+    ) -> list[AgentMetrics1m]:
+        """Query metrics from the 1m table."""
+        stmt = (
+            select(AgentMetrics1m)
+            .where(AgentMetrics1m.agent_id == agent_id)
+            .where(AgentMetrics1m.bucket_time >= cutoff)
+            .order_by(AgentMetrics1m.bucket_time)
+        )
+        result = await self.db.execute(stmt)
+        return list(result.scalars().all())
+
+    def _build_metrics_response(
+        self,
+        agent_id: str,
+        range_str: str,
+        granularity: str,
+        metrics: list[AgentMetrics1m]
+    ) -> dict:
+        """Build the metrics response dictionary."""
+        data = []
+        totals = {"requests": 0, "errors": 0, "tokens": 0, "latency": 0}
+
+        for m in metrics:
+            data.append(self._format_metric_point(m))
+            totals["requests"] += m.request_count
+            totals["errors"] += m.error_count
+            totals["tokens"] += m.input_tokens + m.output_tokens
+            totals["latency"] += m.total_latency_ms
+
+        return {
+            "agent_id": agent_id,
+            "range": range_str,
+            "granularity": granularity,
+            "data": data,
+            "summary": self._build_summary(totals),
+        }
+
+    def _format_metric_point(self, m: AgentMetrics1m) -> dict:
+        """Format a single metric data point."""
+        avg_latency = m.total_latency_ms / m.request_count if m.request_count > 0 else None
+        return {
+            "timestamp": m.bucket_time,
+            "request_count": m.request_count,
+            "error_count": m.error_count,
+            "input_tokens": m.input_tokens,
+            "output_tokens": m.output_tokens,
+            "avg_latency_ms": avg_latency,
+            "tool_calls": m.tool_calls,
+        }
+
+    def _build_summary(self, totals: dict) -> dict:
+        """Build the summary section of metrics response."""
+        avg_latency = None
+        error_rate = 0.0
+        if totals["requests"] > 0:
+            avg_latency = totals["latency"] / totals["requests"]
+            error_rate = totals["errors"] / totals["requests"] * 100
+
+        return {
+            "total_requests": totals["requests"],
+            "total_errors": totals["errors"],
+            "total_tokens": totals["tokens"],
+            "avg_latency_ms": avg_latency,
+            "error_rate": error_rate,
+        }
+
+    async def get_metrics_for_user(
+        self,
+        user_id: str,
+        agent_id: str,
+        range_str: str = "24h",
+        granularity: str = "auto"
+    ) -> dict | None:
+        """Get metrics for an agent owned by a user. Returns None if not found."""
+        if not await self._verify_agent_ownership(user_id, agent_id):
+            return None
+        return await self.get_metrics(agent_id, range_str, granularity)
+
+    async def _verify_agent_ownership(self, user_id: str, agent_id: str) -> bool:
+        """Verify that a user owns an agent."""
+        stmt = (
+            select(Agent)
+            .where(Agent.id == agent_id, Agent.user_id == user_id)
+            .where(Agent.deleted_at.is_(None))
+        )
+        result = await self.db.execute(stmt)
+        return result.scalar_one_or_none() is not None
+
+    # ─── Telemetry Processing ──────────────────────────────────────────────────
+
+    async def process_telemetry_batch(
+        self,
+        agent_id: str,
+        req: TelemetryBatchRequest
+    ) -> tuple[int, int, list[str]]:
+        """Process a batch of telemetry events."""
+        accepted = 0
+        rejected = 0
+        errors = []
+
+        for event in req.events:
+            try:
+                await self._process_telemetry_event(agent_id, event)
+                accepted += 1
+            except Exception as e:
+                rejected += 1
+                errors.append(f"{event.request_id}: {str(e)}")
+
+        await self.db.commit()
+        return accepted, rejected, errors
+
+    async def _process_telemetry_event(self, agent_id: str, event) -> None:
+        """Process a single telemetry event."""
+        bucket_time = event.timestamp_start.replace(second=0, microsecond=0)
+        is_error = event.status in ("error", "timeout")
+
+        existing = await self._get_metrics_bucket(agent_id, bucket_time)
+
+        if existing:
+            self._update_metrics_bucket(existing, event, is_error)
+        else:
+            self._create_metrics_bucket(agent_id, bucket_time, event, is_error)
+
+    async def _get_metrics_bucket(
+        self,
+        agent_id: str,
+        bucket_time: datetime
+    ) -> AgentMetrics1m | None:
+        """Get existing metrics bucket."""
+        stmt = select(AgentMetrics1m).where(
+            AgentMetrics1m.agent_id == agent_id,
+            AgentMetrics1m.bucket_time == bucket_time
+        )
+        result = await self.db.execute(stmt)
+        return result.scalar_one_or_none()
+
+    def _update_metrics_bucket(
+        self,
+        existing: AgentMetrics1m,
+        event,
+        is_error: bool
+    ) -> None:
+        """Update an existing metrics bucket."""
+        existing.request_count += 1
+        existing.error_count += 1 if is_error else 0
+        existing.timeout_count += 1 if event.status == "timeout" else 0
+        existing.input_tokens += event.input_tokens
+        existing.output_tokens += event.output_tokens
+        existing.total_latency_ms += event.latency_ms
+        if event.tool_calls:
+            existing.tool_calls += len(event.tool_calls)
+
+    def _create_metrics_bucket(
+        self,
+        agent_id: str,
+        bucket_time: datetime,
+        event,
+        is_error: bool
+    ) -> None:
+        """Create a new metrics bucket."""
+        new_metrics = AgentMetrics1m(
+            agent_id=agent_id,
+            bucket_time=bucket_time,
+            request_count=1,
+            error_count=1 if is_error else 0,
+            timeout_count=1 if event.status == "timeout" else 0,
+            input_tokens=event.input_tokens,
+            output_tokens=event.output_tokens,
+            total_latency_ms=event.latency_ms,
+            min_latency_ms=event.latency_ms,
+            max_latency_ms=event.latency_ms,
+            tool_calls=len(event.tool_calls) if event.tool_calls else 0,
+        )
+        self.db.add(new_metrics)
+
+    # ─── Delegated Rollup Operations ───────────────────────────────────────────
+
+    async def rollup_1m_to_1h(self) -> int:
+        """Delegate to rollup service."""
+        return await self._rollup.rollup_1m_to_1h()
+
+    async def rollup_1h_to_daily(self) -> int:
+        """Delegate to rollup service."""
+        return await self._rollup.rollup_1h_to_daily()
+
+    async def cleanup_old_metrics(
+        self,
+        retention_days_1m: int = 30,
+        retention_days_1h: int = 365
+    ) -> tuple[int, int]:
+        """Delegate to rollup service."""
+        return await self._rollup.cleanup_old_metrics(retention_days_1m, retention_days_1h)
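One behavioral wrinkle worth flagging in this hunk: _create_metrics_bucket seeds min_latency_ms and max_latency_ms from the first event in a minute bucket, but _update_metrics_bucket never touches them again, so later events in the same minute leave the extremes stale, and the hourly rollup's func.min/func.max then aggregates first-event values rather than true minima and maxima. A drop-in sketch of the update method that also folds extremes in, assuming the same model fields and event shape shown above, not code from the package:

from backend.models.agent import AgentMetrics1m


def _update_metrics_bucket(self, existing: AgentMetrics1m, event, is_error: bool) -> None:
    """Update an existing metrics bucket, keeping latency extremes current."""
    existing.request_count += 1
    existing.error_count += 1 if is_error else 0
    existing.timeout_count += 1 if event.status == "timeout" else 0
    existing.input_tokens += event.input_tokens
    existing.output_tokens += event.output_tokens
    existing.total_latency_ms += event.latency_ms
    # Fold this event into the bucket's extremes; the published version
    # leaves min/max at whatever the first event in the bucket recorded.
    existing.min_latency_ms = min(existing.min_latency_ms, event.latency_ms)
    existing.max_latency_ms = max(existing.max_latency_ms, event.latency_ms)
    if event.tool_calls:
        existing.tool_calls += len(event.tool_calls)

Relatedly, the delegating cleanup_old_metrics restates its defaults as 30 and 365 rather than reusing DEFAULT_METRICS_1M_RETENTION_DAYS and DEFAULT_METRICS_1H_RETENTION_DAYS, so the two entry points can drift if the constants change.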