agent-relay-server 0.33.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ import { Database } from "bun:sqlite";
2
+ import { randomUUID } from "node:crypto";
3
+ import { isRecord, stringValue, isMechanicalMessageKind } from "agent-relay-sdk";
4
+ import { ORCHESTRATOR_PROTOCOL_VERSION, VERSION } from "../config.ts";
5
+ import { parseJson } from "../utils";
6
+ import { isLiveIsolatedWorkspace } from "../workspace-phase";
7
+ import {
8
+ CONTRACT_REQUIREMENTS,
9
+ contractCompatibility,
10
+ parseRuntimeCapabilities,
11
+ parseRuntimeContracts,
12
+ parseRuntimePackage,
13
+ type RuntimeContracts,
14
+ } from "../contracts";
15
+ import { STALE_TTL_MS, DAY_MS, CLAIM_LEASE_MS, POOL_CLAIM_LEASE_MS, WORKSPACE_MERGE_LEASE_MS } from "../config";
16
+ import { matchAgents } from "../agent-ref";
17
+ import { listAgents } from "./agents.ts";
18
+ import { listChannels } from "./channels.ts";
19
+ import { getDb } from "./connection.ts";
20
+ import type {
21
+ AgentCard,
22
+ ActivityEvent,
23
+ ActivityEventInput,
24
+ AgentKind,
25
+ AgentSessionGuard,
26
+ Artifact,
27
+ ArtifactBlob,
28
+ ArtifactKind,
29
+ ArtifactLink,
30
+ ArtifactSensitivity,
31
+ ArtifactVisibility,
32
+ AttachmentRef,
33
+ ChannelBinding,
34
+ ChannelBindingMode,
35
+ ChannelRouteTarget,
36
+ ChatHistoryImport,
37
+ ChatHistoryImportEntry,
38
+ ChannelSummary,
39
+ ChannelTargetHealth,
40
+ CreatePairInput,
41
+ HealthCheck,
42
+ HealthReport,
43
+ ManagedAgent,
44
+ ManagedSessionExitDiagnostics,
45
+ Message,
46
+ MessageDeliveryAttempt,
47
+ MessageDeliveryStatus,
48
+ Orchestrator,
49
+ OrchestratorHealth,
50
+ OrchestratorRuntimeInput,
51
+ OrchestratorStatus,
52
+ OrchestratorUpgradeState,
53
+ PairActionInput,
54
+ PairMessageInput,
55
+ PairSession,
56
+ PairStatus,
57
+ RegisterAgentInput,
58
+ ReplyObligation,
59
+ RegisterOrchestratorInput,
60
+ SendMessageInput,
61
+ PollQuery,
62
+ SpawnApprovalMode,
63
+ SpawnProvider,
64
+ Task,
65
+ TaskEvent,
66
+ TaskSeverity,
67
+ TaskStatus,
68
+ IntegrationEventInput,
69
+ IntegrationSummary,
70
+ IntegrationTaskStats,
71
+ InboxDraft,
72
+ InboxState,
73
+ InboxThreadState,
74
+ ContextSnapshot,
75
+ ContextState,
76
+ ProviderCapabilities,
77
+ TaskStatusInput,
78
+ WorkspaceMetadata,
79
+ WorkspaceRecord,
80
+ WorkspaceStatus,
81
+ } from "../types";
82
+
83
/**
 * Collects the headline counters for the relay.
 *
 * Each figure comes from its own `COUNT(*)` query:
 * - `agents`: every row in `agents`.
 * - `online`: agents whose status is not `offline` and whose `last_seen` is
 *   newer than `STALE_TTL_MS` ago.
 * - `messages` / `messagesLast24h`: all messages, and those created within
 *   the last `DAY_MS`.
 * - `tasks` / `openTasks`: all tasks, and those whose status is not one of
 *   `done`, `failed` or `canceled`.
 *
 * @returns The counters together with the server `VERSION`.
 */
export function getStats(): {
  version: string;
  agents: number;
  online: number;
  messages: number;
  messagesLast24h: number;
  tasks: number;
  openTasks: number;
} {
  // Every query below yields exactly one row with the count aliased as `c`.
  const countRows = (sql: string, ...params: number[]): number => {
    const row = getDb().query(sql).get(...params) as any;
    return row.c;
  };

  const agents = countRows("SELECT COUNT(*) as c FROM agents");
  const online = countRows(
    "SELECT COUNT(*) as c FROM agents WHERE status != 'offline' AND last_seen > ?",
    Date.now() - STALE_TTL_MS,
  );
  const messages = countRows("SELECT COUNT(*) as c FROM messages");
  const messagesLast24h = countRows(
    "SELECT COUNT(*) as c FROM messages WHERE created_at > ?",
    Date.now() - DAY_MS,
  );
  const tasks = countRows("SELECT COUNT(*) as c FROM tasks");
  const openTasks = countRows(
    "SELECT COUNT(*) as c FROM tasks WHERE status NOT IN ('done', 'failed', 'canceled')",
  );

  return {
    version: VERSION,
    agents,
    online,
    messages,
    messagesLast24h,
    tasks,
    openTasks,
  };
}
119
+
120
/**
 * Analytics aggregation periods.
 *
 * Bucket counts mirror the dashboard chart granularity so the client can render
 * the pre-aggregated buckets directly. This table is the single source of truth:
 * the dashboard no longer windows raw messages (which capped it at the last ~100
 * rows, roughly 3h on a busy relay, even though the data was always retained
 * server-side per RETENTION_DAYS). See issue #212 for the Layer-2 rollup that
 * will let stats outlive the message-retention window.
 *
 * `ms` is the length of the period in milliseconds; `buckets` is the number of
 * equal-width buckets the period is divided into.
 */
export const ANALYTICS_PERIODS = {
  // 1 hour, 5-minute buckets
  "1h": {
    ms: 3_600_000,
    buckets: 12,
  },
  // 6 hours, 30-minute buckets
  "6h": {
    ms: 21_600_000,
    buckets: 12,
  },
  // 24 hours, 1-hour buckets
  "24h": {
    ms: 86_400_000,
    buckets: 24,
  },
  // 7 days, 12-hour buckets
  "7d": {
    ms: 604_800_000,
    buckets: 14,
  },
  // 14 days, 1-day buckets
  "14d": {
    ms: 1_209_600_000,
    buckets: 14,
  },
  // 30 days, 2-day buckets
  "30d": {
    ms: 2_592_000_000,
    buckets: 15,
  },
} as const;

/** Key of {@link ANALYTICS_PERIODS}: one of the supported aggregation periods. */
export type AnalyticsPeriod = keyof typeof ANALYTICS_PERIODS;
136
+
137
/**
 * Display categories a message can be counted under, in the order the
 * dashboard lists them.
 */
export const ANALYTICS_CATEGORIES = [
  "Messages",
  "Replies",
  "Work items",
  "System",
  "Pair",
  "Channel",
] as const;

/** Any one of the labels in {@link ANALYTICS_CATEGORIES}. */
export type AnalyticsCategory = (typeof ANALYTICS_CATEGORIES)[number];

/**
 * Message → category mapping, expressed as a SQL `CASE` fragment. This is the
 * ONE place the mapping lives (server-side SQL).
 *
 * Branch order is significant: a claimable/system/pair/channel message is
 * classified as such even when it is also a reply; only an otherwise-plain
 * reply counts as "Replies".
 */
export const ANALYTICS_CATEGORY_SQL = `
  CASE
    WHEN claimable = 1 THEN 'Work items'
    WHEN kind = 'system' OR from_agent = 'system' THEN 'System'
    WHEN kind = 'pair' THEN 'Pair'
    WHEN kind = 'channel.event' THEN 'Channel'
    WHEN reply_to IS NOT NULL THEN 'Replies'
    ELSE 'Messages'
  END`;
151
+
152
/** Pre-aggregated analytics payload produced by `getAnalytics`. */
export interface AnalyticsData {
  /** Aggregation period the payload covers. */
  period: AnalyticsPeriod;
  /** Start of the window, in epoch milliseconds. */
  start: number;
  /** End of the window, in epoch milliseconds. */
  now: number;
  /** Width of one bucket, in milliseconds. */
  bucketMs: number;
  /** Number of buckets in the window. */
  bucketCount: number;
  /** Per-bucket category counts, oldest → newest, length === bucketCount. */
  buckets: { start: number; counts: Record<AnalyticsCategory, number> }[];
  /** Category totals over the whole period (powers the breakdown donut). */
  categories: Record<AnalyticsCategory, number>;
  /** Busiest-hours grid: heatmap[dayOfWeek 0=Sun][hour 0-23], in server-local time. */
  heatmap: number[][];
  /** Total number of messages counted in the window. */
  totalMessages: number;
  /** Total number of reactions on messages counted in the window. */
  totalReactions: number;
}
167
+
168
/**
 * Creates a fresh counter record with every analytics category set to zero.
 *
 * A new object is built on each call, so callers may mutate the result freely.
 *
 * @returns A zeroed `Record<AnalyticsCategory, number>`.
 */
export function emptyCategoryCounts(): Record<AnalyticsCategory, number> {
  const zeroed: Record<AnalyticsCategory, number> = {
    Messages: 0,
    Replies: 0,
    "Work items": 0,
    System: 0,
    Pair: 0,
    Channel: 0,
  };
  return zeroed;
}
171
+
172
/**
 * Aggregates message activity over the requested period into the
 * pre-bucketed shape the dashboard renders directly.
 *
 * The window is `(now - periodMs, now]`. It is split into the period's
 * configured number of equal-width buckets; each message is assigned a category
 * by `ANALYTICS_CATEGORY_SQL` and counted into its bucket.
 *
 * @param period Aggregation period. A value that is not an own key of
 *   `ANALYTICS_PERIODS` (possible only from untyped callers) falls back to
 *   `"24h"`, and the returned `period` reports the period actually used.
 * @param now End of the window in epoch milliseconds; defaults to `Date.now()`.
 * @returns Per-bucket counts, per-category totals, the day-of-week × hour
 *   heatmap and the message/reaction totals for the window.
 */
export function getAnalytics(period: AnalyticsPeriod, now: number = Date.now()): AnalyticsData {
  // Own-property lookup: a plain `ANALYTICS_PERIODS[period] ?? fallback` would
  // let inherited keys such as "constructor" through (they are not nullish),
  // producing an undefined `ms` and a NaN window.
  const resolvedPeriod: AnalyticsPeriod = Object.prototype.hasOwnProperty.call(ANALYTICS_PERIODS, period)
    ? period
    : "24h";
  const { ms, buckets: bucketCount } = ANALYTICS_PERIODS[resolvedPeriod];
  const start = now - ms;
  const bucketMs = ms / bucketCount;

  const buckets = Array.from({ length: bucketCount }, (_, i) => ({
    start: start + i * bucketMs,
    counts: emptyCategoryCounts(),
  }));
  const categories = emptyCategoryCounts();

  // One pass: bucket index × category counts. CAST floors toward zero, so
  // created_at in (start, now) maps to [0, bucketCount); a row with
  // created_at === now lands exactly on bucketCount and is folded into the
  // last bucket by the clamp below.
  const volumeRows = getDb()
    .query(
      `SELECT CAST((created_at - ?) / ? AS INTEGER) AS bucket, ${ANALYTICS_CATEGORY_SQL} AS category, COUNT(*) AS c
       FROM messages WHERE created_at > ? AND created_at <= ? GROUP BY bucket, category`,
    )
    .all(start, bucketMs, start, now) as Array<{ bucket: number; category: AnalyticsCategory; c: number }>;
  for (const row of volumeRows) {
    const idx = Math.min(Math.max(row.bucket, 0), bucketCount - 1);
    const bucket = buckets[idx];
    if (bucket && row.category in bucket.counts) {
      bucket.counts[row.category] += row.c;
      categories[row.category] += row.c;
    }
  }
  const totalMessages = Object.values(categories).reduce((sum, v) => sum + v, 0);

  // Busiest-hours heatmap (day-of-week × hour), server-local time. The query
  // groups by (dow, hour), so each cell is written at most once.
  const heatmap: number[][] = Array.from({ length: 7 }, () => new Array(24).fill(0));
  const heatRows = getDb()
    .query(
      `SELECT CAST(strftime('%w', created_at / 1000, 'unixepoch', 'localtime') AS INTEGER) AS dow,
       CAST(strftime('%H', created_at / 1000, 'unixepoch', 'localtime') AS INTEGER) AS hour, COUNT(*) AS c
       FROM messages WHERE created_at > ? AND created_at <= ? GROUP BY dow, hour`,
    )
    .all(start, now) as Array<{ dow: number; hour: number; c: number }>;
  for (const row of heatRows) {
    const day = heatmap[row.dow];
    if (day && row.hour >= 0 && row.hour < 24) day[row.hour] = row.c;
  }

  // Reactions are attributed to the window by the creation time of the message
  // they are attached to.
  const totalReactions = (
    getDb()
      .query(
        `SELECT COUNT(*) AS c FROM message_reactions r
         JOIN messages m ON m.id = r.message_id WHERE m.created_at > ? AND m.created_at <= ?`,
      )
      .get(start, now) as { c: number }
  ).c;

  return {
    period: resolvedPeriod,
    start,
    now,
    bucketMs,
    bucketCount,
    buckets,
    categories,
    heatmap,
    totalMessages,
    totalReactions,
  };
}
226
+
227
/**
 * Builds the relay health report from a series of independent checks.
 *
 * Checks, in order: database reachability, non-offline agents that missed the
 * heartbeat TTL, expired or lease-less message claims, expired or lease-less
 * task claims, active tasks claimed by offline agents, unhealthy channel
 * delivery targets, and runner-managed agents that use the root-token fallback
 * or lack scoped token profile metadata.
 *
 * The overall status is `"error"` if any check errored, `"degraded"` if any
 * check warned, and `"ok"` otherwise.
 *
 * @param now Reference time in epoch milliseconds; defaults to `Date.now()`.
 * @returns The report. When the database probe fails, the report contains
 *   only the failing `database` check, because every other check reads from
 *   the same database.
 */
export function getHealth(now: number = Date.now()): HealthReport {
  const checks: HealthCheck[] = [];

  try {
    getDb().query("SELECT 1").get();
    checks.push({ name: "database", status: "ok" });
  } catch (e) {
    checks.push({ name: "database", status: "error", detail: e instanceof Error ? e.message : "database check failed" });
    // Every remaining check queries the same database. Continuing would throw
    // on the next query and discard the error check recorded above, so report
    // the failure now instead.
    return { status: "error", version: VERSION, generatedAt: now, checks };
  }

  // Narrows an agent's `meta.auth` value to a plain record ({} when it is
  // missing, an array, or not an object).
  const toAuthRecord = (auth: unknown): Record<string, unknown> =>
    auth !== null && typeof auth === "object" && !Array.isArray(auth)
      ? (auth as Record<string, unknown>)
      : {};

  const staleLiveAgents = (getDb()
    .query("SELECT COUNT(*) as c FROM agents WHERE status != 'offline' AND last_seen <= ? AND id NOT IN ('user', 'system')")
    .get(now - STALE_TTL_MS) as { c: number }).c;
  checks.push({
    name: "stale-live-agents",
    status: staleLiveAgents > 0 ? "warn" : "ok",
    count: staleLiveAgents,
    detail: staleLiveAgents > 0 ? `${staleLiveAgents} non-offline agent(s) missed heartbeat TTL` : undefined,
  });

  // A claim on a message whose task already reached a terminal status is not
  // counted as expired.
  const expiredMessageClaims = (getDb()
    .query("SELECT COUNT(*) as c FROM messages WHERE claimed_by IS NOT NULL AND (claim_expires_at IS NULL OR claim_expires_at <= ?) AND NOT EXISTS (SELECT 1 FROM tasks t WHERE t.message_id = messages.id AND t.status IN ('done', 'failed', 'canceled'))")
    .get(now) as { c: number }).c;
  checks.push({
    name: "expired-message-claims",
    status: expiredMessageClaims > 0 ? "warn" : "ok",
    count: expiredMessageClaims,
    detail: expiredMessageClaims > 0 ? `${expiredMessageClaims} message claim(s) are expired or missing a lease` : undefined,
  });

  const expiredTaskClaims = (getDb()
    .query("SELECT COUNT(*) as c FROM tasks WHERE claimed_by IS NOT NULL AND (claim_expires_at IS NULL OR claim_expires_at <= ?) AND status IN ('claimed', 'in_progress', 'blocked')")
    .get(now) as { c: number }).c;
  checks.push({
    name: "expired-task-claims",
    status: expiredTaskClaims > 0 ? "warn" : "ok",
    count: expiredTaskClaims,
    detail: expiredTaskClaims > 0 ? `${expiredTaskClaims} active task claim(s) are expired or missing a lease` : undefined,
  });

  const offlineClaimedTasks = (getDb()
    .query(`
      SELECT COUNT(*) as c
      FROM tasks t
      JOIN agents a ON a.id = t.claimed_by
      WHERE t.status IN ('claimed', 'in_progress', 'blocked') AND a.status = 'offline'
    `)
    .get() as { c: number }).c;
  checks.push({
    name: "offline-claimed-tasks",
    status: offlineClaimedTasks > 0 ? "warn" : "ok",
    count: offlineClaimedTasks,
    detail: offlineClaimedTasks > 0 ? `${offlineClaimedTasks} active task(s) are claimed by offline agents` : undefined,
  });

  const unhealthyChannelTargets = listChannels().filter((channel) => channel.targetHealth && channel.targetHealth.status !== "ok");
  const channelTargetErrors = unhealthyChannelTargets.filter((channel) => channel.targetHealth?.status === "error");
  // Only the first three offenders are spelled out; the rest are summarised
  // as "...". Fall back to the target status when no detail text is present,
  // so the summary never renders the literal "undefined".
  const channelTargetDetail = unhealthyChannelTargets
    .slice(0, 3)
    .map((channel) => `${channel.name}: ${channel.targetHealth?.detail ?? channel.targetHealth?.status}`)
    .join("; ");
  checks.push({
    name: "channel-delivery-targets",
    status: channelTargetErrors.length > 0 ? "error" : unhealthyChannelTargets.length > 0 ? "warn" : "ok",
    count: unhealthyChannelTargets.length,
    detail: unhealthyChannelTargets.length > 0
      ? `${unhealthyChannelTargets.length} channel delivery target issue(s): ${channelTargetDetail}${unhealthyChannelTargets.length > 3 ? "; ..." : ""}`
      : undefined,
  });

  // Live, runner-managed agents that either run on the root-token fallback or
  // carry no non-empty string `auth.profileId`.
  const rootTokenRuntimeAgents = listAgents().filter((agent) => {
    if (agent.status === "offline") return false;
    if (agent.id === "user" || agent.id === "system") return false;
    const meta = agent.meta ?? {};
    if (meta.runnerManaged !== true) return false;
    const auth = toAuthRecord(meta.auth);
    return auth.rootTokenFallback === true || typeof auth.profileId !== "string" || auth.profileId.length === 0;
  });
  checks.push({
    name: "root-token-runtime-drift",
    status: rootTokenRuntimeAgents.length > 0 ? "warn" : "ok",
    count: rootTokenRuntimeAgents.length,
    detail: rootTokenRuntimeAgents.length > 0
      ? `${rootTokenRuntimeAgents.length} runtime agent(s) are using or missing scoped token metadata`
      : undefined,
    subjects: rootTokenRuntimeAgents.map((agent) => {
      const auth = toAuthRecord(agent.meta?.auth);
      const reason = auth.rootTokenFallback === true
        ? "using root-token fallback"
        : "missing scoped token profile metadata";
      return {
        id: agent.id,
        label: agent.label || agent.name || agent.id,
        status: agent.status,
        detail: reason,
      };
    }),
  });

  const status = checks.some((check) => check.status === "error")
    ? "error"
    : checks.some((check) => check.status === "warn")
      ? "degraded"
      : "ok";
  return { status, version: VERSION, generatedAt: now, checks };
}
337
+