agent-regression-lab 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ import { DatabaseSync } from "node:sqlite";
2
+ import { writeFileSync } from "node:fs";
3
+ import { resolve } from "node:path";
4
+ import { ensureParentDir } from "./lib/fs.js";
5
/** Absolute path of the SQLite database file, resolved against the current working directory. */
const DB_PATH = resolve("artifacts", "agentlab.db");

/** Schema version string recorded in, and checked against, the `metadata` table. */
const SCHEMA_VERSION = "2";
7
/**
 * SQLite-backed store for scenarios, agent versions and run results.
 *
 * The constructor opens the database at `DB_PATH`, creates any missing tables
 * and verifies the stored schema version. All methods are synchronous.
 */
export class Storage {
  /** Open `DatabaseSync` handle; assigned in the constructor. */
  db;

  /**
   * Opens (or creates) the database file and prepares the schema.
   *
   * @throws {Error} If the database records a schema version other than `SCHEMA_VERSION`.
   */
  constructor() {
    ensureParentDir(DB_PATH);
    this.db = new DatabaseSync(DB_PATH);
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS metadata (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS scenarios (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        suite TEXT NOT NULL,
        description TEXT,
        tags_json TEXT,
        difficulty TEXT,
        file_path TEXT NOT NULL,
        file_hash TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS agent_versions (
        id TEXT PRIMARY KEY,
        label TEXT NOT NULL,
        model_id TEXT,
        provider TEXT,
        command TEXT,
        args_json TEXT,
        config_json TEXT NOT NULL,
        created_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS runs (
        id TEXT PRIMARY KEY,
        scenario_id TEXT NOT NULL,
        scenario_file_hash TEXT NOT NULL,
        agent_version_id TEXT NOT NULL,
        status TEXT NOT NULL,
        termination_reason TEXT NOT NULL,
        final_output TEXT NOT NULL,
        total_steps INTEGER NOT NULL,
        total_tool_calls INTEGER NOT NULL,
        duration_ms INTEGER NOT NULL,
        total_tokens INTEGER,
        total_cost_usd REAL,
        score INTEGER NOT NULL,
        started_at TEXT NOT NULL,
        finished_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS run_steps (
        id TEXT PRIMARY KEY,
        run_id TEXT NOT NULL,
        step_index INTEGER NOT NULL,
        timestamp TEXT NOT NULL,
        source TEXT NOT NULL,
        type TEXT NOT NULL,
        payload_json TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS tool_calls (
        id TEXT PRIMARY KEY,
        run_id TEXT NOT NULL,
        step_index INTEGER NOT NULL,
        tool_name TEXT NOT NULL,
        input_json TEXT NOT NULL,
        output_json TEXT,
        status TEXT NOT NULL,
        duration_ms INTEGER,
        error_message TEXT
      );

      CREATE TABLE IF NOT EXISTS evaluator_results (
        id TEXT PRIMARY KEY,
        run_id TEXT NOT NULL,
        evaluator_id TEXT NOT NULL,
        evaluator_type TEXT NOT NULL,
        mode TEXT NOT NULL,
        status TEXT NOT NULL,
        raw_score REAL,
        normalized_score REAL,
        weight REAL,
        message TEXT NOT NULL,
        details_json TEXT
      );
    `);
    this.ensureSchemaVersion();
    this.ensureAgentVersionColumns();
  }

  /**
   * Inserts a scenario row, or refreshes every column except `created_at`
   * when a row with the same id already exists.
   *
   * @param {object} summary - Supplies `id`, `name`, `suite`, and optional `description` / `difficulty`.
   * @param {object} definition - Supplies the optional `tags` array (stored as JSON, `[]` when absent).
   * @param {string} filePath - Stored in `file_path`.
   * @param {string} fileHash - Stored in `file_hash`.
   */
  upsertScenario(summary, definition, filePath, fileHash) {
    const now = new Date().toISOString();
    this.db
      .prepare(`INSERT INTO scenarios (id, name, suite, description, tags_json, difficulty, file_path, file_hash, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
          name = excluded.name,
          suite = excluded.suite,
          description = excluded.description,
          tags_json = excluded.tags_json,
          difficulty = excluded.difficulty,
          file_path = excluded.file_path,
          file_hash = excluded.file_hash,
          updated_at = excluded.updated_at`)
      .run(
        summary.id,
        summary.name,
        summary.suite,
        summary.description ?? null,
        JSON.stringify(definition.tags ?? []),
        summary.difficulty ?? null,
        filePath,
        fileHash,
        now,
        now,
      );
  }

  /**
   * Inserts an agent version row, or refreshes every column except `created_at`
   * when a row with the same id already exists.
   *
   * @param {object} agentVersion - Supplies `id`, `label`, `config`, and optional
   *   `modelId`, `provider`, `command`, `args` (stored as JSON, `[]` when absent).
   */
  upsertAgentVersion(agentVersion) {
    const now = new Date().toISOString();
    this.db
      .prepare(`INSERT INTO agent_versions (id, label, model_id, provider, command, args_json, config_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
          label = excluded.label,
          model_id = excluded.model_id,
          provider = excluded.provider,
          command = excluded.command,
          args_json = excluded.args_json,
          config_json = excluded.config_json`)
      .run(
        agentVersion.id,
        agentVersion.label,
        agentVersion.modelId ?? null,
        agentVersion.provider ?? null,
        agentVersion.command ?? null,
        JSON.stringify(agentVersion.args ?? []),
        JSON.stringify(agentVersion.config),
        now,
      );
  }

  /**
   * Persists a finished run together with its trace events, tool calls and
   * evaluator results, then writes the trace artifact file.
   *
   * All rows, including the `runs` row itself, are written inside a single
   * transaction, so a failure part-way through leaves no orphaned run row
   * behind and the same bundle can be saved again after the problem is fixed.
   *
   * @param {object} bundle - `{ run, traceEvents, toolCalls, evaluatorResults }`.
   * @throws Re-throws whatever error aborted the transaction (after `ROLLBACK`).
   */
  saveRun(bundle) {
    const { run, traceEvents, toolCalls, evaluatorResults } = bundle;
    const insertRun = this.db.prepare(`INSERT INTO runs (
        id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
        total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
    const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
      VALUES (?, ?, ?, ?, ?, ?, ?)`);
    const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`);
    const insertEval = this.db.prepare(`INSERT INTO evaluator_results (id, run_id, evaluator_id, evaluator_type, mode, status, raw_score, normalized_score, weight, message, details_json)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);

    this.db.exec("BEGIN");
    try {
      insertRun.run(
        run.id,
        run.scenarioId,
        run.scenarioFileHash,
        run.agentVersionId,
        run.status,
        run.terminationReason,
        run.finalOutput,
        run.totalSteps,
        run.totalToolCalls,
        run.durationMs,
        run.totalTokens ?? null,
        run.totalCostUsd ?? null,
        run.score,
        run.startedAt,
        run.finishedAt,
      );
      for (const event of traceEvents) {
        insertStep.run(
          event.eventId,
          run.id,
          event.stepIndex,
          event.timestamp,
          event.source,
          event.type,
          JSON.stringify(event.payload),
        );
      }
      for (const toolCall of toolCalls) {
        insertTool.run(
          toolCall.id,
          run.id,
          toolCall.stepIndex,
          toolCall.toolName,
          JSON.stringify(toolCall.input),
          // A missing output is stored as SQL NULL rather than the string "undefined".
          toolCall.output === undefined ? null : JSON.stringify(toolCall.output),
          toolCall.status,
          toolCall.durationMs ?? null,
          toolCall.errorMessage ?? null,
        );
      }
      for (const result of evaluatorResults) {
        insertEval.run(
          // Row id is derived from the run and evaluator ids.
          `${run.id}:${result.evaluatorId}`,
          run.id,
          result.evaluatorId,
          result.evaluatorType,
          result.mode,
          result.status,
          result.rawScore ?? null,
          result.normalizedScore ?? null,
          result.weight ?? null,
          result.message,
          result.details ? JSON.stringify(result.details) : null,
        );
      }
      this.db.exec("COMMIT");
    } catch (error) {
      this.db.exec("ROLLBACK");
      throw error;
    }
    // The artifact is written only after the database commit succeeded.
    this.writeTraceArtifact(run.id, traceEvents);
  }

  /**
   * Lists run summaries, newest `started_at` first.
   *
   * Runs are inner-joined with their scenario and agent version, so a run whose
   * scenario or agent version row is missing does not appear in the result.
   *
   * @param {{suite?: string, status?: string, provider?: string}} [filters] -
   *   Each truthy filter adds an equality condition; all conditions are AND-ed.
   * @returns {object[]} Rows with camelCase column aliases.
   */
  listRuns(filters = {}) {
    const clauses = [];
    const values = [];
    if (filters.suite) {
      clauses.push("s.suite = ?");
      values.push(filters.suite);
    }
    if (filters.status) {
      clauses.push("r.status = ?");
      values.push(filters.status);
    }
    if (filters.provider) {
      clauses.push("av.provider = ?");
      values.push(filters.provider);
    }
    // Only fixed fragments are spliced into the SQL text; user values stay bound parameters.
    const whereClause = clauses.length > 0 ? `WHERE ${clauses.join(" AND ")}` : "";
    return this.db
      .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
          av.label as agentLabel, av.provider, av.model_id as modelId,
          r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
          r.started_at as startedAt
        FROM runs r
        JOIN scenarios s ON s.id = r.scenario_id
        JOIN agent_versions av ON av.id = r.agent_version_id
        ${whereClause}
        ORDER BY r.started_at DESC`)
      .all(...values);
  }

  /**
   * Loads a run with its trace events, tool calls, evaluator results and agent version.
   *
   * @param {string} runId
   * @returns {object|null} `{ run, traceEvents, toolCalls, evaluatorResults, agentVersion }`,
   *   or `null` when no run has that id. `agentVersion` is `undefined` when its row is missing.
   */
  getRun(runId) {
    const run = this.getRunRecord(runId);
    if (!run) {
      return null;
    }

    const traceEvents = this.db
      .prepare(`SELECT id as eventId, run_id as runId, step_index as stepIndex, timestamp, source, type, payload_json
        FROM run_steps WHERE run_id = ? ORDER BY step_index ASC`)
      .all(runId)
      .map((row) => ({
        eventId: row.eventId,
        runId: row.runId,
        scenarioId: run.scenarioId,
        stepIndex: row.stepIndex,
        timestamp: row.timestamp,
        source: row.source,
        type: row.type,
        payload: JSON.parse(row.payload_json),
      }));

    const toolCalls = this.db
      .prepare(`SELECT id, step_index as stepIndex, tool_name as toolName, input_json, output_json, status, duration_ms as durationMs, error_message as errorMessage
        FROM tool_calls WHERE run_id = ? ORDER BY step_index ASC`)
      .all(runId)
      .map((row) => ({
        id: row.id,
        stepIndex: row.stepIndex,
        toolName: row.toolName,
        input: JSON.parse(row.input_json),
        output: row.output_json ? JSON.parse(row.output_json) : undefined,
        status: row.status,
        // SQL NULLs are surfaced as `undefined`.
        durationMs: row.durationMs ?? undefined,
        errorMessage: row.errorMessage ?? undefined,
      }));

    const evaluatorResults = this.db
      .prepare(`SELECT evaluator_id as evaluatorId, evaluator_type as evaluatorType, mode, status, raw_score as rawScore,
          normalized_score as normalizedScore, weight, message, details_json
        FROM evaluator_results WHERE run_id = ? ORDER BY evaluator_id ASC`)
      .all(runId)
      .map((row) => ({
        evaluatorId: row.evaluatorId,
        evaluatorType: row.evaluatorType,
        mode: row.mode,
        status: row.status,
        rawScore: row.rawScore ?? undefined,
        normalizedScore: row.normalizedScore ?? undefined,
        weight: row.weight ?? undefined,
        message: row.message,
        details: row.details_json ? JSON.parse(row.details_json) : undefined,
      }));

    const agentVersion = this.db
      .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
        FROM agent_versions WHERE id = ?`)
      .get(run.agentVersionId);

    return {
      run,
      traceEvents,
      toolCalls,
      evaluatorResults,
      agentVersion: agentVersion
        ? {
            id: agentVersion.id,
            label: agentVersion.label,
            modelId: agentVersion.modelId ?? undefined,
            provider: agentVersion.provider ?? undefined,
            command: agentVersion.command ?? undefined,
            args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
            config: JSON.parse(agentVersion.config_json),
          }
        : undefined,
    };
  }

  /**
   * Compares two runs of the same scenario revision.
   *
   * @param {string} baselineRunId
   * @param {string} candidateRunId
   * @returns {object} `{ baseline, candidate, notes, deltas, evaluatorDiffs, toolDiffs }`,
   *   where each delta is `candidate - baseline`.
   * @throws {Error} If either run is missing, or the runs differ in scenario id or scenario file hash.
   */
  compareRuns(baselineRunId, candidateRunId) {
    const baseline = this.getRun(baselineRunId);
    const candidate = this.getRun(candidateRunId);
    if (!baseline) {
      throw new Error(`Run '${baselineRunId}' not found.`);
    }
    if (!candidate) {
      throw new Error(`Run '${candidateRunId}' not found.`);
    }
    if (baseline.run.scenarioId !== candidate.run.scenarioId) {
      throw new Error("Runs can only be compared when they share the same scenario id.");
    }
    if (baseline.run.scenarioFileHash !== candidate.run.scenarioFileHash) {
      throw new Error("Runs can only be compared when they share the same scenario file hash.");
    }

    const notes = [];
    if (baseline.run.status !== candidate.run.status) {
      notes.push(`Verdict changed: ${baseline.run.status} -> ${candidate.run.status}`);
    }
    if (baseline.run.score !== candidate.run.score) {
      notes.push(`Score changed: ${baseline.run.score} -> ${candidate.run.score}`);
    }
    if (baseline.run.totalSteps !== candidate.run.totalSteps) {
      notes.push(`Steps changed: ${baseline.run.totalSteps} -> ${candidate.run.totalSteps}`);
    }
    if (baseline.run.durationMs !== candidate.run.durationMs) {
      notes.push(`Runtime changed: ${baseline.run.durationMs}ms -> ${candidate.run.durationMs}ms`);
    }
    if (baseline.run.terminationReason !== candidate.run.terminationReason) {
      notes.push(`Termination changed: ${baseline.run.terminationReason} -> ${candidate.run.terminationReason}`);
    }

    const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
    const toolDiffs = buildToolDiffs(baseline, candidate);
    return {
      baseline,
      candidate,
      notes,
      deltas: {
        score: candidate.run.score - baseline.run.score,
        runtimeMs: candidate.run.durationMs - baseline.run.durationMs,
        steps: candidate.run.totalSteps - baseline.run.totalSteps,
      },
      evaluatorDiffs,
      toolDiffs,
    };
  }

  /**
   * Fetches the raw `runs` row for an id.
   *
   * @param {string} runId
   * @returns {object|null} The row with camelCase aliases, or `null` when absent.
   */
  getRunRecord(runId) {
    const row = this.db
      .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
          status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
          total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
          total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
        FROM runs WHERE id = ?`)
      .get(runId);
    return row ?? null;
  }

  /**
   * Same as `getRunRecord`, but a missing run is an error.
   *
   * @param {string} runId
   * @returns {object}
   * @throws {Error} If no run has that id.
   */
  getRunRecordOrThrow(runId) {
    const run = this.getRunRecord(runId);
    if (!run) {
      throw new Error(`Run '${runId}' not found.`);
    }
    return run;
  }

  /**
   * Writes the trace events as pretty-printed JSON to `artifacts/<runId>/trace.json`.
   *
   * @param {string} runId
   * @param {object[]} events
   */
  writeTraceArtifact(runId, events) {
    const path = resolve("artifacts", runId, "trace.json");
    ensureParentDir(path);
    writeFileSync(path, JSON.stringify(events, null, 2));
  }

  /**
   * Records `SCHEMA_VERSION` in a fresh database, or verifies that an existing
   * database carries exactly that version.
   *
   * @throws {Error} If the stored version differs from `SCHEMA_VERSION`.
   */
  ensureSchemaVersion() {
    const existing = this.db
      .prepare(`SELECT value FROM metadata WHERE key = 'schema_version'`)
      .get();
    if (!existing) {
      this.db.prepare(`INSERT INTO metadata (key, value) VALUES ('schema_version', ?)`).run(SCHEMA_VERSION);
      return;
    }
    if (existing.value !== SCHEMA_VERSION) {
      throw new Error(`Unsupported database schema version '${existing.value}'. Expected '${SCHEMA_VERSION}'. Remove artifacts/agentlab.db or add a migration.`);
    }
  }

  /**
   * Adds the `command` and `args_json` columns to `agent_versions` when the
   * existing table does not have them yet.
   */
  ensureAgentVersionColumns() {
    const columns = this.db.prepare(`PRAGMA table_info(agent_versions)`).all();
    const names = new Set(columns.map((column) => column.name));
    if (!names.has("command")) {
      this.db.exec(`ALTER TABLE agent_versions ADD COLUMN command TEXT`);
    }
    if (!names.has("args_json")) {
      this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
    }
  }
}
351
/**
 * Lists the evaluators whose status differs between two run details.
 *
 * An evaluator present on only one side counts as changed; its absent status is
 * reported as `undefined` in the record and as "missing" in the note.
 *
 * @param {{evaluatorResults: object[]}} baseline
 * @param {{evaluatorResults: object[]}} candidate
 * @returns {object[]} `{ evaluatorId, baselineStatus, candidateStatus, note }` records sorted by evaluator id.
 */
function buildEvaluatorDiffs(baseline, candidate) {
  // Index each side once; the first result per evaluator id wins.
  const indexById = (results) => {
    const byId = new Map();
    for (const entry of results) {
      if (!byId.has(entry.evaluatorId)) {
        byId.set(entry.evaluatorId, entry);
      }
    }
    return byId;
  };
  const baselineById = indexById(baseline.evaluatorResults);
  const candidateById = indexById(candidate.evaluatorResults);
  const evaluatorIds = [...new Set([...baselineById.keys(), ...candidateById.keys()])].sort();

  const diffs = [];
  for (const evaluatorId of evaluatorIds) {
    const baselineStatus = baselineById.get(evaluatorId)?.status;
    const candidateStatus = candidateById.get(evaluatorId)?.status;
    if (baselineStatus === candidateStatus) {
      continue;
    }
    diffs.push({
      evaluatorId,
      baselineStatus,
      candidateStatus,
      note: `Evaluator '${evaluatorId}' changed: ${baselineStatus ?? "missing"} -> ${candidateStatus ?? "missing"}`,
    });
  }
  return diffs;
}
373
/**
 * Lists the tools whose call count differs between two run details.
 *
 * @param {{toolCalls: object[]}} baseline
 * @param {{toolCalls: object[]}} candidate
 * @returns {object[]} `{ toolName, baselineCount, candidateCount, note }` records sorted by tool name.
 */
function buildToolDiffs(baseline, candidate) {
  // Count calls per tool name in one pass per side.
  const tally = (calls) => {
    const counts = new Map();
    for (const { toolName } of calls) {
      counts.set(toolName, (counts.get(toolName) ?? 0) + 1);
    }
    return counts;
  };
  const baselineCounts = tally(baseline.toolCalls);
  const candidateCounts = tally(candidate.toolCalls);
  const toolNames = [...new Set([...baselineCounts.keys(), ...candidateCounts.keys()])].sort();

  return toolNames.flatMap((toolName) => {
    const baselineCount = baselineCounts.get(toolName) ?? 0;
    const candidateCount = candidateCounts.get(toolName) ?? 0;
    if (baselineCount === candidateCount) {
      return [];
    }
    return [
      {
        toolName,
        baselineCount,
        candidateCount,
        note: `Tool '${toolName}' usage changed: ${baselineCount} -> ${candidateCount}`,
      },
    ];
  });
}
package/dist/tools.js ADDED
@@ -0,0 +1,128 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { pathToFileURL } from "node:url";
3
+ import { resolve } from "node:path";
4
+ import { loadAgentLabConfig } from "./config.js";
5
/**
 * Reads a JSON fixture from disk and returns the parsed value.
 *
 * @param {string} path - File path; relative paths are resolved against the current working directory.
 * @returns {*} The parsed JSON content.
 */
function loadFixture(path) {
  const absolutePath = resolve(path);
  return JSON.parse(readFileSync(absolutePath, "utf8"));
}
9
/** Builds the input schema for a tool that takes exactly one required string field. */
function singleStringInputSchema(field, description) {
  return {
    type: "object",
    additionalProperties: false,
    properties: {
      [field]: { type: "string", description },
    },
    required: [field],
  };
}

/** Validates that `input` is an object and reads `input[field]` as a string ("" when null/undefined). */
function readStringField(input, field) {
  assertObject(input);
  return String(input[field] ?? "");
}

/**
 * Tools that ship with the lab. Each entry pairs a `spec` (name, description,
 * input schema) with an async `handler` that answers from the JSON fixtures
 * under `fixtures/support/`.
 */
const BUILTIN_TOOLS = [
  {
    spec: {
      name: "crm.search_customer",
      description: "Find a customer by email.",
      inputSchema: singleStringInputSchema("email", "Customer email address."),
    },
    handler: async (input) => {
      const email = readStringField(input, "email");
      const customers = loadFixture("fixtures/support/customers.json");
      const match = customers.find((entry) => entry.email === email);
      if (match) {
        return match;
      }
      throw new Error(`Customer with email '${email}' not found.`);
    },
  },
  {
    spec: {
      name: "orders.list",
      description: "List orders for a given customer id.",
      inputSchema: singleStringInputSchema("customer_id", "Customer id returned from CRM."),
    },
    handler: async (input) => {
      const customerId = readStringField(input, "customer_id");
      const allOrders = loadFixture("fixtures/support/orders.json");
      return allOrders.filter((entry) => entry.customer_id === customerId);
    },
  },
  {
    spec: {
      name: "orders.refund",
      description: "Refund a single order by id.",
      inputSchema: singleStringInputSchema("order_id", "Order id to refund."),
    },
    handler: async (input) => {
      const orderId = readStringField(input, "order_id");
      const allOrders = loadFixture("fixtures/support/orders.json");
      const match = allOrders.find((entry) => entry.id === orderId);
      if (!match) {
        throw new Error(`Order '${orderId}' not found.`);
      }
      // Only reports the refund; the fixture file is not modified.
      const { id, amount, currency } = match;
      return { refunded: true, order_id: id, amount, currency };
    },
  },
];
84
/**
 * Loads every available tool and indexes the handlers by tool name.
 *
 * @returns {Promise<Object<string, Function>>} Map of `spec.name` to handler.
 */
export async function loadToolRegistry() {
  const tools = await loadTools();
  const entries = tools.map(({ spec, handler }) => [spec.name, handler]);
  return Object.fromEntries(entries);
}
88
/**
 * Loads every available tool and returns only the specs, in load order.
 *
 * @returns {Promise<object[]>} The `spec` of each loaded tool.
 */
export async function loadToolSpecs() {
  const specs = [];
  for (const { spec } of await loadTools()) {
    specs.push(spec);
  }
  return specs;
}
92
/**
 * Returns the specs of the built-in tools only; no configuration is read.
 *
 * @returns {object[]} The `spec` of each entry in `BUILTIN_TOOLS`.
 */
export function getBuiltinToolSpecs() {
  return Array.from(BUILTIN_TOOLS, ({ spec }) => spec);
}
95
/**
 * Returns the built-in tools followed by the tools declared in the lab config.
 *
 * @returns {Promise<object[]>} `{ spec, handler }` entries, built-ins first.
 * @throws {Error} If two tools register the same name.
 */
async function loadTools() {
  const { tools: configuredEntries } = loadAgentLabConfig();
  const pendingTools = (configuredEntries ?? []).map((entry) => loadConfiguredTool(entry));
  const merged = [...BUILTIN_TOOLS, ...(await Promise.all(pendingTools))];

  // Reject name clashes up front so a configured tool cannot shadow another one.
  const registeredNames = new Set();
  for (const { spec } of merged) {
    if (registeredNames.has(spec.name)) {
      throw new Error(`Duplicate tool registration for '${spec.name}'.`);
    }
    registeredNames.add(spec.name);
  }
  return merged;
}
108
/**
 * Imports the module named by a configured tool entry and wraps the requested
 * export as a `{ spec, handler }` tool.
 *
 * @param {object} tool - Config entry with `name`, `description`, `inputSchema`,
 *   `modulePath` (resolved against the current working directory) and `exportName`.
 * @returns {Promise<{spec: object, handler: Function}>}
 * @throws {Error} If the named export is not a function.
 */
async function loadConfiguredTool(tool) {
  const absolutePath = resolve(tool.modulePath);
  // Dynamic import needs a file:// URL rather than a bare filesystem path.
  const loadedModule = await import(pathToFileURL(absolutePath).href);
  const handler = loadedModule[tool.exportName];
  if (typeof handler !== "function") {
    throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
  }
  const { name, description, inputSchema } = tool;
  return {
    spec: { name, description, inputSchema },
    handler,
  };
}
124
/**
 * Guards that a tool input is a non-null, non-array object.
 *
 * @param {*} value
 * @throws {Error} If `value` is a primitive, `null` or an array.
 */
function assertObject(value) {
  const isObjectInput = value !== null && typeof value === "object" && !Array.isArray(value);
  if (!isObjectInput) {
    throw new Error("Tool input must be an object.");
  }
}
package/dist/trace.js ADDED
@@ -0,0 +1,30 @@
1
+ import { createEventId } from "./lib/id.js";
2
/**
 * Collects the trace events of a single run in memory, numbering them from 1.
 */
export class TraceRecorder {
  runId;
  scenarioId;
  /** Events in the order they were recorded. */
  events = [];
  /** Index of the most recently recorded event; 0 before the first one. */
  stepIndex = 0;

  /**
   * @param {string} runId - Copied onto every recorded event.
   * @param {string} scenarioId - Copied onto every recorded event.
   */
  constructor(runId, scenarioId) {
    this.runId = runId;
    this.scenarioId = scenarioId;
  }

  /**
   * Appends one event stamped with a fresh event id, the next step index and
   * the current time as an ISO-8601 string.
   *
   * @param {string} source
   * @param {string} type
   * @param {*} payload
   */
  record(source, type, payload) {
    this.stepIndex += 1;
    const event = {
      eventId: createEventId(),
      runId: this.runId,
      scenarioId: this.scenarioId,
      stepIndex: this.stepIndex,
      timestamp: new Date().toISOString(),
      source,
      type,
      payload,
    };
    this.events.push(event);
  }

  /**
   * @returns {object[]} A shallow copy of the recorded events, so callers cannot
   *   add to or remove from the recorder's own list.
   */
  getEvents() {
    return this.events.slice();
  }

  /** @returns {number} How many events have been recorded so far. */
  getStepCount() {
    return this.stepIndex;
  }
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/dist/ui/App.js ADDED
@@ -0,0 +1,85 @@
1
+ import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
2
+ import { useEffect, useState } from "react";
3
/**
 * Application shell: a top bar plus the page selected by the current URL
 * (run list, run detail or comparison).
 */
export function App() {
  const route = getRoute();

  const header = _jsx("header", {
    className: "topbar",
    children: _jsx("a", { className: "brand", href: "/", children: "Agent Regression Lab Alpha" }),
  });

  // Exactly one of these three slots is non-null for any given route.
  const listPage = route.type === "list" ? _jsx(RunListPage, {}) : null;
  const detailPage = route.type === "detail" ? _jsx(RunDetailPage, { runId: route.runId }) : null;
  const comparePage =
    route.type === "compare"
      ? _jsx(ComparePage, { baseline: route.baseline, candidate: route.candidate })
      : null;

  const main = _jsxs("main", { className: "page", children: [listPage, detailPage, comparePage] });
  return _jsxs("div", { className: "shell", children: [header, main] });
}
7
/**
 * Landing page: shows the runs returned by `/api/runs` and re-fetches whenever
 * the suite, status or provider filter changes.
 *
 * A superseded request is aborted so a slow, older response cannot overwrite
 * the list for the current filters, and a failed request is logged and shown
 * as an empty list instead of surfacing as an unhandled rejection.
 */
function RunListPage() {
  const [runs, setRuns] = useState([]);
  const [suite, setSuite] = useState("");
  const [status, setStatus] = useState("");
  const [provider, setProvider] = useState("");

  useEffect(() => {
    const url = new URL("/api/runs", window.location.origin);
    if (suite) {
      url.searchParams.set("suite", suite);
    }
    if (status) {
      url.searchParams.set("status", status);
    }
    if (provider) {
      url.searchParams.set("provider", provider);
    }

    const controller = new AbortController();
    async function loadRuns() {
      try {
        const response = await fetch(url, { signal: controller.signal });
        const data = await response.json();
        if (controller.signal.aborted) {
          return;
        }
        setRuns(Array.isArray(data?.runs) ? data.runs : []);
      } catch (error) {
        if (controller.signal.aborted) {
          return; // Superseded by a newer request or unmounted; nothing to report.
        }
        console.error("Failed to load runs.", error);
        setRuns([]);
      }
    }
    void loadRuns();
    return () => controller.abort();
  }, [suite, status, provider]);

  const hero = _jsxs("div", {
    className: "hero",
    children: [
      _jsx("h1", { children: "Runs" }),
      _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." }),
    ],
  });

  const filters = _jsxs("div", {
    className: "filters",
    children: [
      _jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
      _jsxs("select", {
        value: status,
        onChange: (event) => setStatus(event.target.value),
        children: [
          _jsx("option", { value: "", children: "All statuses" }),
          _jsx("option", { value: "pass", children: "Pass" }),
          _jsx("option", { value: "fail", children: "Fail" }),
          _jsx("option", { value: "error", children: "Error" }),
        ],
      }),
      _jsxs("select", {
        value: provider,
        onChange: (event) => setProvider(event.target.value),
        children: [
          _jsx("option", { value: "", children: "All providers" }),
          _jsx("option", { value: "mock", children: "Mock" }),
          _jsx("option", { value: "openai", children: "OpenAI" }),
          _jsx("option", { value: "external_process", children: "External process" }),
        ],
      }),
    ],
  });

  const headerRow = _jsxs("tr", {
    children: [
      _jsx("th", { children: "Run" }),
      _jsx("th", { children: "Scenario" }),
      _jsx("th", { children: "Provider" }),
      _jsx("th", { children: "Status" }),
      _jsx("th", { children: "Score" }),
      _jsx("th", { children: "Runtime" }),
      _jsx("th", { children: "Steps" }),
      _jsx("th", { children: "Started" }),
    ],
  });

  const renderRow = (run, index) => {
    const rowAbove = index > 0 ? runs[index - 1] : undefined;
    // NOTE(review): the row above is used as the baseline and this row as the
    // candidate. If the API lists newest runs first, that makes the newer run
    // the baseline — confirm this is the intended direction.
    const compareLink =
      rowAbove && rowAbove.scenarioId === run.scenarioId
        ? _jsx("div", {
            className: "muted",
            children: _jsx("a", {
              // URLSearchParams escapes ids so `&`, `#` or spaces cannot corrupt the query.
              href: `/compare?${new URLSearchParams({ baseline: rowAbove.id, candidate: run.id })}`,
              children: "compare previous",
            }),
          })
        : null;

    return _jsxs(
      "tr",
      {
        children: [
          _jsx("td", {
            // The detail route decodes this path segment, so it must be encoded here.
            children: _jsx("a", { href: `/runs/${encodeURIComponent(run.id)}`, children: run.id }),
          }),
          _jsx("td", { children: run.scenarioId }),
          _jsxs("td", {
            children: [
              run.provider ?? "-",
              _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" }),
            ],
          }),
          _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }),
          _jsx("td", { children: run.score }),
          _jsxs("td", { children: [run.durationMs, "ms"] }),
          _jsx("td", { children: run.totalSteps }),
          _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), compareLink] }),
        ],
      },
      run.id,
    );
  };

  const emptyState =
    runs.length === 0
      ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." })
      : null;

  const table =
    runs.length > 0
      ? _jsxs("table", {
          className: "table",
          children: [
            _jsx("thead", { children: headerRow }),
            _jsx("tbody", { children: runs.map(renderRow) }),
          ],
        })
      : null;

  return _jsxs("section", { children: [hero, filters, emptyState, table] });
}
26
/**
 * Detail page for one run: summary, evaluator results, tool calls and trace.
 *
 * The run is fetched from `/api/runs/<runId>`. A failed request, a non-2xx
 * response, or a body without a `run` is shown as an error state instead of
 * being rendered as if it were a run (which would crash on `detail.run`).
 *
 * @param {{runId: string}} props
 */
function RunDetailPage(props) {
  const [detail, setDetail] = useState(null);
  const [loadError, setLoadError] = useState(null);

  useEffect(() => {
    const controller = new AbortController();
    setDetail(null);
    setLoadError(null);

    async function loadDetail() {
      try {
        // The id is decoded when the route is parsed, so re-encode it for the request path.
        const response = await fetch(`/api/runs/${encodeURIComponent(props.runId)}`, {
          signal: controller.signal,
        });
        const payload = await response.json();
        if (controller.signal.aborted) {
          return;
        }
        if (!response.ok || !payload?.run) {
          // NOTE(review): assumes error bodies carry a string `error` field — confirm against the API.
          const reason =
            typeof payload?.error === "string" ? payload.error : `Request failed with status ${response.status}.`;
          throw new Error(reason);
        }
        setDetail(payload);
      } catch (error) {
        if (controller.signal.aborted) {
          return; // Unmounted or superseded; nothing to report.
        }
        setLoadError(error instanceof Error ? error.message : String(error));
      }
    }
    void loadDetail();
    return () => controller.abort();
  }, [props.runId]);

  if (loadError) {
    return _jsx(EmptyState, { title: "Run unavailable", description: loadError });
  }
  if (!detail) {
    return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
  }

  const { run, agentVersion } = detail;

  const hero = _jsxs("div", {
    className: "hero",
    children: [_jsx("h1", { children: run.id }), _jsx("p", { children: run.scenarioId })],
  });

  const stats = _jsxs("div", {
    className: "stats",
    children: [
      _jsx(Stat, {
        label: "Status",
        value: _jsx("span", { className: `pill ${run.status}`, children: run.status }),
      }),
      _jsx(Stat, { label: "Score", value: run.score }),
      _jsx(Stat, { label: "Runtime", value: `${run.durationMs}ms` }),
      _jsx(Stat, { label: "Steps", value: run.totalSteps }),
    ],
  });

  const summaryPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Summary" }),
      _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", agentVersion?.provider ?? "-"] }),
      _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", agentVersion?.modelId ?? "-"] }),
      agentVersion?.command
        ? _jsxs("p", {
            children: [
              _jsx("strong", { children: "Command:" }),
              " ",
              agentVersion.command,
              " ",
              (agentVersion.args ?? []).join(" "),
            ],
          })
        : null,
      _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", run.terminationReason] }),
      detail.errorDetail
        ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] })
        : null,
      _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }),
      _jsx("pre", { children: run.finalOutput || "(none)" }),
    ],
  });

  const evaluatorPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Evaluators" }),
      _jsx("ul", {
        className: "stack",
        children: detail.evaluatorResults.map((result) =>
          _jsxs(
            "li",
            {
              children: [
                _jsx("span", { className: `pill ${result.status}`, children: result.status }),
                " ",
                result.evaluatorId,
                _jsx("div", { className: "muted", children: result.message }),
              ],
            },
            result.evaluatorId,
          ),
        ),
      }),
    ],
  });

  const toolPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Tool Calls" }),
      detail.toolCalls.length === 0
        ? _jsx("p", { className: "muted", children: "No tool calls recorded." })
        : null,
      _jsx("ul", {
        className: "stack",
        children: detail.toolCalls.map((call) =>
          _jsxs(
            "li",
            {
              children: [
                _jsx("strong", { children: call.toolName }),
                " ",
                _jsx("span", { className: `pill ${call.status}`, children: call.status }),
                _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) }),
              ],
            },
            call.id,
          ),
        ),
      }),
    ],
  });

  const tracePanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Trace" }),
      _jsx("ol", {
        className: "timeline",
        children: detail.traceEvents.map((event) =>
          _jsxs(
            "li",
            {
              children: [
                _jsxs("div", {
                  children: [
                    _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }),
                    " ",
                    _jsx("span", { className: "muted", children: event.source }),
                  ],
                }),
                _jsx("pre", { children: JSON.stringify(event.payload, null, 2) }),
              ],
            },
            event.eventId,
          ),
        ),
      }),
    ],
  });

  return _jsxs("section", {
    children: [
      hero,
      stats,
      _jsxs("div", { className: "panel-grid", children: [summaryPanel, evaluatorPanel] }),
      toolPanel,
      tracePanel,
    ],
  });
}
38
/**
 * Side-by-side comparison of a baseline and a candidate run, fetched from
 * `/api/compare`.
 *
 * A failed request, a non-2xx response, or a body without both runs and the
 * deltas is shown as an error state instead of being rendered as a comparison
 * (which would crash on `data.baseline.run`).
 *
 * @param {{baseline?: string, candidate?: string}} props - Run ids to compare.
 */
function ComparePage(props) {
  const [data, setData] = useState(null);
  const [loadError, setLoadError] = useState(null);

  useEffect(() => {
    setData(null);
    setLoadError(null);
    if (!props.baseline || !props.candidate) {
      return undefined;
    }

    const url = new URL("/api/compare", window.location.origin);
    url.searchParams.set("baseline", props.baseline);
    url.searchParams.set("candidate", props.candidate);

    const controller = new AbortController();
    async function loadComparison() {
      try {
        const response = await fetch(url, { signal: controller.signal });
        const payload = await response.json();
        if (controller.signal.aborted) {
          return;
        }
        const isComparison = Boolean(payload?.baseline?.run && payload?.candidate?.run && payload?.deltas);
        if (!response.ok || !isComparison) {
          // NOTE(review): assumes error bodies carry a string `error` field — confirm against the API.
          const reason =
            typeof payload?.error === "string" ? payload.error : `Request failed with status ${response.status}.`;
          throw new Error(reason);
        }
        setData(payload);
      } catch (error) {
        if (controller.signal.aborted) {
          return; // Unmounted or superseded; nothing to report.
        }
        setLoadError(error instanceof Error ? error.message : String(error));
      }
    }
    void loadComparison();
    return () => controller.abort();
  }, [props.baseline, props.candidate]);

  if (!props.baseline || !props.candidate) {
    return _jsx(EmptyState, {
      title: "No comparison selected",
      description: "Open the compare page with baseline and candidate run ids.",
    });
  }
  if (loadError) {
    return _jsx(EmptyState, { title: "Comparison unavailable", description: loadError });
  }
  if (!data) {
    return _jsx(EmptyState, {
      title: "Loading comparison",
      description: "Fetching both runs and computing deltas.",
    });
  }

  const hero = _jsxs("div", {
    className: "hero",
    children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })],
  });

  const stats = _jsxs("div", {
    className: "stats",
    children: [
      _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
      _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
      _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) }),
    ],
  });

  const notesPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Notes" }),
      data.notes.length === 0
        ? _jsx("p", { className: "muted", children: "No material differences recorded." })
        : null,
      _jsx("ul", {
        className: "stack",
        children: data.notes.map((note) => _jsx("li", { children: note }, note)),
      }),
    ],
  });

  const evaluatorPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Evaluator diffs" }),
      data.evaluatorDiffs.length === 0
        ? _jsx("p", { className: "muted", children: "No evaluator changes." })
        : null,
      _jsx("ul", {
        className: "stack",
        children: data.evaluatorDiffs.map((diff) => _jsx("li", { children: diff.note }, diff.evaluatorId)),
      }),
    ],
  });

  const toolPanel = _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: "Tool diffs" }),
      data.toolDiffs.length === 0
        ? _jsx("p", { className: "muted", children: "No tool usage changes." })
        : null,
      _jsx("ul", {
        className: "stack",
        children: data.toolDiffs.map((diff) => _jsx("li", { children: diff.note }, diff.toolName)),
      }),
    ],
  });

  const sides = _jsxs("div", {
    className: "compare-grid",
    children: [
      _jsx(RunSide, { title: "Baseline", detail: data.baseline }),
      _jsx(RunSide, { title: "Candidate", detail: data.candidate }),
    ],
  });

  return _jsxs("section", {
    children: [
      hero,
      stats,
      notesPanel,
      _jsxs("div", { className: "panel-grid", children: [evaluatorPanel, toolPanel] }),
      sides,
    ],
  });
}
60
/**
 * One column of the comparison view: the headline facts, final output and a
 * compact trace for a single run.
 *
 * @param {{title: string, detail: object}} props - `detail` carries `run`,
 *   `traceEvents`, and optionally `agentVersion` and `errorDetail`.
 */
function RunSide(props) {
  const { title, detail } = props;
  const { run, agentVersion } = detail;

  // Renders a "<strong>Label:</strong> value..." paragraph.
  const field = (label, ...values) =>
    _jsxs("p", { children: [_jsx("strong", { children: label }), " ", ...values] });

  return _jsxs("section", {
    className: "panel",
    children: [
      _jsx("h2", { children: title }),
      field(
        "Run:",
        // The detail route decodes this path segment, so the id must be encoded here.
        _jsx("a", { href: `/runs/${encodeURIComponent(run.id)}`, children: run.id }),
      ),
      field("Status:", _jsx("span", { className: `pill ${run.status}`, children: run.status })),
      field("Score:", run.score),
      field("Runtime:", run.durationMs, "ms"),
      field("Termination:", run.terminationReason),
      field("Agent:", agentVersion?.label ?? "-"),
      field("Provider:", agentVersion?.provider ?? "-"),
      agentVersion?.modelId ? field("Model:", agentVersion.modelId) : null,
      agentVersion?.command
        ? field("Command:", agentVersion.command, " ", (agentVersion.args ?? []).join(" "))
        : null,
      detail.errorDetail ? field("Error:", detail.errorDetail) : null,
      _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }),
      _jsx("pre", { children: run.finalOutput || "(none)" }),
      _jsx("h3", { children: "Trace" }),
      _jsx("ol", {
        className: "timeline compact",
        children: detail.traceEvents.map((event) =>
          _jsx("li", { children: _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }) }, event.eventId),
        ),
      }),
    ],
  });
}
63
/**
 * A labelled figure for the stats row.
 *
 * @param {{label: *, value: *}} props - `label` is rendered muted above `value`.
 */
function Stat(props) {
  const caption = _jsx("div", { className: "muted", children: props.label });
  const figure = _jsx("div", { className: "stat-value", children: props.value });
  return _jsxs("div", { className: "stat", children: [caption, figure] });
}
66
/**
 * Placeholder section with a heading and one line of explanation.
 *
 * @param {{title: *, description: *}} props
 */
function EmptyState(props) {
  const heading = _jsx("h1", { children: props.title });
  const body = _jsx("p", { children: props.description });
  return _jsxs("section", { className: "empty", children: [heading, body] });
}
69
/**
 * Formats a number with an explicit "+" when it is positive; zero and negative
 * values are rendered as-is.
 *
 * @param {number} value
 * @returns {string}
 */
function signed(value) {
  const prefix = value > 0 ? "+" : "";
  return `${prefix}${value}`;
}
72
/**
 * Maps the current browser URL to a route descriptor.
 *
 * - `/runs/<id>` gives `{ type: "detail", runId }`, with the id percent-decoded.
 * - `/compare` gives `{ type: "compare", baseline, candidate }` from the query
 *   string; a missing parameter is `undefined`.
 * - anything else gives `{ type: "list" }`.
 *
 * @returns {{type: string, runId?: string, baseline?: string, candidate?: string}}
 */
function getRoute() {
  const { pathname, searchParams } = new URL(window.location.href);
  const detailPrefix = "/runs/";

  if (pathname.startsWith(detailPrefix)) {
    const rawRunId = pathname.slice(detailPrefix.length);
    let runId;
    try {
      runId = decodeURIComponent(rawRunId);
    } catch {
      // A malformed escape such as a stray "%" makes decodeURIComponent throw;
      // fall back to the raw segment so a bad link shows "not found" rather
      // than crashing the whole app during render.
      runId = rawRunId;
    }
    return { type: "detail", runId };
  }

  if (pathname === "/compare") {
    return {
      type: "compare",
      baseline: searchParams.get("baseline") ?? undefined,
      candidate: searchParams.get("candidate") ?? undefined,
    };
  }

  return { type: "list" };
}
@@ -0,0 +1,10 @@
1
+ import { jsx as _jsx } from "react/jsx-runtime";
2
+ import React from "react";
3
+ import { createRoot } from "react-dom/client";
4
+ import { App } from "./App.js";
5
+ import "./styles.css";
6
// Mount the app into the `#root` element; fail loudly when the host page lacks it.
const container = document.getElementById("root");
if (!container) {
  throw new Error("Missing root element.");
}

const strictApp = _jsx(React.StrictMode, { children: _jsx(App, {}) });
createRoot(container).render(strictApp);