agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/dist/storage.js
CHANGED
|
@@ -10,6 +10,10 @@ export class Storage {
|
|
|
10
10
|
ensureParentDir(DB_PATH);
|
|
11
11
|
this.db = new DatabaseSync(DB_PATH);
|
|
12
12
|
this.db.exec(`
|
|
13
|
+
PRAGMA journal_mode = WAL;
|
|
14
|
+
PRAGMA busy_timeout = 5000;
|
|
15
|
+
`);
|
|
16
|
+
this.db.exec(`
|
|
13
17
|
CREATE TABLE IF NOT EXISTS metadata (
|
|
14
18
|
key TEXT PRIMARY KEY,
|
|
15
19
|
value TEXT NOT NULL
|
|
@@ -35,6 +39,15 @@ export class Storage {
|
|
|
35
39
|
provider TEXT,
|
|
36
40
|
command TEXT,
|
|
37
41
|
args_json TEXT,
|
|
42
|
+
variant_set_name TEXT,
|
|
43
|
+
variant_label TEXT,
|
|
44
|
+
prompt_version TEXT,
|
|
45
|
+
model_version TEXT,
|
|
46
|
+
tool_schema_version TEXT,
|
|
47
|
+
config_label TEXT,
|
|
48
|
+
config_hash TEXT,
|
|
49
|
+
runtime_profile_name TEXT,
|
|
50
|
+
suite_definition_name TEXT,
|
|
38
51
|
config_json TEXT NOT NULL,
|
|
39
52
|
created_at TEXT NOT NULL
|
|
40
53
|
);
|
|
@@ -44,6 +57,16 @@ export class Storage {
|
|
|
44
57
|
scenario_id TEXT NOT NULL,
|
|
45
58
|
scenario_file_hash TEXT NOT NULL,
|
|
46
59
|
agent_version_id TEXT NOT NULL,
|
|
60
|
+
suite_batch_id TEXT,
|
|
61
|
+
variant_set_name TEXT,
|
|
62
|
+
variant_label TEXT,
|
|
63
|
+
prompt_version TEXT,
|
|
64
|
+
model_version TEXT,
|
|
65
|
+
tool_schema_version TEXT,
|
|
66
|
+
config_label TEXT,
|
|
67
|
+
config_hash TEXT,
|
|
68
|
+
runtime_profile_name TEXT,
|
|
69
|
+
suite_definition_name TEXT,
|
|
47
70
|
status TEXT NOT NULL,
|
|
48
71
|
termination_reason TEXT NOT NULL,
|
|
49
72
|
final_output TEXT NOT NULL,
|
|
@@ -95,6 +118,10 @@ export class Storage {
|
|
|
95
118
|
`);
|
|
96
119
|
this.ensureSchemaVersion();
|
|
97
120
|
this.ensureAgentVersionColumns();
|
|
121
|
+
this.ensureRunColumns();
|
|
122
|
+
}
|
|
123
|
+
close() {
|
|
124
|
+
this.db.close();
|
|
98
125
|
}
|
|
99
126
|
upsertScenario(summary, definition, filePath, fileHash) {
|
|
100
127
|
const now = new Date().toISOString();
|
|
@@ -115,25 +142,41 @@ export class Storage {
|
|
|
115
142
|
upsertAgentVersion(agentVersion) {
|
|
116
143
|
const now = new Date().toISOString();
|
|
117
144
|
this.db
|
|
118
|
-
.prepare(`INSERT INTO agent_versions (
|
|
119
|
-
|
|
145
|
+
.prepare(`INSERT INTO agent_versions (
|
|
146
|
+
id, label, model_id, provider, command, args_json,
|
|
147
|
+
variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
148
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
149
|
+
config_json, created_at
|
|
150
|
+
)
|
|
151
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
120
152
|
ON CONFLICT(id) DO UPDATE SET
|
|
121
153
|
label = excluded.label,
|
|
122
154
|
model_id = excluded.model_id,
|
|
123
155
|
provider = excluded.provider,
|
|
124
156
|
command = excluded.command,
|
|
125
157
|
args_json = excluded.args_json,
|
|
158
|
+
variant_set_name = excluded.variant_set_name,
|
|
159
|
+
variant_label = excluded.variant_label,
|
|
160
|
+
prompt_version = excluded.prompt_version,
|
|
161
|
+
model_version = excluded.model_version,
|
|
162
|
+
tool_schema_version = excluded.tool_schema_version,
|
|
163
|
+
config_label = excluded.config_label,
|
|
164
|
+
config_hash = excluded.config_hash,
|
|
165
|
+
runtime_profile_name = excluded.runtime_profile_name,
|
|
166
|
+
suite_definition_name = excluded.suite_definition_name,
|
|
126
167
|
config_json = excluded.config_json`)
|
|
127
|
-
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
|
|
168
|
+
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
|
|
128
169
|
}
|
|
129
170
|
saveRun(bundle) {
|
|
130
171
|
const run = bundle.run;
|
|
131
172
|
this.db
|
|
132
173
|
.prepare(`INSERT INTO runs (
|
|
133
174
|
id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
|
|
175
|
+
suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
176
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
134
177
|
total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
|
|
135
|
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
|
136
|
-
.run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
|
|
178
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
|
179
|
+
.run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
|
|
137
180
|
const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
|
|
138
181
|
VALUES (?, ?, ?, ?, ?, ?, ?)`);
|
|
139
182
|
const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
|
|
@@ -177,6 +220,8 @@ export class Storage {
|
|
|
177
220
|
const whereClause = clauses.length > 0 ? `WHERE ${clauses.join(" AND ")}` : "";
|
|
178
221
|
return this.db
|
|
179
222
|
.prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
|
|
223
|
+
r.suite_batch_id as suiteBatchId,
|
|
224
|
+
r.variant_set_name as variantSetName, r.variant_label as variantLabel,
|
|
180
225
|
av.label as agentLabel, av.provider, av.model_id as modelId,
|
|
181
226
|
r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
|
|
182
227
|
r.started_at as startedAt
|
|
@@ -238,6 +283,11 @@ export class Storage {
|
|
|
238
283
|
}));
|
|
239
284
|
const agentVersion = this.db
|
|
240
285
|
.prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
|
|
286
|
+
, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
287
|
+
prompt_version as promptVersion, model_version as modelVersion,
|
|
288
|
+
tool_schema_version as toolSchemaVersion, config_label as configLabel,
|
|
289
|
+
config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
290
|
+
suite_definition_name as suiteDefinitionName
|
|
241
291
|
FROM agent_versions WHERE id = ?`)
|
|
242
292
|
.get(run.agentVersionId);
|
|
243
293
|
return {
|
|
@@ -253,6 +303,15 @@ export class Storage {
|
|
|
253
303
|
provider: agentVersion.provider ?? undefined,
|
|
254
304
|
command: agentVersion.command ?? undefined,
|
|
255
305
|
args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
|
|
306
|
+
variantSetName: agentVersion.variantSetName ?? undefined,
|
|
307
|
+
variantLabel: agentVersion.variantLabel ?? undefined,
|
|
308
|
+
promptVersion: agentVersion.promptVersion ?? undefined,
|
|
309
|
+
modelVersion: agentVersion.modelVersion ?? undefined,
|
|
310
|
+
toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
|
|
311
|
+
configLabel: agentVersion.configLabel ?? undefined,
|
|
312
|
+
configHash: agentVersion.configHash ?? undefined,
|
|
313
|
+
runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
|
|
314
|
+
suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
|
|
256
315
|
config: JSON.parse(agentVersion.config_json),
|
|
257
316
|
}
|
|
258
317
|
: undefined,
|
|
@@ -267,46 +326,85 @@ export class Storage {
|
|
|
267
326
|
if (!candidate) {
|
|
268
327
|
throw new Error(`Run '${candidateRunId}' not found.`);
|
|
269
328
|
}
|
|
270
|
-
|
|
271
|
-
|
|
329
|
+
return compareRunBundles(baseline, candidate);
|
|
330
|
+
}
|
|
331
|
+
compareSuites(baselineBatchId, candidateBatchId) {
|
|
332
|
+
const baselineRuns = this.getRunsBySuiteBatchId(baselineBatchId);
|
|
333
|
+
const candidateRuns = this.getRunsBySuiteBatchId(candidateBatchId);
|
|
334
|
+
if (baselineRuns.length === 0) {
|
|
335
|
+
throw new Error(`No runs found for suite batch '${baselineBatchId}'.`);
|
|
272
336
|
}
|
|
273
|
-
if (
|
|
274
|
-
throw new Error(
|
|
337
|
+
if (candidateRuns.length === 0) {
|
|
338
|
+
throw new Error(`No runs found for suite batch '${candidateBatchId}'.`);
|
|
275
339
|
}
|
|
276
|
-
const
|
|
277
|
-
|
|
278
|
-
|
|
340
|
+
const baselineSuites = new Set(baselineRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
|
|
341
|
+
const candidateSuites = new Set(candidateRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
|
|
342
|
+
if (baselineSuites.size !== 1) {
|
|
343
|
+
throw new Error(`Suite batch '${baselineBatchId}' contains runs from multiple suites.`);
|
|
279
344
|
}
|
|
280
|
-
if (
|
|
281
|
-
|
|
345
|
+
if (candidateSuites.size !== 1) {
|
|
346
|
+
throw new Error(`Suite batch '${candidateBatchId}' contains runs from multiple suites.`);
|
|
282
347
|
}
|
|
283
|
-
|
|
284
|
-
|
|
348
|
+
const suite = [...baselineSuites][0] ?? "unknown";
|
|
349
|
+
const candidateSuite = [...candidateSuites][0] ?? "unknown";
|
|
350
|
+
if (suite !== candidateSuite) {
|
|
351
|
+
throw new Error(`Suite batches can only be compared when they share the same suite. Got '${suite}' and '${candidateSuite}'.`);
|
|
285
352
|
}
|
|
286
|
-
|
|
287
|
-
|
|
353
|
+
const baselineMap = new Map(baselineRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
|
|
354
|
+
const candidateMap = new Map(candidateRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
|
|
355
|
+
const sharedScenarioIds = [...baselineMap.keys()].filter((scenarioId) => candidateMap.has(scenarioId)).sort();
|
|
356
|
+
const comparisons = sharedScenarioIds.map((scenarioId) => ({
|
|
357
|
+
scenarioId,
|
|
358
|
+
comparison: compareRunBundles(baselineMap.get(scenarioId), candidateMap.get(scenarioId)),
|
|
359
|
+
}));
|
|
360
|
+
const regressions = comparisons.filter((entry) => entry.comparison.classification === "regressed");
|
|
361
|
+
const improvements = comparisons.filter((entry) => entry.comparison.classification === "improved");
|
|
362
|
+
const unchanged = comparisons.filter((entry) => !["regressed", "improved"].includes(entry.comparison.classification));
|
|
363
|
+
const baselineStats = summarizeRuns(baselineRuns);
|
|
364
|
+
const candidateStats = summarizeRuns(candidateRuns);
|
|
365
|
+
const missingFromCandidate = [...baselineMap.keys()].filter((scenarioId) => !candidateMap.has(scenarioId)).sort();
|
|
366
|
+
const missingFromBaseline = [...candidateMap.keys()].filter((scenarioId) => !baselineMap.has(scenarioId)).sort();
|
|
367
|
+
const notes = [];
|
|
368
|
+
if (regressions.length > 0) {
|
|
369
|
+
notes.push(`${regressions.length} scenario regressions detected.`);
|
|
288
370
|
}
|
|
289
|
-
if (
|
|
290
|
-
notes.push(
|
|
371
|
+
if (improvements.length > 0) {
|
|
372
|
+
notes.push(`${improvements.length} scenario improvements detected.`);
|
|
373
|
+
}
|
|
374
|
+
if (missingFromCandidate.length > 0) {
|
|
375
|
+
notes.push(`${missingFromCandidate.length} scenarios missing from candidate batch.`);
|
|
376
|
+
}
|
|
377
|
+
if (missingFromBaseline.length > 0) {
|
|
378
|
+
notes.push(`${missingFromBaseline.length} scenarios missing from baseline batch.`);
|
|
291
379
|
}
|
|
292
|
-
const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
|
|
293
|
-
const toolDiffs = buildToolDiffs(baseline, candidate);
|
|
294
380
|
return {
|
|
295
|
-
|
|
296
|
-
|
|
381
|
+
suite,
|
|
382
|
+
baselineBatchId,
|
|
383
|
+
candidateBatchId,
|
|
384
|
+
classification: regressions.length > 0 ? "regressed" : improvements.length > 0 ? "improved" : notes.length > 0 ? "mixed" : "unchanged",
|
|
297
385
|
notes,
|
|
298
386
|
deltas: {
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
387
|
+
pass: candidateStats.pass - baselineStats.pass,
|
|
388
|
+
fail: candidateStats.fail - baselineStats.fail,
|
|
389
|
+
error: candidateStats.error - baselineStats.error,
|
|
390
|
+
averageScore: candidateStats.averageScore - baselineStats.averageScore,
|
|
391
|
+
averageRuntimeMs: candidateStats.averageRuntimeMs - baselineStats.averageRuntimeMs,
|
|
392
|
+
averageSteps: candidateStats.averageSteps - baselineStats.averageSteps,
|
|
302
393
|
},
|
|
303
|
-
|
|
304
|
-
|
|
394
|
+
regressions,
|
|
395
|
+
improvements,
|
|
396
|
+
unchanged,
|
|
397
|
+
missingFromCandidate,
|
|
398
|
+
missingFromBaseline,
|
|
305
399
|
};
|
|
306
400
|
}
|
|
307
401
|
getRunRecord(runId) {
|
|
308
402
|
return (this.db
|
|
309
403
|
.prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
|
|
404
|
+
suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
405
|
+
prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
|
|
406
|
+
config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
407
|
+
suite_definition_name as suiteDefinitionName,
|
|
310
408
|
status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
|
|
311
409
|
total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
|
|
312
410
|
total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
|
|
@@ -346,6 +444,75 @@ export class Storage {
|
|
|
346
444
|
if (!names.has("args_json")) {
|
|
347
445
|
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
|
|
348
446
|
}
|
|
447
|
+
if (!names.has("variant_set_name")) {
|
|
448
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
|
|
449
|
+
}
|
|
450
|
+
if (!names.has("variant_label")) {
|
|
451
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
|
|
452
|
+
}
|
|
453
|
+
if (!names.has("prompt_version")) {
|
|
454
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
|
|
455
|
+
}
|
|
456
|
+
if (!names.has("model_version")) {
|
|
457
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
|
|
458
|
+
}
|
|
459
|
+
if (!names.has("tool_schema_version")) {
|
|
460
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
|
|
461
|
+
}
|
|
462
|
+
if (!names.has("config_label")) {
|
|
463
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
|
|
464
|
+
}
|
|
465
|
+
if (!names.has("config_hash")) {
|
|
466
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
|
|
467
|
+
}
|
|
468
|
+
if (!names.has("runtime_profile_name")) {
|
|
469
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
|
|
470
|
+
}
|
|
471
|
+
if (!names.has("suite_definition_name")) {
|
|
472
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
ensureRunColumns() {
|
|
476
|
+
const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
|
|
477
|
+
const names = new Set(columns.map((column) => column.name));
|
|
478
|
+
if (!names.has("suite_batch_id")) {
|
|
479
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
|
|
480
|
+
}
|
|
481
|
+
if (!names.has("variant_set_name")) {
|
|
482
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
|
|
483
|
+
}
|
|
484
|
+
if (!names.has("variant_label")) {
|
|
485
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
|
|
486
|
+
}
|
|
487
|
+
if (!names.has("prompt_version")) {
|
|
488
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
|
|
489
|
+
}
|
|
490
|
+
if (!names.has("model_version")) {
|
|
491
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
|
|
492
|
+
}
|
|
493
|
+
if (!names.has("tool_schema_version")) {
|
|
494
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
|
|
495
|
+
}
|
|
496
|
+
if (!names.has("config_label")) {
|
|
497
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
|
|
498
|
+
}
|
|
499
|
+
if (!names.has("config_hash")) {
|
|
500
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
|
|
501
|
+
}
|
|
502
|
+
if (!names.has("runtime_profile_name")) {
|
|
503
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
|
|
504
|
+
}
|
|
505
|
+
if (!names.has("suite_definition_name")) {
|
|
506
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
getRunsBySuiteBatchId(suiteBatchId) {
|
|
510
|
+
const runIds = this.db
|
|
511
|
+
.prepare(`SELECT id FROM runs WHERE suite_batch_id = ? ORDER BY scenario_id ASC`)
|
|
512
|
+
.all(suiteBatchId);
|
|
513
|
+
return runIds
|
|
514
|
+
.map((row) => this.getRun(row.id))
|
|
515
|
+
.filter((bundle) => bundle !== null);
|
|
349
516
|
}
|
|
350
517
|
}
|
|
351
518
|
function buildEvaluatorDiffs(baseline, candidate) {
|
|
@@ -361,14 +528,18 @@ function buildEvaluatorDiffs(baseline, candidate) {
|
|
|
361
528
|
if (baselineResult?.status === candidateResult?.status) {
|
|
362
529
|
return null;
|
|
363
530
|
}
|
|
531
|
+
const hardGate = baselineResult?.mode === "hard_gate" || candidateResult?.mode === "hard_gate";
|
|
364
532
|
return {
|
|
365
533
|
evaluatorId,
|
|
534
|
+
hardGate,
|
|
535
|
+
weight: candidateResult?.weight ?? baselineResult?.weight,
|
|
366
536
|
baselineStatus: baselineResult?.status,
|
|
367
537
|
candidateStatus: candidateResult?.status,
|
|
368
538
|
note: `Evaluator '${evaluatorId}' changed: ${baselineResult?.status ?? "missing"} -> ${candidateResult?.status ?? "missing"}`,
|
|
369
539
|
};
|
|
370
540
|
})
|
|
371
|
-
.filter((diff) => diff !== null)
|
|
541
|
+
.filter((diff) => diff !== null)
|
|
542
|
+
.sort((left, right) => Number(right.hardGate) - Number(left.hardGate) || left.evaluatorId.localeCompare(right.evaluatorId));
|
|
372
543
|
}
|
|
373
544
|
function buildToolDiffs(baseline, candidate) {
|
|
374
545
|
const toolNames = new Set([
|
|
@@ -383,12 +554,115 @@ function buildToolDiffs(baseline, candidate) {
|
|
|
383
554
|
if (baselineCount === candidateCount) {
|
|
384
555
|
return null;
|
|
385
556
|
}
|
|
386
|
-
|
|
557
|
+
const diff = {
|
|
387
558
|
toolName,
|
|
388
559
|
baselineCount,
|
|
389
560
|
candidateCount,
|
|
561
|
+
risk: baselineCount === 0 && candidateCount > 0 ? "new_tool" : "none",
|
|
390
562
|
note: `Tool '${toolName}' usage changed: ${baselineCount} -> ${candidateCount}`,
|
|
391
563
|
};
|
|
564
|
+
return diff;
|
|
392
565
|
})
|
|
393
566
|
.filter((diff) => diff !== null);
|
|
394
567
|
}
|
|
568
|
+
/**
 * Builds the comparison report for two runs of the same scenario.
 *
 * @param {object} baseline - Reference run bundle; its `run` record is read.
 * @param {object} candidate - Run bundle being evaluated against the baseline.
 * @returns {object} Report containing both bundles, the classification from
 *   `classifyComparison`, verdict/termination deltas, an `outputChanged` flag,
 *   human-readable notes, numeric deltas, and evaluator/tool diffs.
 * @throws {Error} When the two runs differ in scenario id or in scenario
 *   file hash.
 */
function compareRunBundles(baseline, candidate) {
    const before = baseline.run;
    const after = candidate.run;
    if (before.scenarioId !== after.scenarioId) {
        throw new Error("Runs can only be compared when they share the same scenario id.");
    }
    if (before.scenarioFileHash !== after.scenarioFileHash) {
        throw new Error("Runs can only be compared when they share the same scenario file hash.");
    }
    const verdictDelta = `${before.status} -> ${after.status}`;
    const terminationDelta = before.terminationReason === after.terminationReason
        ? undefined
        : `${before.terminationReason} -> ${after.terminationReason}`;
    // One [changed?, message] pair per tracked field, in reporting order.
    const fieldChanges = [
        [before.status !== after.status, `Verdict changed: ${verdictDelta}`],
        [before.score !== after.score, `Score changed: ${before.score} -> ${after.score}`],
        [before.totalSteps !== after.totalSteps, `Steps changed: ${before.totalSteps} -> ${after.totalSteps}`],
        [before.durationMs !== after.durationMs, `Runtime changed: ${before.durationMs}ms -> ${after.durationMs}ms`],
        [before.terminationReason !== after.terminationReason, `Termination changed: ${before.terminationReason} -> ${after.terminationReason}`],
    ];
    const notes = fieldChanges.filter(([changed]) => changed).map(([, message]) => message);
    const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
    const toolDiffs = buildToolDiffs(baseline, candidate);
    // Only a pass -> fail flip on a hard-gate evaluator counts here.
    const hardGateRegression = evaluatorDiffs.some((diff) => diff.hardGate && diff.baselineStatus === "pass" && diff.candidateStatus === "fail");
    const scoreDelta = after.score - before.score;
    const runtimeDeltaMs = after.durationMs - before.durationMs;
    const stepDelta = after.totalSteps - before.totalSteps;
    // A zero-duration baseline would divide by zero; report 0% instead.
    let runtimePct = 0;
    if (before.durationMs !== 0) {
        runtimePct = Math.round((runtimeDeltaMs / before.durationMs) * 100);
    }
    const outputChanged = before.finalOutput !== after.finalOutput;
    if (outputChanged) {
        notes.push("Final output changed.");
    }
    const classification = classifyComparison({
        baselineStatus: before.status,
        candidateStatus: after.status,
        scoreDelta,
        runtimePct,
        stepDelta,
        hardGateRegression,
    });
    return {
        baseline,
        candidate,
        classification,
        verdictDelta,
        terminationDelta,
        outputChanged,
        notes,
        deltas: {
            score: scoreDelta,
            runtimeMs: runtimeDeltaMs,
            steps: stepDelta,
            runtimePct,
        },
        evaluatorDiffs,
        toolDiffs,
    };
}
|
|
630
|
+
/**
 * Classifies a baseline/candidate run pair.
 *
 * Rules are evaluated top to bottom; the first match wins:
 *  1. baseline passed and the candidate did not pass or degraded -> "regressed"
 *  2. baseline did not pass, candidate passed                    -> "improved"
 *  3. both passed, score not lower, within limits                -> "unchanged_pass"
 *  4. both failed                                                -> "unchanged_fail"
 *  5. neither passed but the score went up                       -> "improved"
 *  6. degraded                                                   -> "regressed"
 *  7. anything else                                              -> "changed_non_terminal"
 *
 * "Degraded" means a hard-gate regression, a score drop larger than
 * `maxScoreDrop`, a runtime increase above `maxRuntimePct` percent, or more
 * than `maxStepIncrease` extra steps.
 *
 * @param {object} input
 * @param {string} input.baselineStatus - Status of the baseline run.
 * @param {string} input.candidateStatus - Status of the candidate run.
 * @param {number} input.scoreDelta - Candidate score minus baseline score.
 * @param {number} input.runtimePct - Runtime change in percent of the baseline.
 * @param {number} input.stepDelta - Candidate steps minus baseline steps.
 * @param {boolean} input.hardGateRegression - A hard-gate evaluator regressed.
 * @param {object} [thresholds] - Optional overrides; omitted keys keep the
 *   defaults `maxScoreDrop: 5`, `maxRuntimePct: 25`, `maxStepIncrease: 2`.
 * @returns {"regressed"|"improved"|"unchanged_pass"|"unchanged_fail"|"changed_non_terminal"}
 */
function classifyComparison(input, thresholds) {
    const { maxScoreDrop = 5, maxRuntimePct = 25, maxStepIncrease = 2 } = thresholds ?? {};
    const { baselineStatus, candidateStatus, scoreDelta, runtimePct, stepDelta, hardGateRegression } = input;
    const baselinePassed = baselineStatus === "pass";
    const candidatePassed = candidateStatus === "pass";
    // Single definition of "degraded", shared by rules 1 and 6.
    const degraded = Boolean(hardGateRegression) ||
        scoreDelta < -maxScoreDrop ||
        runtimePct > maxRuntimePct ||
        stepDelta > maxStepIncrease;
    if (baselinePassed && (!candidatePassed || degraded)) {
        return "regressed";
    }
    if (!baselinePassed && candidatePassed) {
        return "improved";
    }
    // Kept as explicit <= / >= checks so NaN deltas do not count as "unchanged".
    if (baselinePassed &&
        candidatePassed &&
        scoreDelta >= 0 &&
        runtimePct <= maxRuntimePct &&
        stepDelta <= maxStepIncrease &&
        !hardGateRegression) {
        return "unchanged_pass";
    }
    if (baselineStatus === candidateStatus && baselineStatus === "fail") {
        return "unchanged_fail";
    }
    if (!baselinePassed && !candidatePassed && scoreDelta > 0) {
        return "improved";
    }
    if (degraded) {
        return "regressed";
    }
    return "changed_non_terminal";
}
|
|
657
|
+
/**
 * Rounded arithmetic mean of one numeric field of `bundle.run` across runs.
 * Returns 0 for an empty list so callers never divide by zero.
 */
function averageRunMetric(runs, metric) {
    if (runs.length === 0) {
        return 0;
    }
    let total = 0;
    for (const bundle of runs) {
        total += bundle.run[metric];
    }
    return Math.round(total / runs.length);
}

/**
 * Aggregates a list of run bundles into status counts and rounded averages.
 *
 * Runs whose status is not "pass", "fail" or "error" are not counted in any
 * status bucket but still contribute to the averages.
 *
 * @param {Array<{run: {status: string, score: number, durationMs: number, totalSteps: number}}>} runs
 * @returns {{pass: number, fail: number, error: number, averageScore: number,
 *   averageRuntimeMs: number, averageSteps: number}} Averages are rounded to
 *   the nearest integer and are 0 when `runs` is empty.
 */
function summarizeRuns(runs) {
    let pass = 0;
    let fail = 0;
    let error = 0;
    for (const bundle of runs) {
        const status = bundle.run.status;
        if (status === "pass") {
            pass += 1;
        }
        else if (status === "fail") {
            fail += 1;
        }
        else if (status === "error") {
            error += 1;
        }
    }
    return {
        pass,
        fail,
        error,
        averageScore: averageRunMetric(runs, "score"),
        averageRuntimeMs: averageRunMetric(runs, "durationMs"),
        averageSteps: averageRunMetric(runs, "totalSteps"),
    };
}
|
|
666
|
+
/**
 * Derives the suite name from a scenario id: the text before the first ".".
 *
 * `String.prototype.split` always yields at least one element, so a nullish
 * check on the first segment can never fire. The fallback is therefore
 * applied when the segment is empty (empty id, or an id starting with ".")
 * or when the id is not a string at all.
 *
 * @param {string} scenarioId - Scenario identifier such as "suite.case-name".
 * @returns {string} The leading segment, or "unknown" when there is none.
 */
function deriveSuiteName(scenarioId) {
    if (typeof scenarioId !== "string") {
        return "unknown";
    }
    const [prefix] = scenarioId.split(".");
    return prefix === "" ? "unknown" : prefix;
}
|