agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/storage.js CHANGED
@@ -10,6 +10,10 @@ export class Storage {
10
10
  ensureParentDir(DB_PATH);
11
11
  this.db = new DatabaseSync(DB_PATH);
12
12
  this.db.exec(`
13
+ PRAGMA journal_mode = WAL;
14
+ PRAGMA busy_timeout = 5000;
15
+ `);
16
+ this.db.exec(`
13
17
  CREATE TABLE IF NOT EXISTS metadata (
14
18
  key TEXT PRIMARY KEY,
15
19
  value TEXT NOT NULL
@@ -35,6 +39,15 @@ export class Storage {
35
39
  provider TEXT,
36
40
  command TEXT,
37
41
  args_json TEXT,
42
+ variant_set_name TEXT,
43
+ variant_label TEXT,
44
+ prompt_version TEXT,
45
+ model_version TEXT,
46
+ tool_schema_version TEXT,
47
+ config_label TEXT,
48
+ config_hash TEXT,
49
+ runtime_profile_name TEXT,
50
+ suite_definition_name TEXT,
38
51
  config_json TEXT NOT NULL,
39
52
  created_at TEXT NOT NULL
40
53
  );
@@ -44,6 +57,16 @@ export class Storage {
44
57
  scenario_id TEXT NOT NULL,
45
58
  scenario_file_hash TEXT NOT NULL,
46
59
  agent_version_id TEXT NOT NULL,
60
+ suite_batch_id TEXT,
61
+ variant_set_name TEXT,
62
+ variant_label TEXT,
63
+ prompt_version TEXT,
64
+ model_version TEXT,
65
+ tool_schema_version TEXT,
66
+ config_label TEXT,
67
+ config_hash TEXT,
68
+ runtime_profile_name TEXT,
69
+ suite_definition_name TEXT,
47
70
  status TEXT NOT NULL,
48
71
  termination_reason TEXT NOT NULL,
49
72
  final_output TEXT NOT NULL,
@@ -95,6 +118,10 @@ export class Storage {
95
118
  `);
96
119
  this.ensureSchemaVersion();
97
120
  this.ensureAgentVersionColumns();
121
+ this.ensureRunColumns();
122
+ }
123
+ close() {
124
+ this.db.close();
98
125
  }
99
126
  upsertScenario(summary, definition, filePath, fileHash) {
100
127
  const now = new Date().toISOString();
@@ -115,25 +142,41 @@ export class Storage {
115
142
  upsertAgentVersion(agentVersion) {
116
143
  const now = new Date().toISOString();
117
144
  this.db
118
- .prepare(`INSERT INTO agent_versions (id, label, model_id, provider, command, args_json, config_json, created_at)
119
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
145
+ .prepare(`INSERT INTO agent_versions (
146
+ id, label, model_id, provider, command, args_json,
147
+ variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
148
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
149
+ config_json, created_at
150
+ )
151
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
120
152
  ON CONFLICT(id) DO UPDATE SET
121
153
  label = excluded.label,
122
154
  model_id = excluded.model_id,
123
155
  provider = excluded.provider,
124
156
  command = excluded.command,
125
157
  args_json = excluded.args_json,
158
+ variant_set_name = excluded.variant_set_name,
159
+ variant_label = excluded.variant_label,
160
+ prompt_version = excluded.prompt_version,
161
+ model_version = excluded.model_version,
162
+ tool_schema_version = excluded.tool_schema_version,
163
+ config_label = excluded.config_label,
164
+ config_hash = excluded.config_hash,
165
+ runtime_profile_name = excluded.runtime_profile_name,
166
+ suite_definition_name = excluded.suite_definition_name,
126
167
  config_json = excluded.config_json`)
127
- .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
168
+ .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
128
169
  }
129
170
  saveRun(bundle) {
130
171
  const run = bundle.run;
131
172
  this.db
132
173
  .prepare(`INSERT INTO runs (
133
174
  id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
175
+ suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
176
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
134
177
  total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
135
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
136
- .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
178
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
179
+ .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
137
180
  const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
138
181
  VALUES (?, ?, ?, ?, ?, ?, ?)`);
139
182
  const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
@@ -177,6 +220,8 @@ export class Storage {
177
220
  const whereClause = clauses.length > 0 ? `WHERE ${clauses.join(" AND ")}` : "";
178
221
  return this.db
179
222
  .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
223
+ r.suite_batch_id as suiteBatchId,
224
+ r.variant_set_name as variantSetName, r.variant_label as variantLabel,
180
225
  av.label as agentLabel, av.provider, av.model_id as modelId,
181
226
  r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
182
227
  r.started_at as startedAt
@@ -238,6 +283,11 @@ export class Storage {
238
283
  }));
239
284
  const agentVersion = this.db
240
285
  .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
286
+ , variant_set_name as variantSetName, variant_label as variantLabel,
287
+ prompt_version as promptVersion, model_version as modelVersion,
288
+ tool_schema_version as toolSchemaVersion, config_label as configLabel,
289
+ config_hash as configHash, runtime_profile_name as runtimeProfileName,
290
+ suite_definition_name as suiteDefinitionName
241
291
  FROM agent_versions WHERE id = ?`)
242
292
  .get(run.agentVersionId);
243
293
  return {
@@ -253,6 +303,15 @@ export class Storage {
253
303
  provider: agentVersion.provider ?? undefined,
254
304
  command: agentVersion.command ?? undefined,
255
305
  args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
306
+ variantSetName: agentVersion.variantSetName ?? undefined,
307
+ variantLabel: agentVersion.variantLabel ?? undefined,
308
+ promptVersion: agentVersion.promptVersion ?? undefined,
309
+ modelVersion: agentVersion.modelVersion ?? undefined,
310
+ toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
311
+ configLabel: agentVersion.configLabel ?? undefined,
312
+ configHash: agentVersion.configHash ?? undefined,
313
+ runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
314
+ suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
256
315
  config: JSON.parse(agentVersion.config_json),
257
316
  }
258
317
  : undefined,
@@ -267,46 +326,85 @@ export class Storage {
267
326
  if (!candidate) {
268
327
  throw new Error(`Run '${candidateRunId}' not found.`);
269
328
  }
270
- if (baseline.run.scenarioId !== candidate.run.scenarioId) {
271
- throw new Error("Runs can only be compared when they share the same scenario id.");
329
+ return compareRunBundles(baseline, candidate);
330
+ }
331
+ compareSuites(baselineBatchId, candidateBatchId) {
332
+ const baselineRuns = this.getRunsBySuiteBatchId(baselineBatchId);
333
+ const candidateRuns = this.getRunsBySuiteBatchId(candidateBatchId);
334
+ if (baselineRuns.length === 0) {
335
+ throw new Error(`No runs found for suite batch '${baselineBatchId}'.`);
272
336
  }
273
- if (baseline.run.scenarioFileHash !== candidate.run.scenarioFileHash) {
274
- throw new Error("Runs can only be compared when they share the same scenario file hash.");
337
+ if (candidateRuns.length === 0) {
338
+ throw new Error(`No runs found for suite batch '${candidateBatchId}'.`);
275
339
  }
276
- const notes = [];
277
- if (baseline.run.status !== candidate.run.status) {
278
- notes.push(`Verdict changed: ${baseline.run.status} -> ${candidate.run.status}`);
340
+ const baselineSuites = new Set(baselineRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
341
+ const candidateSuites = new Set(candidateRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
342
+ if (baselineSuites.size !== 1) {
343
+ throw new Error(`Suite batch '${baselineBatchId}' contains runs from multiple suites.`);
279
344
  }
280
- if (baseline.run.score !== candidate.run.score) {
281
- notes.push(`Score changed: ${baseline.run.score} -> ${candidate.run.score}`);
345
+ if (candidateSuites.size !== 1) {
346
+ throw new Error(`Suite batch '${candidateBatchId}' contains runs from multiple suites.`);
282
347
  }
283
- if (baseline.run.totalSteps !== candidate.run.totalSteps) {
284
- notes.push(`Steps changed: ${baseline.run.totalSteps} -> ${candidate.run.totalSteps}`);
348
+ const suite = [...baselineSuites][0] ?? "unknown";
349
+ const candidateSuite = [...candidateSuites][0] ?? "unknown";
350
+ if (suite !== candidateSuite) {
351
+ throw new Error(`Suite batches can only be compared when they share the same suite. Got '${suite}' and '${candidateSuite}'.`);
285
352
  }
286
- if (baseline.run.durationMs !== candidate.run.durationMs) {
287
- notes.push(`Runtime changed: ${baseline.run.durationMs}ms -> ${candidate.run.durationMs}ms`);
353
+ const baselineMap = new Map(baselineRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
354
+ const candidateMap = new Map(candidateRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
355
+ const sharedScenarioIds = [...baselineMap.keys()].filter((scenarioId) => candidateMap.has(scenarioId)).sort();
356
+ const comparisons = sharedScenarioIds.map((scenarioId) => ({
357
+ scenarioId,
358
+ comparison: compareRunBundles(baselineMap.get(scenarioId), candidateMap.get(scenarioId)),
359
+ }));
360
+ const regressions = comparisons.filter((entry) => entry.comparison.classification === "regressed");
361
+ const improvements = comparisons.filter((entry) => entry.comparison.classification === "improved");
362
+ const unchanged = comparisons.filter((entry) => !["regressed", "improved"].includes(entry.comparison.classification));
363
+ const baselineStats = summarizeRuns(baselineRuns);
364
+ const candidateStats = summarizeRuns(candidateRuns);
365
+ const missingFromCandidate = [...baselineMap.keys()].filter((scenarioId) => !candidateMap.has(scenarioId)).sort();
366
+ const missingFromBaseline = [...candidateMap.keys()].filter((scenarioId) => !baselineMap.has(scenarioId)).sort();
367
+ const notes = [];
368
+ if (regressions.length > 0) {
369
+ notes.push(`${regressions.length} scenario regressions detected.`);
288
370
  }
289
- if (baseline.run.terminationReason !== candidate.run.terminationReason) {
290
- notes.push(`Termination changed: ${baseline.run.terminationReason} -> ${candidate.run.terminationReason}`);
371
+ if (improvements.length > 0) {
372
+ notes.push(`${improvements.length} scenario improvements detected.`);
373
+ }
374
+ if (missingFromCandidate.length > 0) {
375
+ notes.push(`${missingFromCandidate.length} scenarios missing from candidate batch.`);
376
+ }
377
+ if (missingFromBaseline.length > 0) {
378
+ notes.push(`${missingFromBaseline.length} scenarios missing from baseline batch.`);
291
379
  }
292
- const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
293
- const toolDiffs = buildToolDiffs(baseline, candidate);
294
380
  return {
295
- baseline,
296
- candidate,
381
+ suite,
382
+ baselineBatchId,
383
+ candidateBatchId,
384
+ classification: regressions.length > 0 ? "regressed" : improvements.length > 0 ? "improved" : notes.length > 0 ? "mixed" : "unchanged",
297
385
  notes,
298
386
  deltas: {
299
- score: candidate.run.score - baseline.run.score,
300
- runtimeMs: candidate.run.durationMs - baseline.run.durationMs,
301
- steps: candidate.run.totalSteps - baseline.run.totalSteps,
387
+ pass: candidateStats.pass - baselineStats.pass,
388
+ fail: candidateStats.fail - baselineStats.fail,
389
+ error: candidateStats.error - baselineStats.error,
390
+ averageScore: candidateStats.averageScore - baselineStats.averageScore,
391
+ averageRuntimeMs: candidateStats.averageRuntimeMs - baselineStats.averageRuntimeMs,
392
+ averageSteps: candidateStats.averageSteps - baselineStats.averageSteps,
302
393
  },
303
- evaluatorDiffs,
304
- toolDiffs,
394
+ regressions,
395
+ improvements,
396
+ unchanged,
397
+ missingFromCandidate,
398
+ missingFromBaseline,
305
399
  };
306
400
  }
307
401
  getRunRecord(runId) {
308
402
  return (this.db
309
403
  .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
404
+ suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
405
+ prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
406
+ config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
407
+ suite_definition_name as suiteDefinitionName,
310
408
  status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
311
409
  total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
312
410
  total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
@@ -346,6 +444,75 @@ export class Storage {
346
444
  if (!names.has("args_json")) {
347
445
  this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
348
446
  }
447
+ if (!names.has("variant_set_name")) {
448
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
449
+ }
450
+ if (!names.has("variant_label")) {
451
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
452
+ }
453
+ if (!names.has("prompt_version")) {
454
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
455
+ }
456
+ if (!names.has("model_version")) {
457
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
458
+ }
459
+ if (!names.has("tool_schema_version")) {
460
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
461
+ }
462
+ if (!names.has("config_label")) {
463
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
464
+ }
465
+ if (!names.has("config_hash")) {
466
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
467
+ }
468
+ if (!names.has("runtime_profile_name")) {
469
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
470
+ }
471
+ if (!names.has("suite_definition_name")) {
472
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
473
+ }
474
+ }
475
+ ensureRunColumns() {
476
+ const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
477
+ const names = new Set(columns.map((column) => column.name));
478
+ if (!names.has("suite_batch_id")) {
479
+ this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
480
+ }
481
+ if (!names.has("variant_set_name")) {
482
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
483
+ }
484
+ if (!names.has("variant_label")) {
485
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
486
+ }
487
+ if (!names.has("prompt_version")) {
488
+ this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
489
+ }
490
+ if (!names.has("model_version")) {
491
+ this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
492
+ }
493
+ if (!names.has("tool_schema_version")) {
494
+ this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
495
+ }
496
+ if (!names.has("config_label")) {
497
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
498
+ }
499
+ if (!names.has("config_hash")) {
500
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
501
+ }
502
+ if (!names.has("runtime_profile_name")) {
503
+ this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
504
+ }
505
+ if (!names.has("suite_definition_name")) {
506
+ this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
507
+ }
508
+ }
509
+ getRunsBySuiteBatchId(suiteBatchId) {
510
+ const runIds = this.db
511
+ .prepare(`SELECT id FROM runs WHERE suite_batch_id = ? ORDER BY scenario_id ASC`)
512
+ .all(suiteBatchId);
513
+ return runIds
514
+ .map((row) => this.getRun(row.id))
515
+ .filter((bundle) => bundle !== null);
349
516
  }
350
517
  }
351
518
  function buildEvaluatorDiffs(baseline, candidate) {
@@ -361,14 +528,18 @@ function buildEvaluatorDiffs(baseline, candidate) {
361
528
  if (baselineResult?.status === candidateResult?.status) {
362
529
  return null;
363
530
  }
531
+ const hardGate = baselineResult?.mode === "hard_gate" || candidateResult?.mode === "hard_gate";
364
532
  return {
365
533
  evaluatorId,
534
+ hardGate,
535
+ weight: candidateResult?.weight ?? baselineResult?.weight,
366
536
  baselineStatus: baselineResult?.status,
367
537
  candidateStatus: candidateResult?.status,
368
538
  note: `Evaluator '${evaluatorId}' changed: ${baselineResult?.status ?? "missing"} -> ${candidateResult?.status ?? "missing"}`,
369
539
  };
370
540
  })
371
- .filter((diff) => diff !== null);
541
+ .filter((diff) => diff !== null)
542
+ .sort((left, right) => Number(right.hardGate) - Number(left.hardGate) || left.evaluatorId.localeCompare(right.evaluatorId));
372
543
  }
373
544
  function buildToolDiffs(baseline, candidate) {
374
545
  const toolNames = new Set([
@@ -383,12 +554,115 @@ function buildToolDiffs(baseline, candidate) {
383
554
  if (baselineCount === candidateCount) {
384
555
  return null;
385
556
  }
386
- return {
557
+ const diff = {
387
558
  toolName,
388
559
  baselineCount,
389
560
  candidateCount,
561
+ risk: baselineCount === 0 && candidateCount > 0 ? "new_tool" : "none",
390
562
  note: `Tool '${toolName}' usage changed: ${baselineCount} -> ${candidateCount}`,
391
563
  };
564
+ return diff;
392
565
  })
393
566
  .filter((diff) => diff !== null);
394
567
  }
568
/**
 * Builds the comparison report for two run bundles of the same scenario.
 *
 * @param {object} baseline - Bundle whose `run` is the reference result.
 * @param {object} candidate - Bundle whose `run` is being evaluated.
 * @returns {object} Both bundles, the classification, verdict and termination
 *   deltas, an output-changed flag, human-readable notes, numeric deltas
 *   (candidate minus baseline) and the evaluator / tool diffs.
 * @throws {Error} When the runs differ in scenario id or scenario file hash.
 */
function compareRunBundles(baseline, candidate) {
  const base = baseline.run;
  const cand = candidate.run;
  if (base.scenarioId !== cand.scenarioId) {
    throw new Error("Runs can only be compared when they share the same scenario id.");
  }
  if (base.scenarioFileHash !== cand.scenarioFileHash) {
    throw new Error("Runs can only be compared when they share the same scenario file hash.");
  }

  const arrow = (before, after) => `${before} -> ${after}`;
  // The verdict delta is always reported; the termination delta only on change.
  const verdictDelta = arrow(base.status, cand.status);
  const terminationChanged = base.terminationReason !== cand.terminationReason;
  const terminationDelta = terminationChanged ? arrow(base.terminationReason, cand.terminationReason) : undefined;

  const notes = [
    [base.status !== cand.status, `Verdict changed: ${verdictDelta}`],
    [base.score !== cand.score, `Score changed: ${arrow(base.score, cand.score)}`],
    [base.totalSteps !== cand.totalSteps, `Steps changed: ${arrow(base.totalSteps, cand.totalSteps)}`],
    [base.durationMs !== cand.durationMs, `Runtime changed: ${base.durationMs}ms -> ${cand.durationMs}ms`],
    [terminationChanged, `Termination changed: ${terminationDelta}`],
  ]
    .filter(([changed]) => changed)
    .map(([, text]) => text);

  const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
  const toolDiffs = buildToolDiffs(baseline, candidate);
  // A hard-gate regression is a hard-gate evaluator that went from pass to fail.
  const hardGateRegression = evaluatorDiffs.some(
    ({ hardGate, baselineStatus, candidateStatus }) => hardGate && baselineStatus === "pass" && candidateStatus === "fail",
  );

  const deltas = {
    score: cand.score - base.score,
    runtimeMs: cand.durationMs - base.durationMs,
    steps: cand.totalSteps - base.totalSteps,
  };
  // A zero-duration baseline has no meaningful percentage, so report 0.
  deltas.runtimePct = base.durationMs === 0 ? 0 : Math.round((deltas.runtimeMs / base.durationMs) * 100);

  const outputChanged = base.finalOutput !== cand.finalOutput;
  if (outputChanged) {
    notes.push("Final output changed.");
  }

  const classification = classifyComparison({
    baselineStatus: base.status,
    candidateStatus: cand.status,
    scoreDelta: deltas.score,
    runtimePct: deltas.runtimePct,
    stepDelta: deltas.steps,
    hardGateRegression,
  });

  return {
    baseline,
    candidate,
    classification,
    verdictDelta,
    terminationDelta,
    outputChanged,
    notes,
    deltas,
    evaluatorDiffs,
    toolDiffs,
  };
}
630
/**
 * Classifies a baseline/candidate comparison into one of:
 * "regressed", "improved", "unchanged_pass", "unchanged_fail" or
 * "changed_non_terminal".
 *
 * A comparison counts as degraded when a hard-gate evaluator regressed, the
 * score dropped by more than `maxScoreDrop`, runtime grew by more than
 * `maxRuntimeIncreasePct` percent, or the step count grew by more than
 * `maxStepIncrease`.
 *
 * @param {object} input
 * @param {string} input.baselineStatus - Baseline run status.
 * @param {string} input.candidateStatus - Candidate run status.
 * @param {number} input.scoreDelta - Candidate score minus baseline score.
 * @param {number} input.runtimePct - Runtime change in percent of the baseline.
 * @param {number} input.stepDelta - Candidate steps minus baseline steps.
 * @param {boolean} input.hardGateRegression - Whether a hard-gate evaluator regressed.
 * @param {object} [thresholds] - Optional overrides for the tolerances.
 * @param {number} [thresholds.maxScoreDrop=5]
 * @param {number} [thresholds.maxRuntimeIncreasePct=25]
 * @param {number} [thresholds.maxStepIncrease=2]
 * @returns {string} The classification label.
 */
function classifyComparison(input, thresholds = {}) {
  const { maxScoreDrop = 5, maxRuntimeIncreasePct = 25, maxStepIncrease = 2 } = thresholds;
  const { baselineStatus, candidateStatus, scoreDelta, runtimePct, stepDelta, hardGateRegression } = input;

  const degraded =
    Boolean(hardGateRegression) ||
    scoreDelta < -maxScoreDrop ||
    runtimePct > maxRuntimeIncreasePct ||
    stepDelta > maxStepIncrease;

  if (baselineStatus === "pass") {
    if (candidateStatus !== "pass" || degraded) {
      return "regressed";
    }
    // Both runs pass and nothing crossed a tolerance. A small score drop
    // (within tolerance) is still reported as a change rather than "unchanged".
    const withinTolerance =
      scoreDelta >= 0 && runtimePct <= maxRuntimeIncreasePct && stepDelta <= maxStepIncrease;
    return withinTolerance ? "unchanged_pass" : "changed_non_terminal";
  }

  // From here on the baseline did not pass.
  if (candidateStatus === "pass") {
    return "improved";
  }
  // fail -> fail is reported as unchanged regardless of score movement.
  if (baselineStatus === candidateStatus && baselineStatus === "fail") {
    return "unchanged_fail";
  }
  if (scoreDelta > 0) {
    return "improved";
  }
  return degraded ? "regressed" : "changed_non_terminal";
}
657
/**
 * Aggregates a list of run bundles into status counts and rounded averages.
 *
 * @param {Array<{run: {status: string, score: number, durationMs: number, totalSteps: number}}>} runs
 * @returns {{pass: number, fail: number, error: number, averageScore: number,
 *   averageRuntimeMs: number, averageSteps: number}} Counts of runs per status
 *   ("pass", "fail", "error"; any other status is not counted) and the mean
 *   score, duration and step count, each rounded to the nearest integer.
 *   All averages are 0 for an empty list.
 */
function summarizeRuns(runs) {
  // Single pass over the runs instead of one filter per status.
  const counts = { pass: 0, fail: 0, error: 0 };
  for (const { run } of runs) {
    if (Object.hasOwn(counts, run.status)) {
      counts[run.status] += 1;
    }
  }

  // Shared rounded mean; the empty-list guard avoids a 0 / 0 division.
  const roundedMean = (pick) =>
    runs.length === 0 ? 0 : Math.round(runs.reduce((sum, { run }) => sum + pick(run), 0) / runs.length);

  return {
    pass: counts.pass,
    fail: counts.fail,
    error: counts.error,
    averageScore: roundedMean((run) => run.score),
    averageRuntimeMs: roundedMean((run) => run.durationMs),
    averageSteps: roundedMean((run) => run.totalSteps),
  };
}
666
/**
 * Derives the suite name from a scenario id: the text before the first ".".
 *
 * `split` always yields at least one element, so a nullish fallback can never
 * fire; the empty-prefix case (an empty id, or an id starting with ".") is
 * what has to map to "unknown".
 *
 * @param {string} scenarioId - Scenario identifier, e.g. "support.refund".
 * @returns {string} The suite prefix, or "unknown" when the prefix is empty.
 */
function deriveSuiteName(scenarioId) {
  const [prefix] = scenarioId.split(".");
  return prefix === "" ? "unknown" : prefix;
}