npm - agent-regression-lab - Versions diffs - 0.1.1 → 0.3.0 - Mend

agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/README.md +186 -123
package/dist/agent/factory.js +20 -6
package/dist/agent/httpAdapter.js +79 -0
package/dist/agent/mockAdapter.js +210 -13
package/dist/config.js +223 -4
package/dist/conversationEvaluators.js +167 -0
package/dist/conversationRunner.js +199 -0
package/dist/evaluators.js +56 -1
package/dist/index.js +428 -111
package/dist/lib/id.js +6 -0
package/dist/runOutput.js +46 -0
package/dist/runner.js +31 -9
package/dist/scenarios.js +211 -11
package/dist/scoring.js +2 -2
package/dist/storage.js +305 -31
package/dist/tools.js +284 -0
package/dist/trace.js +4 -2
package/dist/ui/App.js +67 -5
package/dist/ui/server.js +18 -0
package/dist/ui-assets/client.js +165 -3
package/docs/agents.md +287 -0
package/docs/golden-suites.md +74 -0
package/docs/integrations-and-live-services.md +58 -0
package/docs/memory-and-stateful-agents.md +51 -0
package/docs/release-checklist.md +94 -0
package/docs/runtime-profiles.md +67 -0
package/docs/scenarios.md +419 -0
package/docs/tools.md +102 -0
package/docs/troubleshooting.md +296 -0
package/docs/variant-sets.md +63 -0
package/package.json +4 -3

package/dist/storage.js CHANGED Viewed

@@ -10,6 +10,10 @@ export class Storage {
         ensureParentDir(DB_PATH);
         this.db = new DatabaseSync(DB_PATH);
         this.db.exec(`
+      PRAGMA journal_mode = WAL;
+      PRAGMA busy_timeout = 5000;
+    `);
+        this.db.exec(`
       CREATE TABLE IF NOT EXISTS metadata (
         key TEXT PRIMARY KEY,
         value TEXT NOT NULL
@@ -35,6 +39,15 @@ export class Storage {
         provider TEXT,
         command TEXT,
         args_json TEXT,
+        variant_set_name TEXT,
+        variant_label TEXT,
+        prompt_version TEXT,
+        model_version TEXT,
+        tool_schema_version TEXT,
+        config_label TEXT,
+        config_hash TEXT,
+        runtime_profile_name TEXT,
+        suite_definition_name TEXT,
         config_json TEXT NOT NULL,
         created_at TEXT NOT NULL
       );
@@ -44,6 +57,16 @@ export class Storage {
         scenario_id TEXT NOT NULL,
         scenario_file_hash TEXT NOT NULL,
         agent_version_id TEXT NOT NULL,
+        suite_batch_id TEXT,
+        variant_set_name TEXT,
+        variant_label TEXT,
+        prompt_version TEXT,
+        model_version TEXT,
+        tool_schema_version TEXT,
+        config_label TEXT,
+        config_hash TEXT,
+        runtime_profile_name TEXT,
+        suite_definition_name TEXT,
         status TEXT NOT NULL,
         termination_reason TEXT NOT NULL,
         final_output TEXT NOT NULL,
@@ -95,6 +118,10 @@ export class Storage {
     `);
         this.ensureSchemaVersion();
         this.ensureAgentVersionColumns();
+        this.ensureRunColumns();
+    }
+    /**
+     * Closes the database handle held in `this.db`.
+     */
+    close() {
+        this.db.close();
     }
     upsertScenario(summary, definition, filePath, fileHash) {
         const now = new Date().toISOString();
@@ -115,25 +142,41 @@ export class Storage {
+    /**
+     * Inserts a row into `agent_versions`, or updates the existing row when a row
+     * with the same `id` already exists.
+     *
+     * On conflict every column except `id` and `created_at` is overwritten, so the
+     * timestamp written by the first insert is kept. Optional fields that are
+     * null/undefined are stored as NULL. `args` (defaulting to an empty array) and
+     * `config` are stored as JSON text.
+     *
+     * @param {object} agentVersion - Record to persist. Reads `id`, `label`, `modelId`,
+     *   `provider`, `command`, `args`, `variantSetName`, `variantLabel`, `promptVersion`,
+     *   `modelVersion`, `toolSchemaVersion`, `configLabel`, `configHash`,
+     *   `runtimeProfileName`, `suiteDefinitionName` and `config`.
+     */
     upsertAgentVersion(agentVersion) {
         const now = new Date().toISOString();
         this.db
-            .prepare(`INSERT INTO agent_versions (id, label, model_id, provider, command, args_json, config_json, created_at)
-         VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            .prepare(`INSERT INTO agent_versions (
+           id, label, model_id, provider, command, args_json,
+           variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
+           config_label, config_hash, runtime_profile_name, suite_definition_name,
+           config_json, created_at
+         )
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
          ON CONFLICT(id) DO UPDATE SET
            label = excluded.label,
            model_id = excluded.model_id,
            provider = excluded.provider,
            command = excluded.command,
            args_json = excluded.args_json,
+           variant_set_name = excluded.variant_set_name,
+           variant_label = excluded.variant_label,
+           prompt_version = excluded.prompt_version,
+           model_version = excluded.model_version,
+           tool_schema_version = excluded.tool_schema_version,
+           config_label = excluded.config_label,
+           config_hash = excluded.config_hash,
+           runtime_profile_name = excluded.runtime_profile_name,
+           suite_definition_name = excluded.suite_definition_name,
            config_json = excluded.config_json`)
-            .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
+            .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
     }
     saveRun(bundle) {
         const run = bundle.run;
         this.db
             .prepare(`INSERT INTO runs (
           id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
+          suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
+          config_label, config_hash, runtime_profile_name, suite_definition_name,
           total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
-            .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
+            .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
         const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
        VALUES (?, ?, ?, ?, ?, ?, ?)`);
         const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
@@ -177,6 +220,8 @@ export class Storage {
         const whereClause = clauses.length > 0 ? `WHERE ${clauses.join(" AND ")}` : "";
         return this.db
             .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
+                r.suite_batch_id as suiteBatchId,
+                r.variant_set_name as variantSetName, r.variant_label as variantLabel,
                 av.label as agentLabel, av.provider, av.model_id as modelId,
                 r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
                 r.started_at as startedAt
@@ -238,6 +283,11 @@ export class Storage {
         }));
         const agentVersion = this.db
             .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
+                , variant_set_name as variantSetName, variant_label as variantLabel,
+                prompt_version as promptVersion, model_version as modelVersion,
+                tool_schema_version as toolSchemaVersion, config_label as configLabel,
+                config_hash as configHash, runtime_profile_name as runtimeProfileName,
+                suite_definition_name as suiteDefinitionName
          FROM agent_versions WHERE id = ?`)
             .get(run.agentVersionId);
         return {
@@ -253,6 +303,15 @@ export class Storage {
                     provider: agentVersion.provider ?? undefined,
                     command: agentVersion.command ?? undefined,
                     args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
+                    variantSetName: agentVersion.variantSetName ?? undefined,
+                    variantLabel: agentVersion.variantLabel ?? undefined,
+                    promptVersion: agentVersion.promptVersion ?? undefined,
+                    modelVersion: agentVersion.modelVersion ?? undefined,
+                    toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
+                    configLabel: agentVersion.configLabel ?? undefined,
+                    configHash: agentVersion.configHash ?? undefined,
+                    runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
+                    suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
                     config: JSON.parse(agentVersion.config_json),
                 }
                 : undefined,
@@ -267,46 +326,85 @@ export class Storage {
         if (!candidate) {
             throw new Error(`Run '${candidateRunId}' not found.`);
         }
-        if (baseline.run.scenarioId !== candidate.run.scenarioId) {
-            throw new Error("Runs can only be compared when they share the same scenario id.");
+        return compareRunBundles(baseline, candidate);
+    }
+    /**
+     * Compares two suite batches scenario by scenario and summarizes the outcome.
+     *
+     * Runs are loaded with `getRunsBySuiteBatchId`; the suite of a run is derived
+     * from its scenario id via `deriveSuiteName`. Scenarios present in both batches
+     * are compared with `compareRunBundles` (whose errors propagate to the caller),
+     * while scenarios present in only one batch are reported as missing.
+     *
+     * The overall classification is "regressed" when any shared scenario regressed,
+     * otherwise "improved" when any improved, otherwise "mixed" when a note was
+     * recorded (at that point only missing-scenario notes remain possible),
+     * otherwise "unchanged".
+     *
+     * @param {string} baselineBatchId - Suite batch id used as the baseline.
+     * @param {string} candidateBatchId - Suite batch id being evaluated.
+     * @returns {object} Summary with `suite`, both batch ids, `classification`, `notes`,
+     *   aggregate `deltas`, and the `regressions`, `improvements`, `unchanged`,
+     *   `missingFromCandidate` and `missingFromBaseline` lists.
+     * @throws {Error} When either batch has no runs, when a batch spans more than one
+     *   suite, or when the two batches belong to different suites.
+     */
+    compareSuites(baselineBatchId, candidateBatchId) {
+        const baselineRuns = this.getRunsBySuiteBatchId(baselineBatchId);
+        const candidateRuns = this.getRunsBySuiteBatchId(candidateBatchId);
+        if (baselineRuns.length === 0) {
+            throw new Error(`No runs found for suite batch '${baselineBatchId}'.`);
         }
-        if (baseline.run.scenarioFileHash !== candidate.run.scenarioFileHash) {
-            throw new Error("Runs can only be compared when they share the same scenario file hash.");
+        if (candidateRuns.length === 0) {
+            throw new Error(`No runs found for suite batch '${candidateBatchId}'.`);
         }
-        const notes = [];
-        if (baseline.run.status !== candidate.run.status) {
-            notes.push(`Verdict changed: ${baseline.run.status} -> ${candidate.run.status}`);
+        const baselineSuites = new Set(baselineRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
+        const candidateSuites = new Set(candidateRuns.map((bundle) => deriveSuiteName(bundle.run.scenarioId)));
+        if (baselineSuites.size !== 1) {
+            throw new Error(`Suite batch '${baselineBatchId}' contains runs from multiple suites.`);
         }
-        if (baseline.run.score !== candidate.run.score) {
-            notes.push(`Score changed: ${baseline.run.score} -> ${candidate.run.score}`);
+        if (candidateSuites.size !== 1) {
+            throw new Error(`Suite batch '${candidateBatchId}' contains runs from multiple suites.`);
         }
-        if (baseline.run.totalSteps !== candidate.run.totalSteps) {
-            notes.push(`Steps changed: ${baseline.run.totalSteps} -> ${candidate.run.totalSteps}`);
+        const suite = [...baselineSuites][0] ?? "unknown";
+        const candidateSuite = [...candidateSuites][0] ?? "unknown";
+        if (suite !== candidateSuite) {
+            throw new Error(`Suite batches can only be compared when they share the same suite. Got '${suite}' and '${candidateSuite}'.`);
         }
-        if (baseline.run.durationMs !== candidate.run.durationMs) {
-            notes.push(`Runtime changed: ${baseline.run.durationMs}ms -> ${candidate.run.durationMs}ms`);
+        // Keyed by scenario id: if a batch holds several runs for one scenario, the
+        // last one in the array replaces the earlier ones in the map.
+        const baselineMap = new Map(baselineRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
+        const candidateMap = new Map(candidateRuns.map((bundle) => [bundle.run.scenarioId, bundle]));
+        const sharedScenarioIds = [...baselineMap.keys()].filter((scenarioId) => candidateMap.has(scenarioId)).sort();
+        const comparisons = sharedScenarioIds.map((scenarioId) => ({
+            scenarioId,
+            comparison: compareRunBundles(baselineMap.get(scenarioId), candidateMap.get(scenarioId)),
+        }));
+        const regressions = comparisons.filter((entry) => entry.comparison.classification === "regressed");
+        const improvements = comparisons.filter((entry) => entry.comparison.classification === "improved");
+        const unchanged = comparisons.filter((entry) => !["regressed", "improved"].includes(entry.comparison.classification));
+        // Aggregate stats cover every run in each batch, not only the shared scenarios.
+        const baselineStats = summarizeRuns(baselineRuns);
+        const candidateStats = summarizeRuns(candidateRuns);
+        const missingFromCandidate = [...baselineMap.keys()].filter((scenarioId) => !candidateMap.has(scenarioId)).sort();
+        const missingFromBaseline = [...candidateMap.keys()].filter((scenarioId) => !baselineMap.has(scenarioId)).sort();
+        const notes = [];
+        if (regressions.length > 0) {
+            notes.push(`${regressions.length} scenario regressions detected.`);
         }
-        if (baseline.run.terminationReason !== candidate.run.terminationReason) {
-            notes.push(`Termination changed: ${baseline.run.terminationReason} -> ${candidate.run.terminationReason}`);
+        if (improvements.length > 0) {
+            notes.push(`${improvements.length} scenario improvements detected.`);
+        }
+        if (missingFromCandidate.length > 0) {
+            notes.push(`${missingFromCandidate.length} scenarios missing from candidate batch.`);
+        }
+        if (missingFromBaseline.length > 0) {
+            notes.push(`${missingFromBaseline.length} scenarios missing from baseline batch.`);
         }
-        const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
-        const toolDiffs = buildToolDiffs(baseline, candidate);
         return {
-            baseline,
-            candidate,
+            suite,
+            baselineBatchId,
+            candidateBatchId,
+            classification: regressions.length > 0 ? "regressed" : improvements.length > 0 ? "improved" : notes.length > 0 ? "mixed" : "unchanged",
             notes,
             deltas: {
-                score: candidate.run.score - baseline.run.score,
-                runtimeMs: candidate.run.durationMs - baseline.run.durationMs,
-                steps: candidate.run.totalSteps - baseline.run.totalSteps,
+                pass: candidateStats.pass - baselineStats.pass,
+                fail: candidateStats.fail - baselineStats.fail,
+                error: candidateStats.error - baselineStats.error,
+                averageScore: candidateStats.averageScore - baselineStats.averageScore,
+                averageRuntimeMs: candidateStats.averageRuntimeMs - baselineStats.averageRuntimeMs,
+                averageSteps: candidateStats.averageSteps - baselineStats.averageSteps,
             },
-            evaluatorDiffs,
-            toolDiffs,
+            regressions,
+            improvements,
+            unchanged,
+            missingFromCandidate,
+            missingFromBaseline,
         };
     }
     getRunRecord(runId) {
         return (this.db
             .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
+                  suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
+                  prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
+                  config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
+                  suite_definition_name as suiteDefinitionName,
                   status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
                   total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
                   total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
@@ -346,6 +444,75 @@ export class Storage {
         if (!names.has("args_json")) {
             this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
         }
+        if (!names.has("variant_set_name")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
+        }
+        if (!names.has("variant_label")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
+        }
+        if (!names.has("prompt_version")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
+        }
+        if (!names.has("model_version")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
+        }
+        if (!names.has("tool_schema_version")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
+        }
+        if (!names.has("config_label")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
+        }
+        if (!names.has("config_hash")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
+        }
+        if (!names.has("runtime_profile_name")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
+        }
+        if (!names.has("suite_definition_name")) {
+            this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
+        }
+    }
+    /**
+     * Adds any variant/suite tracking columns that the `runs` table is missing.
+     *
+     * Reads the current column names with `PRAGMA table_info(runs)` and issues one
+     * `ALTER TABLE ... ADD COLUMN <name> TEXT` per absent column, in the order
+     * listed below.
+     */
+    ensureRunColumns() {
+        const present = new Set();
+        for (const info of this.db.prepare(`PRAGMA table_info(runs)`).all()) {
+            present.add(info.name);
+        }
+        const requiredColumns = [
+            "suite_batch_id",
+            "variant_set_name",
+            "variant_label",
+            "prompt_version",
+            "model_version",
+            "tool_schema_version",
+            "config_label",
+            "config_hash",
+            "runtime_profile_name",
+            "suite_definition_name",
+        ];
+        for (const columnName of requiredColumns) {
+            if (present.has(columnName)) {
+                continue;
+            }
+            this.db.exec(`ALTER TABLE runs ADD COLUMN ${columnName} TEXT`);
+        }
+    }
+    /**
+     * Loads the full run bundle for every run recorded under a suite batch.
+     *
+     * Run ids are selected in ascending `scenario_id` order and resolved one by
+     * one through `getRun`; ids for which `getRun` returns null are dropped.
+     *
+     * @param {string} suiteBatchId - Value matched against `runs.suite_batch_id`.
+     * @returns {Array<object>} Run bundles in query order (possibly empty).
+     */
+    getRunsBySuiteBatchId(suiteBatchId) {
+        const selectIds = this.db.prepare(`SELECT id FROM runs WHERE suite_batch_id = ? ORDER BY scenario_id ASC`);
+        const bundles = [];
+        for (const row of selectIds.all(suiteBatchId)) {
+            const bundle = this.getRun(row.id);
+            if (bundle !== null) {
+                bundles.push(bundle);
+            }
+        }
+        return bundles;
     }
 }
 function buildEvaluatorDiffs(baseline, candidate) {
@@ -361,14 +528,18 @@ function buildEvaluatorDiffs(baseline, candidate) {
         if (baselineResult?.status === candidateResult?.status) {
             return null;
         }
+        const hardGate = baselineResult?.mode === "hard_gate" || candidateResult?.mode === "hard_gate";
         return {
             evaluatorId,
+            hardGate,
+            weight: candidateResult?.weight ?? baselineResult?.weight,
             baselineStatus: baselineResult?.status,
             candidateStatus: candidateResult?.status,
             note: `Evaluator '${evaluatorId}' changed: ${baselineResult?.status ?? "missing"} -> ${candidateResult?.status ?? "missing"}`,
         };
     })
-        .filter((diff) => diff !== null);
+        .filter((diff) => diff !== null)
+        .sort((left, right) => Number(right.hardGate) - Number(left.hardGate) || left.evaluatorId.localeCompare(right.evaluatorId));
 }
 function buildToolDiffs(baseline, candidate) {
     const toolNames = new Set([
@@ -383,12 +554,115 @@ function buildToolDiffs(baseline, candidate) {
         if (baselineCount === candidateCount) {
             return null;
         }
-        return {
+        const diff = {
             toolName,
             baselineCount,
             candidateCount,
+            risk: baselineCount === 0 && candidateCount > 0 ? "new_tool" : "none",
             note: `Tool '${toolName}' usage changed: ${baselineCount} -> ${candidateCount}`,
         };
+        return diff;
     })
         .filter((diff) => diff !== null);
 }
+/**
+ * Compares a baseline run bundle with a candidate run bundle of the same scenario.
+ *
+ * @param {object} baseline - Bundle whose `run` is the reference result.
+ * @param {object} candidate - Bundle whose `run` is being evaluated.
+ * @returns {object} Both bundles plus `classification`, `verdictDelta`,
+ *   `terminationDelta` (undefined when unchanged), `outputChanged`, human-readable
+ *   `notes`, numeric `deltas`, `evaluatorDiffs` and `toolDiffs`.
+ * @throws {Error} When the runs differ in scenario id or scenario file hash.
+ */
+function compareRunBundles(baseline, candidate) {
+    const before = baseline.run;
+    const after = candidate.run;
+    if (before.scenarioId !== after.scenarioId) {
+        throw new Error("Runs can only be compared when they share the same scenario id.");
+    }
+    if (before.scenarioFileHash !== after.scenarioFileHash) {
+        throw new Error("Runs can only be compared when they share the same scenario file hash.");
+    }
+    // One note per tracked field that differs, emitted in this fixed order.
+    const trackedFields = [
+        ["Verdict", "status", ""],
+        ["Score", "score", ""],
+        ["Steps", "totalSteps", ""],
+        ["Runtime", "durationMs", "ms"],
+        ["Termination", "terminationReason", ""],
+    ];
+    const notes = [];
+    for (const [label, field, unit] of trackedFields) {
+        if (before[field] !== after[field]) {
+            notes.push(`${label} changed: ${before[field]}${unit} -> ${after[field]}${unit}`);
+        }
+    }
+    const evaluatorDiffs = buildEvaluatorDiffs(baseline, candidate);
+    const toolDiffs = buildToolDiffs(baseline, candidate);
+    // Only a pass -> fail flip on a hard-gate evaluator counts as a hard-gate regression.
+    const hardGateRegression = evaluatorDiffs.some((diff) => {
+        return diff.hardGate && diff.baselineStatus === "pass" && diff.candidateStatus === "fail";
+    });
+    const deltas = {
+        score: after.score - before.score,
+        runtimeMs: after.durationMs - before.durationMs,
+        steps: after.totalSteps - before.totalSteps,
+        runtimePct: 0,
+    };
+    // A zero-duration baseline has no meaningful percentage, so it stays at 0.
+    if (before.durationMs !== 0) {
+        deltas.runtimePct = Math.round((deltas.runtimeMs / before.durationMs) * 100);
+    }
+    const outputChanged = before.finalOutput !== after.finalOutput;
+    if (outputChanged) {
+        notes.push("Final output changed.");
+    }
+    const classification = classifyComparison({
+        baselineStatus: before.status,
+        candidateStatus: after.status,
+        scoreDelta: deltas.score,
+        runtimePct: deltas.runtimePct,
+        stepDelta: deltas.steps,
+        hardGateRegression,
+    });
+    let terminationDelta;
+    if (before.terminationReason !== after.terminationReason) {
+        terminationDelta = `${before.terminationReason} -> ${after.terminationReason}`;
+    }
+    return {
+        baseline,
+        candidate,
+        classification,
+        verdictDelta: `${before.status} -> ${after.status}`,
+        terminationDelta,
+        outputChanged,
+        notes,
+        deltas,
+        evaluatorDiffs,
+        toolDiffs,
+    };
+}
+/**
+ * Maps the deltas between a baseline and a candidate run to a classification label.
+ *
+ * A comparison counts as "degraded" when the score dropped by more than 5, the
+ * runtime grew by more than 25%, more than 2 extra steps were taken, or a
+ * hard-gate evaluator regressed.
+ *
+ * @param {object} input - `baselineStatus`, `candidateStatus`, `scoreDelta`,
+ *   `runtimePct`, `stepDelta` and `hardGateRegression`.
+ * @returns {string} One of "regressed", "improved", "unchanged_pass",
+ *   "unchanged_fail" or "changed_non_terminal".
+ */
+function classifyComparison(input) {
+    const { baselineStatus, candidateStatus, scoreDelta, runtimePct, stepDelta, hardGateRegression } = input;
+    const degraded = scoreDelta < -5 || runtimePct > 25 || stepDelta > 2 || hardGateRegression;
+    if (baselineStatus === "pass") {
+        if (candidateStatus !== "pass" || degraded) {
+            return "regressed";
+        }
+        // Still passing and within every tolerance: only a non-negative score delta counts as unchanged.
+        if (scoreDelta >= 0 && runtimePct <= 25 && stepDelta <= 2) {
+            return "unchanged_pass";
+        }
+        return "changed_non_terminal";
+    }
+    if (candidateStatus === "pass") {
+        return "improved";
+    }
+    // From here on neither run passed.
+    if (baselineStatus === "fail" && candidateStatus === "fail") {
+        return "unchanged_fail";
+    }
+    if (scoreDelta > 0) {
+        return "improved";
+    }
+    if (degraded) {
+        return "regressed";
+    }
+    return "changed_non_terminal";
+}
+/**
+ * Aggregates status counts and rounded averages over a list of run bundles.
+ *
+ * @param {Array<object>} runs - Bundles; each `run` supplies `status`, `score`,
+ *   `durationMs` and `totalSteps`.
+ * @returns {{pass: number, fail: number, error: number, averageScore: number,
+ *   averageRuntimeMs: number, averageSteps: number}} Counts of "pass"/"fail"/"error"
+ *   runs and averages rounded with `Math.round`; all averages are 0 for an empty list.
+ */
+function summarizeRuns(runs) {
+    let pass = 0;
+    let fail = 0;
+    let error = 0;
+    let scoreTotal = 0;
+    let runtimeTotal = 0;
+    let stepTotal = 0;
+    for (const bundle of runs) {
+        const current = bundle.run;
+        // Statuses other than these three are not counted but still feed the averages.
+        switch (current.status) {
+            case "pass":
+                pass += 1;
+                break;
+            case "fail":
+                fail += 1;
+                break;
+            case "error":
+                error += 1;
+                break;
+            default:
+                break;
+        }
+        scoreTotal += current.score;
+        runtimeTotal += current.durationMs;
+        stepTotal += current.totalSteps;
+    }
+    const count = runs.length;
+    const roundedMean = (total) => (count === 0 ? 0 : Math.round(total / count));
+    return {
+        pass,
+        fail,
+        error,
+        averageScore: roundedMean(scoreTotal),
+        averageRuntimeMs: roundedMean(runtimeTotal),
+        averageSteps: roundedMean(stepTotal),
+    };
+}
+/**
+ * Derives the suite name from a scenario id: the text before the first ".".
+ *
+ * @param {string} scenarioId - Scenario identifier such as "support.refund".
+ * @returns {string} The leading segment, or "unknown" when that segment is empty.
+ */
+function deriveSuiteName(scenarioId) {
+    const [prefix] = scenarioId.split(".");
+    // split() always yields at least one element, so a nullish check can never
+    // trigger; an empty leading segment is the only "no suite" case to catch.
+    return prefix || "unknown";
+}