npm - @evalgate/sdk - Versions diffs - 2.2.2 → 2.2.4 - Mend

@evalgate/sdk 2.2.2 → 2.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

package/CHANGELOG.md +32 -0
package/README.md +40 -1
package/dist/assertions.d.ts +194 -10
package/dist/assertions.js +525 -73
package/dist/batch.js +4 -4
package/dist/cache.d.ts +5 -1
package/dist/cache.js +5 -1
package/dist/cli/baseline.d.ts +14 -0
package/dist/cli/baseline.js +43 -3
package/dist/cli/check.d.ts +5 -2
package/dist/cli/check.js +20 -12
package/dist/cli/compare.d.ts +80 -0
package/dist/cli/compare.js +266 -0
package/dist/cli/index.js +244 -101
package/dist/cli/regression-gate.js +23 -0
package/dist/cli/run.js +22 -0
package/dist/cli/start.d.ts +26 -0
package/dist/cli/start.js +130 -0
package/dist/cli/templates.d.ts +24 -0
package/dist/cli/templates.js +314 -0
package/dist/cli/traces.d.ts +109 -0
package/dist/cli/traces.js +152 -0
package/dist/cli/upgrade.js +5 -0
package/dist/cli/validate.d.ts +37 -0
package/dist/cli/validate.js +252 -0
package/dist/cli/watch.d.ts +19 -0
package/dist/cli/watch.js +175 -0
package/dist/client.js +6 -13
package/dist/constants.d.ts +2 -0
package/dist/constants.js +5 -0
package/dist/errors.js +7 -0
package/dist/export.js +2 -2
package/dist/index.d.ts +10 -9
package/dist/index.js +24 -7
package/dist/integrations/anthropic.js +6 -6
package/dist/integrations/openai.js +84 -61
package/dist/logger.d.ts +3 -1
package/dist/logger.js +2 -1
package/dist/otel.d.ts +130 -0
package/dist/otel.js +309 -0
package/dist/pagination.d.ts +13 -2
package/dist/pagination.js +28 -2
package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
package/dist/runtime/eval.d.ts +14 -4
package/dist/runtime/eval.js +127 -2
package/dist/runtime/executor.d.ts +3 -2
package/dist/runtime/executor.js +3 -2
package/dist/runtime/registry.d.ts +8 -3
package/dist/runtime/registry.js +15 -4
package/dist/runtime/run-report.d.ts +1 -1
package/dist/runtime/run-report.js +7 -4
package/dist/runtime/types.d.ts +38 -0
package/dist/snapshot.d.ts +12 -0
package/dist/snapshot.js +24 -1
package/dist/testing.d.ts +8 -0
package/dist/testing.js +45 -10
package/dist/version.d.ts +2 -2
package/dist/version.js +2 -2
package/dist/workflows.d.ts +2 -0
package/dist/workflows.js +184 -102
package/package.json +8 -1

package/dist/batch.js CHANGED Viewed

@@ -163,15 +163,15 @@ function canBatch(method, endpoint) {
  */
 async function batchProcess(items, processor, concurrency = 5) {
     const results = [];
-    const executing = [];
+    const executing = new Set();
     for (const item of items) {
         const promise = processor(item).then((result) => {
             results.push(result);
         });
-        executing.push(promise);
-        if (executing.length >= concurrency) {
+        const tracked = promise.finally(() => executing.delete(tracked));
+        executing.add(tracked);
+        if (executing.size >= concurrency) {
             await Promise.race(executing);
-            executing.splice(executing.indexOf(promise), 1);
         }
     }
     await Promise.all(executing);

package/dist/cache.d.ts CHANGED Viewed

@@ -2,6 +2,10 @@
  * Simple in-memory cache with TTL for SDK requests
  * Reduces redundant API calls and improves performance
  */
+/**
+ * @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
+ * Use {@link CacheTTL} to configure cache durations via client options.
+ */
 export declare class RequestCache {
     private cache;
     private maxSize;
@@ -21,7 +25,7 @@ export declare class RequestCache {
     /**
      * Store response in cache
      */
-    set<T>(method: string, url: string, data: T, ttl: number, params?: unknown): void;
+    set<T>(method: string, url: string, data: T, ttl?: number, params?: unknown): void;
     /**
      * Invalidate specific cache entry
      */

package/dist/cache.js CHANGED Viewed

@@ -7,6 +7,10 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.CacheTTL = exports.RequestCache = void 0;
 exports.shouldCache = shouldCache;
 exports.getTTL = getTTL;
+/**
+ * @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
+ * Use {@link CacheTTL} to configure cache durations via client options.
+ */
 class RequestCache {
     constructor(maxSize = 1000) {
         this.cache = new Map();
@@ -43,7 +47,7 @@ class RequestCache {
     /**
      * Store response in cache
      */
-    set(method, url, data, ttl, params) {
+    set(method, url, data, ttl = exports.CacheTTL.MEDIUM, params) {
         // Enforce cache size limit (LRU-style)
         if (this.cache.size >= this.maxSize) {
             const firstKey = this.cache.keys().next().value;

package/dist/cli/baseline.d.ts CHANGED Viewed

@@ -5,6 +5,20 @@
  *   evalgate baseline init    — Create a starter evals/baseline.json
  *   evalgate baseline update  — Run tests + update baseline with real scores
  */
+/**
+ * Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
+ * Intended to detect accidental corruption or manual tampering between runs.
+ *
+ * @param data Parsed baseline object, with or without a `_checksum` field.
+ * @returns Hex-encoded SHA-256 digest of the serialized baseline content.
+ */
+export declare function computeBaselineChecksum(data: Record<string, unknown>): string;
+/**
+ * Verify the checksum stored in a baseline file matches its content.
+ * Returns { valid: true } if the checksum matches.
+ * Returns { valid: true, reason: "no_checksum" } if no string checksum is stored (legacy files).
+ * Returns { valid: false, reason } if checksum is present but doesn't match.
+ *
+ * @param data Parsed baseline object, including its `_checksum` field if any.
+ */
+export declare function verifyBaselineChecksum(data: Record<string, unknown>): {
+    valid: boolean;
+    reason?: string;
+};
 export declare function runBaselineInit(cwd: string): number;
 export declare function runBaselineUpdate(cwd: string): number;
 export declare function runBaseline(argv: string[]): number;

package/dist/cli/baseline.js CHANGED Viewed

@@ -40,12 +40,45 @@ var __importStar = (this && this.__importStar) || (function () {
     };
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
+exports.computeBaselineChecksum = computeBaselineChecksum;
+exports.verifyBaselineChecksum = verifyBaselineChecksum;
 exports.runBaselineInit = runBaselineInit;
 exports.runBaselineUpdate = runBaselineUpdate;
 exports.runBaseline = runBaseline;
 const node_child_process_1 = require("node:child_process");
+const crypto = __importStar(require("node:crypto"));
 const fs = __importStar(require("node:fs"));
 const path = __importStar(require("node:path"));
+/**
+ * Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
+ * Intended to detect accidental corruption or manual tampering between runs.
+ *
+ * The input is shallow-copied first, so the caller's object is not mutated
+ * when the `_checksum` property is removed.
+ *
+ * NOTE(review): the second argument to JSON.stringify is an array replacer.
+ * An array replacer is a property allow-list applied at every nesting level,
+ * not just a key ordering for the top level. Nested properties whose names are
+ * not also top-level keys are therefore left out of the hashed content, so
+ * edits to them would not change the checksum. Confirm whether that is
+ * intended; changing the serialization would invalidate checksums that are
+ * already stamped into existing baseline files.
+ *
+ * @param data Parsed baseline object, with or without a `_checksum` field.
+ * @returns Hex-encoded SHA-256 digest of the serialized content.
+ */
+function computeBaselineChecksum(data) {
+    const copy = { ...data };
+    delete copy._checksum; // the stored checksum must not feed into its own hash
+    const content = JSON.stringify(copy, Object.keys(copy).sort());
+    return crypto.createHash("sha256").update(content).digest("hex");
+}
+/**
+ * Verify the checksum stored in a baseline file matches its content.
+ * Returns { valid: true } if the checksum matches.
+ * Returns { valid: true, reason: "no_checksum" } if `_checksum` is absent or
+ * is not a string (legacy files).
+ * Returns { valid: false, reason } if checksum is present but doesn't match.
+ *
+ * The mismatch reason only includes the first 12 characters of the stored and
+ * the recomputed digest. This function does not log anything itself.
+ *
+ * @param data Parsed baseline object, including its `_checksum` field if any.
+ */
+function verifyBaselineChecksum(data) {
+    const stored = data._checksum;
+    if (typeof stored !== "string") {
+        // Legacy baseline without checksum — accepted; the "no_checksum" reason lets the caller decide whether to warn
+        return { valid: true, reason: "no_checksum" };
+    }
+    const computed = computeBaselineChecksum(data);
+    if (computed !== stored) {
+        return {
+            valid: false,
+            reason: `Checksum mismatch: expected ${stored.slice(0, 12)}…, got ${computed.slice(0, 12)}…. Baseline may be corrupted or tampered with.`,
+        };
+    }
+    return { valid: true };
+}
 const BASELINE_REL = "evals/baseline.json";
 /** Detect the package manager used in the project */
 function detectPackageManager(cwd) {
@@ -116,8 +149,13 @@ function runBaselineInit(cwd) {
         },
         productMetrics: {},
     };
-    fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
-    console.log(`✅ Created ${BASELINE_REL} with sample values\n`);
+    // Stamp checksum
+    const withChecksum = {
+        ...baseline,
+        _checksum: computeBaselineChecksum(baseline),
+    };
+    fs.writeFileSync(baselinePath, `${JSON.stringify(withChecksum, null, 2)}\n`);
+    console.log(`✅ Created ${BASELINE_REL} with sample values (checksum stamped)\n`);
     console.log("Next steps:");
     console.log(`  1. Commit ${BASELINE_REL} to your repo`);
     console.log("  2. Run 'evalgate baseline update' to populate with real scores");
@@ -164,8 +202,10 @@ function runBaselineUpdate(cwd) {
         baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
         baseline.confidenceTests = baseline.confidenceTests ?? {};
         baseline.confidenceTests.unitPassed = testResult.status === 0;
+        // Re-stamp checksum
+        baseline._checksum = computeBaselineChecksum(baseline);
         fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
-        console.log("\n✅ Baseline updated successfully");
+        console.log("\n✅ Baseline updated successfully (checksum stamped)");
     }
     catch {
         console.error("❌ Failed to update baseline file");

package/dist/cli/check.d.ts CHANGED Viewed

@@ -16,12 +16,13 @@
  *   --policy <name>      Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
  *   --baseline <mode>   Baseline comparison mode: "published" (default), "previous", or "production"
  *   --evaluationId <id>  Required. The evaluation to gate on.
- *   --baseUrl <url>      API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
+ *   --baseUrl <url>      API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
  *   --apiKey <key>       API key (default: EVALGATE_API_KEY env var)
  *   --share <mode>       Share link: "always" | "fail" | "never" (default: never)
  *                        fail = create public share link only when gate fails (CI-friendly)
  *   --pr-comment-out <file>  Write PR comment markdown to file (for GitHub Action to post)
  *   --profile <name>         Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
+ *   --dry-run               Run all checks and print results, but always exit 0
  *
  * Exit codes:
  *   0  — Gate passed
@@ -35,7 +36,7 @@
  *   8  — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
  *
  * Environment:
- *   EVALGATE_BASE_URL  — API base URL (default: http://localhost:3000)
+ *   EVALGATE_BASE_URL  — API base URL (default: https://api.evalgate.com)
  *   EVALGATE_API_KEY   — API key for authentication
  */
 export { EXIT } from "./constants";
@@ -60,6 +61,8 @@ export interface CheckArgs {
     maxCostUsd?: number;
     maxLatencyMs?: number;
     maxCostDeltaUsd?: number;
+    /** When true, run all checks and print results but always exit 0. */
+    dryRun?: boolean;
 }
 export type ParseArgsResult = {
     ok: true;

package/dist/cli/check.js CHANGED Viewed

@@ -17,12 +17,13 @@
  *   --policy <name>      Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
  *   --baseline <mode>   Baseline comparison mode: "published" (default), "previous", or "production"
  *   --evaluationId <id>  Required. The evaluation to gate on.
- *   --baseUrl <url>      API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
+ *   --baseUrl <url>      API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
  *   --apiKey <key>       API key (default: EVALGATE_API_KEY env var)
  *   --share <mode>       Share link: "always" | "fail" | "never" (default: never)
  *                        fail = create public share link only when gate fails (CI-friendly)
  *   --pr-comment-out <file>  Write PR comment markdown to file (for GitHub Action to post)
  *   --profile <name>         Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
+ *   --dry-run               Run all checks and print results, but always exit 0
  *
  * Exit codes:
  *   0  — Gate passed
@@ -36,7 +37,7 @@
  *   8  — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
  *
  * Environment:
- *   EVALGATE_BASE_URL  — API base URL (default: http://localhost:3000)
+ *   EVALGATE_BASE_URL  — API base URL (default: https://api.evalgate.com)
  *   EVALGATE_API_KEY   — API key for authentication
  */
 var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
@@ -78,18 +79,19 @@ exports.parseArgs = parseArgs;
 exports.runCheck = runCheck;
 const fs = __importStar(require("node:fs"));
 const path = __importStar(require("node:path"));
+const constants_1 = require("../constants");
 const api_1 = require("./api");
 const ci_context_1 = require("./ci-context");
 const config_1 = require("./config");
-const constants_1 = require("./constants");
+const constants_2 = require("./constants");
 const github_1 = require("./formatters/github");
 const human_1 = require("./formatters/human");
 const json_1 = require("./formatters/json");
 const pr_comment_1 = require("./formatters/pr-comment");
 const gate_1 = require("./gate");
 const build_check_report_1 = require("./report/build-check-report");
-var constants_2 = require("./constants");
-Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_2.EXIT; } });
+var constants_3 = require("./constants");
+Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_3.EXIT; } });
 function parseArgs(argv) {
     const args = {};
     for (let i = 0; i < argv.length; i++) {
@@ -106,7 +108,7 @@ function parseArgs(argv) {
             }
         }
     }
-    let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || "http://localhost:3000";
+    let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || constants_1.DEFAULT_BASE_URL;
     const apiKey = args.apiKey ||
         process.env.EVALGATE_API_KEY ||
         process.env.EVALAI_API_KEY ||
@@ -122,6 +124,7 @@ function parseArgs(argv) {
     const format = formatRaw === "json" ? "json" : formatRaw === "github" ? "github" : "human";
     const explain = args.explain === "true" || args.explain === "1";
     const onFail = args.onFail === "import" ? "import" : undefined;
+    const dryRun = args["dry-run"] === "true" || args.dryRun === "true";
     const shareRaw = args.share || "never";
     const share = shareRaw === "always" ? "always" : shareRaw === "fail" ? "fail" : "never";
     const prCommentOut = args["pr-comment-out"] || args.prCommentOut || undefined;
@@ -176,28 +179,28 @@ function parseArgs(argv) {
     if (!apiKey) {
         return {
             ok: false,
-            exitCode: constants_1.EXIT.BAD_ARGS,
+            exitCode: constants_2.EXIT.BAD_ARGS,
             message: "Error: --apiKey or EVALGATE_API_KEY is required",
         };
     }
     if (!evaluationId) {
         return {
             ok: false,
-            exitCode: constants_1.EXIT.BAD_ARGS,
+            exitCode: constants_2.EXIT.BAD_ARGS,
             message: "Run npx evalgate init and paste your evaluationId, or pass --evaluationId.",
         };
     }
     if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
         return {
             ok: false,
-            exitCode: constants_1.EXIT.BAD_ARGS,
+            exitCode: constants_2.EXIT.BAD_ARGS,
             message: "Error: --minScore must be 0-100",
         };
     }
     if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
         return {
             ok: false,
-            exitCode: constants_1.EXIT.BAD_ARGS,
+            exitCode: constants_2.EXIT.BAD_ARGS,
             message: "Error: --minN must be a positive number",
         };
     }
@@ -228,6 +231,7 @@ function parseArgs(argv) {
             maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
                 ? maxCostDeltaUsd
                 : undefined,
+            dryRun: dryRun || undefined,
         },
     };
 }
@@ -240,7 +244,7 @@ async function runCheck(args) {
         else {
             console.error(`EvalGate gate ERROR: API returned ${qualityResult.status} — ${qualityResult.body}`);
         }
-        return constants_1.EXIT.API_ERROR;
+        return constants_2.EXIT.API_ERROR;
     }
     const { data: quality, requestId } = qualityResult;
     const evaluationRunId = quality?.evaluationRunId;
@@ -336,6 +340,10 @@ async function runCheck(args) {
             }
         }
     }
+    if (args.dryRun) {
+        console.error(`\n[dry-run] Gate would have exited with code ${gateResult.exitCode}`);
+        return constants_2.EXIT.PASS;
+    }
     return gateResult.exitCode;
 }
 // Main entry point
@@ -350,6 +358,6 @@ if (isDirectRun) {
         .then((code) => process.exit(code))
         .catch((err) => {
         console.error(`EvalGate gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
-        process.exit(constants_1.EXIT.API_ERROR);
+        process.exit(constants_2.EXIT.API_ERROR);
     });
 }

package/dist/cli/compare.d.ts ADDED Viewed

@@ -0,0 +1,80 @@
+/**
+ * evalgate compare — Side-by-side result file comparison
+ *
+ * Compares two or more saved run result JSON files. Does NOT re-run anything.
+ * You run each model/config separately (evalgate run --write-results),
+ * then compare the saved artifacts. Shows wins/losses/ties per spec.
+ *
+ * Usage:
+ *   evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
+ *   evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
+ *   evalgate compare --runs run-a.json run-b.json run-c.json
+ */
+export interface CompareOptions {
+    /** Paths to run result files to compare (runCompare requires at least two) */
+    runs: string[];
+    /**
+     * Human-readable labels for each run (e.g., model names).
+     * Used as given when there is exactly one label per run; otherwise each
+     * missing label falls back to the run's runId, then to "Run N".
+     */
+    labels?: string[];
+    /** Output format used by runCompareCLI: "json" prints JSON, anything else prints the human table */
+    format?: "human" | "json";
+    /** Sort key: "score" (best score, descending), "duration" (longest duration, descending), or "name" (also used when unset) */
+    sortBy?: "name" | "score" | "duration";
+}
+/**
+ * Per-spec comparison row
+ *
+ * `name` and `filePath` are taken from the first run that contains the spec.
+ * `results` holds one entry per compared run, in the same order as the labels;
+ * a run that does not contain the spec gets status "missing" and duration 0.
+ */
+export interface CompareRow {
+    specId: string;
+    name: string;
+    filePath: string;
+    results: Array<{
+        label: string;
+        status: "passed" | "failed" | "skipped" | "missing";
+        score?: number;
+        duration: number;
+        error?: string;
+    }>;
+    /**
+     * Label of the run that "won" this spec, or null if tied or undecidable.
+     * Decided by highest score when at least two runs have a score for the
+     * spec, otherwise by status (passed > failed > skipped).
+     */
+    winner: string | null;
+}
+/**
+ * Overall comparison result
+ *
+ * `labels`, `runIds` and `summary.aggregates` are all in the order in which
+ * the run files were passed in.
+ */
+export interface CompareResult {
+    schemaVersion: 1;
+    labels: string[];
+    runIds: string[];
+    specs: CompareRow[];
+    summary: {
+        /** Wins per label (every label is present, starting at 0) */
+        wins: Record<string, number>;
+        /** Number of specs that at least two runs contain but that have no single winner */
+        ties: number;
+        /** Per-label aggregates, one entry per run */
+        aggregates: Array<{
+            label: string;
+            runId: string;
+            passed: number;
+            failed: number;
+            avgScore: number; // mean of the defined scores, rounded to 3 decimals; 0 when no result has a score
+            avgDuration: number; // mean duration rounded to an integer; 0 when the run has no results
+            totalDuration: number;
+        }>;
+    };
+}
+/**
+ * Run the comparison
+ *
+ * Loads every file listed in `options.runs` (relative paths are resolved
+ * against `projectRoot`, which defaults to the current working directory) and
+ * builds the per-spec rows, win/tie counts and per-run aggregates.
+ * Rejects when fewer than two run files are given or a file cannot be read or parsed.
+ */
+export declare function runCompare(options: CompareOptions, projectRoot?: string): Promise<CompareResult>;
+/**
+ * Print human-readable comparison (table, per-run summary and win counts) to stdout
+ */
+export declare function printHumanCompare(result: CompareResult): void;
+/**
+ * Print JSON comparison to stdout, indented with two spaces
+ */
+export declare function printJsonCompare(result: CompareResult): void;
+/**
+ * CLI entry point for compare
+ *
+ * Terminates the process: exit code 0 after printing, 1 if the comparison fails.
+ */
+export declare function runCompareCLI(options: CompareOptions): Promise<void>;

package/dist/cli/compare.js ADDED Viewed

@@ -0,0 +1,266 @@
+"use strict";
+/**
+ * evalgate compare — Side-by-side result file comparison
+ *
+ * Compares two or more saved run result JSON files. Does NOT re-run anything.
+ * You run each model/config separately (evalgate run --write-results),
+ * then compare the saved artifacts. Shows wins/losses/ties per spec.
+ *
+ * Usage:
+ *   evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
+ *   evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
+ *   evalgate compare --runs run-a.json run-b.json run-c.json
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.runCompare = runCompare;
+exports.printHumanCompare = printHumanCompare;
+exports.printJsonCompare = printJsonCompare;
+exports.runCompareCLI = runCompareCLI;
+const fs = __importStar(require("node:fs/promises"));
+const path = __importStar(require("node:path"));
+/**
+ * Load a run result from file
+ *
+ * Absolute paths are used as given; relative paths are joined onto
+ * `projectRoot`. The file is read as UTF-8 and parsed as JSON. The parsed
+ * value is returned without any shape validation.
+ *
+ * @param filePath Absolute or project-relative path to a run result JSON file.
+ * @param projectRoot Base directory for relative paths.
+ * @returns Promise of the parsed JSON content; rejects if the file cannot be
+ *   read or does not contain valid JSON.
+ */
+async function loadRunResult(filePath, projectRoot) {
+    const resolved = path.isAbsolute(filePath)
+        ? filePath
+        : path.join(projectRoot, filePath);
+    const content = await fs.readFile(resolved, "utf-8");
+    return JSON.parse(content);
+}
+/**
+ * Run the comparison
+ *
+ * Nothing is re-executed: each run file is read and parsed, every spec ID seen
+ * in any run becomes one row, and a winner is chosen per row. When at least
+ * two runs carry a score for the spec, the single highest score wins and equal
+ * top scores count as a tie. Otherwise the runs that contain the spec are
+ * ranked by status (passed, then failed, then skipped) and the row only gets a
+ * winner when the best status is strictly better than the runner-up. Rows
+ * that fewer than two runs contain get no winner and are not counted as ties.
+ *
+ * Win counters are keyed by label, so two runs that resolve to the same label
+ * share one counter.
+ *
+ * Each parsed file is accessed as an object with `runId` and a `results`
+ * array whose items have `specId`, `name`, `filePath` and a `result` object
+ * (`status`, `score`, `duration`, `error`); the shape is not validated here.
+ *
+ * @param options Run file paths plus optional labels and sort key.
+ * @param projectRoot Directory that relative run paths are resolved against
+ *   (defaults to the current working directory).
+ * @returns The per-spec rows, win/tie counts and per-run aggregates.
+ * @throws Error when fewer than two run files are given; failures while
+ *   reading or parsing a run file also reject the returned promise.
+ */
+async function runCompare(options, projectRoot = process.cwd()) {
+    if (options.runs.length < 2) {
+        throw new Error("At least 2 run files are required for comparison.");
+    }
+    // Load all runs one at a time, keeping the order given in options.runs
+    const runs = [];
+    for (const runPath of options.runs) {
+        runs.push(await loadRunResult(runPath, projectRoot));
+    }
+    // Generate labels: caller labels as given when there is one per run, otherwise per index the label, then the runId, then "Run N"
+    const labels = options.labels?.length === runs.length
+        ? options.labels
+        : runs.map((r, i) => options.labels?.[i] ?? r.runId ?? `Run ${i + 1}`);
+    // Collect all unique spec IDs across all runs (name and file path come from the first run that contains the spec)
+    const allSpecIds = new Map();
+    for (const run of runs) {
+        for (const spec of run.results) {
+            if (!allSpecIds.has(spec.specId)) {
+                allSpecIds.set(spec.specId, {
+                    name: spec.name,
+                    filePath: spec.filePath,
+                });
+            }
+        }
+    }
+    // Build comparison rows; every label starts with zero wins
+    const specs = [];
+    const wins = {};
+    let ties = 0;
+    for (const label of labels)
+        wins[label] = 0;
+    for (const [specId, meta] of allSpecIds) {
+        const results = runs.map((run, i) => {
+            const spec = run.results.find((r) => r.specId === specId);
+            if (!spec) {
+                return {
+                    label: labels[i],
+                    status: "missing",
+                    score: undefined,
+                    duration: 0,
+                };
+            }
+            return {
+                label: labels[i],
+                status: spec.result.status,
+                score: spec.result.score,
+                duration: spec.result.duration,
+                error: spec.result.error,
+            };
+        });
+        // Determine winner by score (higher is better) when at least two runs have a score, otherwise by status
+        const scoredResults = results.filter((r) => r.score !== undefined && r.status !== "missing");
+        let winner = null;
+        if (scoredResults.length >= 2) {
+            const maxScore = Math.max(...scoredResults.map((r) => r.score ?? 0));
+            const topScorers = scoredResults.filter((r) => r.score === maxScore);
+            if (topScorers.length === 1) {
+                winner = topScorers[0].label;
+                wins[winner]++;
+            }
+            else {
+                ties++;
+            }
+        }
+        else {
+            // Compare by status: passed > failed > skipped > missing (unknown statuses rank as 0 when sorting)
+            const statusRank = { passed: 3, failed: 1, skipped: 0, missing: -1 };
+            const ranked = results
+                .filter((r) => r.status !== "missing")
+                .sort((a, b) => (statusRank[b.status] ?? 0) - (statusRank[a.status] ?? 0));
+            if (ranked.length >= 2 &&
+                statusRank[ranked[0].status] > statusRank[ranked[1].status]) {
+                winner = ranked[0].label;
+                wins[winner]++;
+            }
+            else if (ranked.length >= 2) {
+                ties++;
+            }
+        }
+        specs.push({
+            specId,
+            name: meta.name,
+            filePath: meta.filePath,
+            results,
+            winner,
+        });
+    }
+    // Sort rows: by best score across runs (descending, missing scores count as 0), by longest duration (descending), or by name (default)
+    if (options.sortBy === "score") {
+        specs.sort((a, b) => {
+            const aMax = Math.max(...a.results.map((r) => r.score ?? 0));
+            const bMax = Math.max(...b.results.map((r) => r.score ?? 0));
+            return bMax - aMax;
+        });
+    }
+    else if (options.sortBy === "duration") {
+        specs.sort((a, b) => {
+            const aMax = Math.max(...a.results.map((r) => r.duration));
+            const bMax = Math.max(...b.results.map((r) => r.duration));
+            return bMax - aMax;
+        });
+    }
+    else {
+        specs.sort((a, b) => a.name.localeCompare(b.name));
+    }
+    // Build per-run aggregates; averages fall back to 0 when there is nothing to average
+    const aggregates = runs.map((run, i) => {
+        const passed = run.results.filter((r) => r.result.status === "passed").length;
+        const failed = run.results.filter((r) => r.result.status === "failed").length;
+        const scores = run.results
+            .filter((r) => r.result.score !== undefined)
+            .map((r) => r.result.score);
+        const durations = run.results.map((r) => r.result.duration);
+        return {
+            label: labels[i],
+            runId: run.runId,
+            passed,
+            failed,
+            avgScore: scores.length > 0
+                ? Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 1000) / 1000
+                : 0,
+            avgDuration: durations.length > 0
+                ? Math.round(durations.reduce((a, b) => a + b, 0) / durations.length)
+                : 0,
+            totalDuration: durations.reduce((a, b) => a + b, 0),
+        };
+    });
+    return {
+        schemaVersion: 1,
+        labels,
+        runIds: runs.map((r) => r.runId),
+        specs,
+        summary: { wins, ties, aggregates },
+    };
+}
+/**
+ * Print human-readable comparison
+ *
+ * Writes to stdout via console.log: a header row, one line per spec, a
+ * per-run summary and the win counts. The "Ties" line is only printed when
+ * there is at least one tie.
+ *
+ * Scores are multiplied by 100 and shown as percentages, so this presumably
+ * expects scores as fractions between 0 and 1 (confirm against the run file
+ * format). Spec names longer than 28 characters are cut to 25 characters plus
+ * "...". A row whose winner is null is shown as "tie"; that includes rows for
+ * which no winner could be determined at all.
+ *
+ * @param result Comparison result as returned by runCompare.
+ */
+function printHumanCompare(result) {
+    console.log("\n🔄 Run Comparison\n");
+    // Header: spec column is 30 wide, each run column 16 wide
+    const labelHeader = result.labels.map((l) => l.padEnd(16)).join("  ");
+    console.log(`  ${"Spec".padEnd(30)}  ${labelHeader}  Winner`);
+    console.log(`  ${"─".repeat(30)}  ${result.labels.map(() => "─".repeat(16)).join("  ")}  ${"─".repeat(12)}`);
+    // Rows: one cell per run showing status icon, score and duration
+    for (const spec of result.specs) {
+        const name = spec.name.length > 28 ? `${spec.name.substring(0, 25)}...` : spec.name;
+        const cells = spec.results.map((r) => {
+            const icon = r.status === "passed"
+                ? "✅"
+                : r.status === "failed"
+                    ? "❌"
+                    : r.status === "skipped"
+                        ? "⏭️"
+                        : "➖";
+            const score = r.score !== undefined ? `${(r.score * 100).toFixed(0)}%` : "";
+            const dur = r.duration > 0 ? `${r.duration}ms` : "";
+            return `${icon} ${score} ${dur}`.padEnd(16);
+        });
+        const winner = spec.winner ?? "tie";
+        console.log(`  ${name.padEnd(30)}  ${cells.join("  ")}  ${winner}`);
+    }
+    // Summary: per-run pass/fail counts, average score and average latency
+    console.log("\n📊 Summary:");
+    for (const agg of result.summary.aggregates) {
+        console.log(`  ${agg.label}: ${agg.passed} passed, ${agg.failed} failed, avg score: ${(agg.avgScore * 100).toFixed(1)}%, avg latency: ${agg.avgDuration}ms`);
+    }
+    console.log("\n🏆 Wins:");
+    for (const [label, count] of Object.entries(result.summary.wins)) {
+        console.log(`  ${label}: ${count} wins`);
+    }
+    if (result.summary.ties > 0) {
+        console.log(`  Ties: ${result.summary.ties}`);
+    }
+}
+/**
+ * Print JSON comparison
+ *
+ * Serializes the whole result with two-space indentation and writes it to
+ * stdout via console.log.
+ *
+ * @param result Comparison result as returned by runCompare.
+ */
+function printJsonCompare(result) {
+    console.log(JSON.stringify(result, null, 2));
+}
+/**
+ * CLI entry point for compare
+ *
+ * Runs the comparison with the default project root (the current working
+ * directory), prints it as JSON when `options.format` is "json" and as the
+ * human-readable table otherwise, then terminates the process: exit code 0 on
+ * success, 1 when the comparison throws (the error message goes to stderr).
+ * Because both paths call process.exit, control does not return to the caller.
+ *
+ * @param options Run file paths plus optional labels, format and sort key.
+ */
+async function runCompareCLI(options) {
+    try {
+        const result = await runCompare(options);
+        if (options.format === "json") {
+            printJsonCompare(result);
+        }
+        else {
+            printHumanCompare(result);
+        }
+        process.exit(0);
+    }
+    catch (error) {
+        console.error("❌ Compare failed:", error instanceof Error ? error.message : String(error));
+        process.exit(1);
+    }
+}