@evalgate/sdk 2.2.3 → 2.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/CHANGELOG.md +5 -0
  2. package/README.md +38 -1
  3. package/dist/assertions.d.ts +185 -5
  4. package/dist/assertions.js +496 -61
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +4 -0
  7. package/dist/cache.js +4 -0
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/validate.d.ts +37 -0
  24. package/dist/cli/validate.js +252 -0
  25. package/dist/cli/watch.d.ts +19 -0
  26. package/dist/cli/watch.js +175 -0
  27. package/dist/client.js +6 -13
  28. package/dist/constants.d.ts +2 -0
  29. package/dist/constants.js +5 -0
  30. package/dist/index.d.ts +7 -6
  31. package/dist/index.js +22 -6
  32. package/dist/integrations/openai.js +83 -60
  33. package/dist/logger.d.ts +3 -1
  34. package/dist/logger.js +2 -1
  35. package/dist/otel.d.ts +130 -0
  36. package/dist/otel.js +309 -0
  37. package/dist/runtime/eval.d.ts +14 -4
  38. package/dist/runtime/eval.js +127 -2
  39. package/dist/runtime/registry.d.ts +4 -2
  40. package/dist/runtime/registry.js +11 -3
  41. package/dist/runtime/run-report.d.ts +1 -1
  42. package/dist/runtime/run-report.js +7 -4
  43. package/dist/runtime/types.d.ts +38 -0
  44. package/dist/testing.d.ts +8 -0
  45. package/dist/testing.js +45 -10
  46. package/dist/version.d.ts +1 -1
  47. package/dist/version.js +1 -1
  48. package/dist/workflows.d.ts +2 -0
  49. package/dist/workflows.js +184 -102
  50. package/package.json +8 -1
package/dist/batch.js CHANGED
@@ -163,15 +163,15 @@ function canBatch(method, endpoint) {
163
163
  */
164
164
  async function batchProcess(items, processor, concurrency = 5) {
165
165
  const results = [];
166
- const executing = [];
166
+ const executing = new Set();
167
167
  for (const item of items) {
168
168
  const promise = processor(item).then((result) => {
169
169
  results.push(result);
170
170
  });
171
- executing.push(promise);
172
- if (executing.length >= concurrency) {
171
+ const tracked = promise.finally(() => executing.delete(tracked));
172
+ executing.add(tracked);
173
+ if (executing.size >= concurrency) {
173
174
  await Promise.race(executing);
174
- executing.splice(executing.indexOf(promise), 1);
175
175
  }
176
176
  }
177
177
  await Promise.all(executing);
package/dist/cache.d.ts CHANGED
@@ -2,6 +2,10 @@
2
2
  * Simple in-memory cache with TTL for SDK requests
3
3
  * Reduces redundant API calls and improves performance
4
4
  */
5
+ /**
6
+ * @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
7
+ * Use {@link CacheTTL} to configure cache durations via client options.
8
+ */
5
9
  export declare class RequestCache {
6
10
  private cache;
7
11
  private maxSize;
package/dist/cache.js CHANGED
@@ -7,6 +7,10 @@ Object.defineProperty(exports, "__esModule", { value: true });
7
7
  exports.CacheTTL = exports.RequestCache = void 0;
8
8
  exports.shouldCache = shouldCache;
9
9
  exports.getTTL = getTTL;
10
+ /**
11
+ * @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
12
+ * Use {@link CacheTTL} to configure cache durations via client options.
13
+ */
10
14
  class RequestCache {
11
15
  constructor(maxSize = 1000) {
12
16
  this.cache = new Map();
@@ -5,6 +5,20 @@
5
5
  * evalgate baseline init — Create a starter evals/baseline.json
6
6
  * evalgate baseline update — Run tests + update baseline with real scores
7
7
  */
8
+ /**
9
+ * Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
10
+ * This detects accidental corruption or manual tampering between runs.
11
+ */
12
+ export declare function computeBaselineChecksum(data: Record<string, unknown>): string;
13
+ /**
14
+ * Verify the checksum stored in a baseline file matches its content.
15
+ * Returns { valid: true } if checksum matches or is absent (legacy files).
16
+ * Returns { valid: false, reason } if checksum is present but doesn't match.
17
+ */
18
+ export declare function verifyBaselineChecksum(data: Record<string, unknown>): {
19
+ valid: boolean;
20
+ reason?: string;
21
+ };
8
22
  export declare function runBaselineInit(cwd: string): number;
9
23
  export declare function runBaselineUpdate(cwd: string): number;
10
24
  export declare function runBaseline(argv: string[]): number;
@@ -40,12 +40,45 @@ var __importStar = (this && this.__importStar) || (function () {
40
40
  };
41
41
  })();
42
42
  Object.defineProperty(exports, "__esModule", { value: true });
43
+ exports.computeBaselineChecksum = computeBaselineChecksum;
44
+ exports.verifyBaselineChecksum = verifyBaselineChecksum;
43
45
  exports.runBaselineInit = runBaselineInit;
44
46
  exports.runBaselineUpdate = runBaselineUpdate;
45
47
  exports.runBaseline = runBaseline;
46
48
  const node_child_process_1 = require("node:child_process");
49
+ const crypto = __importStar(require("node:crypto"));
47
50
  const fs = __importStar(require("node:fs"));
48
51
  const path = __importStar(require("node:path"));
52
+ /**
53
+ * Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
54
+ * This detects accidental corruption or manual tampering between runs.
55
+ */
56
+ function computeBaselineChecksum(data) {
57
+ const copy = { ...data };
58
+ delete copy._checksum;
59
+ const content = JSON.stringify(copy, Object.keys(copy).sort());
60
+ return crypto.createHash("sha256").update(content).digest("hex");
61
+ }
62
+ /**
63
+ * Verify the checksum stored in a baseline file matches its content.
64
+ * Returns { valid: true } if checksum matches or is absent (legacy files).
65
+ * Returns { valid: false, reason } if checksum is present but doesn't match.
66
+ */
67
+ function verifyBaselineChecksum(data) {
68
+ const stored = data._checksum;
69
+ if (typeof stored !== "string") {
70
+ // Legacy baseline without checksum — allow but warn
71
+ return { valid: true, reason: "no_checksum" };
72
+ }
73
+ const computed = computeBaselineChecksum(data);
74
+ if (computed !== stored) {
75
+ return {
76
+ valid: false,
77
+ reason: `Checksum mismatch: expected ${stored.slice(0, 12)}…, got ${computed.slice(0, 12)}…. Baseline may be corrupted or tampered with.`,
78
+ };
79
+ }
80
+ return { valid: true };
81
+ }
49
82
  const BASELINE_REL = "evals/baseline.json";
50
83
  /** Detect the package manager used in the project */
51
84
  function detectPackageManager(cwd) {
@@ -116,8 +149,13 @@ function runBaselineInit(cwd) {
116
149
  },
117
150
  productMetrics: {},
118
151
  };
119
- fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
120
- console.log(`✅ Created ${BASELINE_REL} with sample values\n`);
152
+ // Stamp checksum
153
+ const withChecksum = {
154
+ ...baseline,
155
+ _checksum: computeBaselineChecksum(baseline),
156
+ };
157
+ fs.writeFileSync(baselinePath, `${JSON.stringify(withChecksum, null, 2)}\n`);
158
+ console.log(`✅ Created ${BASELINE_REL} with sample values (checksum stamped)\n`);
121
159
  console.log("Next steps:");
122
160
  console.log(` 1. Commit ${BASELINE_REL} to your repo`);
123
161
  console.log(" 2. Run 'evalgate baseline update' to populate with real scores");
@@ -164,8 +202,10 @@ function runBaselineUpdate(cwd) {
164
202
  baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
165
203
  baseline.confidenceTests = baseline.confidenceTests ?? {};
166
204
  baseline.confidenceTests.unitPassed = testResult.status === 0;
205
+ // Re-stamp checksum
206
+ baseline._checksum = computeBaselineChecksum(baseline);
167
207
  fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
168
- console.log("\n✅ Baseline updated successfully");
208
+ console.log("\n✅ Baseline updated successfully (checksum stamped)");
169
209
  }
170
210
  catch {
171
211
  console.error("❌ Failed to update baseline file");
@@ -16,12 +16,13 @@
16
16
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
17
17
  * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
18
18
  * --evaluationId <id> Required. The evaluation to gate on.
19
- * --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
19
+ * --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
20
20
  * --apiKey <key> API key (default: EVALGATE_API_KEY env var)
21
21
  * --share <mode> Share link: "always" | "fail" | "never" (default: never)
22
22
  * fail = create public share link only when gate fails (CI-friendly)
23
23
  * --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
24
24
  * --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
25
+ * --dry-run Run all checks and print results, but always exit 0
25
26
  *
26
27
  * Exit codes:
27
28
  * 0 — Gate passed
@@ -35,7 +36,7 @@
35
36
  * 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
36
37
  *
37
38
  * Environment:
38
- * EVALGATE_BASE_URL — API base URL (default: http://localhost:3000)
39
+ * EVALGATE_BASE_URL — API base URL (default: https://api.evalgate.com)
39
40
  * EVALGATE_API_KEY — API key for authentication
40
41
  */
41
42
  export { EXIT } from "./constants";
@@ -60,6 +61,8 @@ export interface CheckArgs {
60
61
  maxCostUsd?: number;
61
62
  maxLatencyMs?: number;
62
63
  maxCostDeltaUsd?: number;
64
+ /** When true, run all checks and print results but always exit 0. */
65
+ dryRun?: boolean;
63
66
  }
64
67
  export type ParseArgsResult = {
65
68
  ok: true;
package/dist/cli/check.js CHANGED
@@ -17,12 +17,13 @@
17
17
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
18
18
  * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
19
19
  * --evaluationId <id> Required. The evaluation to gate on.
20
- * --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
20
+ * --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
21
21
  * --apiKey <key> API key (default: EVALGATE_API_KEY env var)
22
22
  * --share <mode> Share link: "always" | "fail" | "never" (default: never)
23
23
  * fail = create public share link only when gate fails (CI-friendly)
24
24
  * --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
25
25
  * --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
26
+ * --dry-run Run all checks and print results, but always exit 0
26
27
  *
27
28
  * Exit codes:
28
29
  * 0 — Gate passed
@@ -36,7 +37,7 @@
36
37
  * 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
37
38
  *
38
39
  * Environment:
39
- * EVALGATE_BASE_URL — API base URL (default: http://localhost:3000)
40
+ * EVALGATE_BASE_URL — API base URL (default: https://api.evalgate.com)
40
41
  * EVALGATE_API_KEY — API key for authentication
41
42
  */
42
43
  var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
@@ -78,18 +79,19 @@ exports.parseArgs = parseArgs;
78
79
  exports.runCheck = runCheck;
79
80
  const fs = __importStar(require("node:fs"));
80
81
  const path = __importStar(require("node:path"));
82
+ const constants_1 = require("../constants");
81
83
  const api_1 = require("./api");
82
84
  const ci_context_1 = require("./ci-context");
83
85
  const config_1 = require("./config");
84
- const constants_1 = require("./constants");
86
+ const constants_2 = require("./constants");
85
87
  const github_1 = require("./formatters/github");
86
88
  const human_1 = require("./formatters/human");
87
89
  const json_1 = require("./formatters/json");
88
90
  const pr_comment_1 = require("./formatters/pr-comment");
89
91
  const gate_1 = require("./gate");
90
92
  const build_check_report_1 = require("./report/build-check-report");
91
- var constants_2 = require("./constants");
92
- Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_2.EXIT; } });
93
+ var constants_3 = require("./constants");
94
+ Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_3.EXIT; } });
93
95
  function parseArgs(argv) {
94
96
  const args = {};
95
97
  for (let i = 0; i < argv.length; i++) {
@@ -106,7 +108,7 @@ function parseArgs(argv) {
106
108
  }
107
109
  }
108
110
  }
109
- let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || "http://localhost:3000";
111
+ let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || constants_1.DEFAULT_BASE_URL;
110
112
  const apiKey = args.apiKey ||
111
113
  process.env.EVALGATE_API_KEY ||
112
114
  process.env.EVALAI_API_KEY ||
@@ -122,6 +124,7 @@ function parseArgs(argv) {
122
124
  const format = formatRaw === "json" ? "json" : formatRaw === "github" ? "github" : "human";
123
125
  const explain = args.explain === "true" || args.explain === "1";
124
126
  const onFail = args.onFail === "import" ? "import" : undefined;
127
+ const dryRun = args["dry-run"] === "true" || args.dryRun === "true";
125
128
  const shareRaw = args.share || "never";
126
129
  const share = shareRaw === "always" ? "always" : shareRaw === "fail" ? "fail" : "never";
127
130
  const prCommentOut = args["pr-comment-out"] || args.prCommentOut || undefined;
@@ -176,28 +179,28 @@ function parseArgs(argv) {
176
179
  if (!apiKey) {
177
180
  return {
178
181
  ok: false,
179
- exitCode: constants_1.EXIT.BAD_ARGS,
182
+ exitCode: constants_2.EXIT.BAD_ARGS,
180
183
  message: "Error: --apiKey or EVALGATE_API_KEY is required",
181
184
  };
182
185
  }
183
186
  if (!evaluationId) {
184
187
  return {
185
188
  ok: false,
186
- exitCode: constants_1.EXIT.BAD_ARGS,
189
+ exitCode: constants_2.EXIT.BAD_ARGS,
187
190
  message: "Run npx evalgate init and paste your evaluationId, or pass --evaluationId.",
188
191
  };
189
192
  }
190
193
  if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
191
194
  return {
192
195
  ok: false,
193
- exitCode: constants_1.EXIT.BAD_ARGS,
196
+ exitCode: constants_2.EXIT.BAD_ARGS,
194
197
  message: "Error: --minScore must be 0-100",
195
198
  };
196
199
  }
197
200
  if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
198
201
  return {
199
202
  ok: false,
200
- exitCode: constants_1.EXIT.BAD_ARGS,
203
+ exitCode: constants_2.EXIT.BAD_ARGS,
201
204
  message: "Error: --minN must be a positive number",
202
205
  };
203
206
  }
@@ -228,6 +231,7 @@ function parseArgs(argv) {
228
231
  maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
229
232
  ? maxCostDeltaUsd
230
233
  : undefined,
234
+ dryRun: dryRun || undefined,
231
235
  },
232
236
  };
233
237
  }
@@ -240,7 +244,7 @@ async function runCheck(args) {
240
244
  else {
241
245
  console.error(`EvalGate gate ERROR: API returned ${qualityResult.status} — ${qualityResult.body}`);
242
246
  }
243
- return constants_1.EXIT.API_ERROR;
247
+ return constants_2.EXIT.API_ERROR;
244
248
  }
245
249
  const { data: quality, requestId } = qualityResult;
246
250
  const evaluationRunId = quality?.evaluationRunId;
@@ -336,6 +340,10 @@ async function runCheck(args) {
336
340
  }
337
341
  }
338
342
  }
343
+ if (args.dryRun) {
344
+ console.error(`\n[dry-run] Gate would have exited with code ${gateResult.exitCode}`);
345
+ return constants_2.EXIT.PASS;
346
+ }
339
347
  return gateResult.exitCode;
340
348
  }
341
349
  // Main entry point
@@ -350,6 +358,6 @@ if (isDirectRun) {
350
358
  .then((code) => process.exit(code))
351
359
  .catch((err) => {
352
360
  console.error(`EvalGate gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
353
- process.exit(constants_1.EXIT.API_ERROR);
361
+ process.exit(constants_2.EXIT.API_ERROR);
354
362
  });
355
363
  }
@@ -0,0 +1,80 @@
1
+ /**
2
+ * evalgate compare — Side-by-side result file comparison
3
+ *
4
+ * Compares two or more saved run result JSON files. Does NOT re-run anything.
5
+ * You run each model/config separately (evalgate run --write-results),
6
+ * then compare the saved artifacts. Shows wins/losses/ties per spec.
7
+ *
8
+ * Usage:
9
+ * evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
10
+ * evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
11
+ * evalgate compare --runs run-a.json run-b.json run-c.json
12
+ */
13
+ export interface CompareOptions {
14
+ /** Paths to run result files to compare */
15
+ runs: string[];
16
+ /** Human-readable labels for each run (e.g., model names) */
17
+ labels?: string[];
18
+ /** Output format */
19
+ format?: "human" | "json";
20
+ /** Sort by: name, score-delta, status */
21
+ sortBy?: "name" | "score" | "duration";
22
+ }
23
+ /**
24
+ * Per-spec comparison row
25
+ */
26
+ export interface CompareRow {
27
+ specId: string;
28
+ name: string;
29
+ filePath: string;
30
+ results: Array<{
31
+ label: string;
32
+ status: "passed" | "failed" | "skipped" | "missing";
33
+ score?: number;
34
+ duration: number;
35
+ error?: string;
36
+ }>;
37
+ /** Which run "won" (highest score), or null if tied */
38
+ winner: string | null;
39
+ }
40
+ /**
41
+ * Overall comparison result
42
+ */
43
+ export interface CompareResult {
44
+ schemaVersion: 1;
45
+ labels: string[];
46
+ runIds: string[];
47
+ specs: CompareRow[];
48
+ summary: {
49
+ /** Wins per label */
50
+ wins: Record<string, number>;
51
+ /** Ties count */
52
+ ties: number;
53
+ /** Per-label aggregates */
54
+ aggregates: Array<{
55
+ label: string;
56
+ runId: string;
57
+ passed: number;
58
+ failed: number;
59
+ avgScore: number;
60
+ avgDuration: number;
61
+ totalDuration: number;
62
+ }>;
63
+ };
64
+ }
65
+ /**
66
+ * Run the comparison
67
+ */
68
+ export declare function runCompare(options: CompareOptions, projectRoot?: string): Promise<CompareResult>;
69
+ /**
70
+ * Print human-readable comparison
71
+ */
72
+ export declare function printHumanCompare(result: CompareResult): void;
73
+ /**
74
+ * Print JSON comparison
75
+ */
76
+ export declare function printJsonCompare(result: CompareResult): void;
77
+ /**
78
+ * CLI entry point for compare
79
+ */
80
+ export declare function runCompareCLI(options: CompareOptions): Promise<void>;
@@ -0,0 +1,266 @@
1
+ "use strict";
2
+ /**
3
+ * evalgate compare — Side-by-side result file comparison
4
+ *
5
+ * Compares two or more saved run result JSON files. Does NOT re-run anything.
6
+ * You run each model/config separately (evalgate run --write-results),
7
+ * then compare the saved artifacts. Shows wins/losses/ties per spec.
8
+ *
9
+ * Usage:
10
+ * evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
11
+ * evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
12
+ * evalgate compare --runs run-a.json run-b.json run-c.json
13
+ */
14
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ var desc = Object.getOwnPropertyDescriptor(m, k);
17
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
18
+ desc = { enumerable: true, get: function() { return m[k]; } };
19
+ }
20
+ Object.defineProperty(o, k2, desc);
21
+ }) : (function(o, m, k, k2) {
22
+ if (k2 === undefined) k2 = k;
23
+ o[k2] = m[k];
24
+ }));
25
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
26
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
27
+ }) : function(o, v) {
28
+ o["default"] = v;
29
+ });
30
+ var __importStar = (this && this.__importStar) || (function () {
31
+ var ownKeys = function(o) {
32
+ ownKeys = Object.getOwnPropertyNames || function (o) {
33
+ var ar = [];
34
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
35
+ return ar;
36
+ };
37
+ return ownKeys(o);
38
+ };
39
+ return function (mod) {
40
+ if (mod && mod.__esModule) return mod;
41
+ var result = {};
42
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
43
+ __setModuleDefault(result, mod);
44
+ return result;
45
+ };
46
+ })();
47
+ Object.defineProperty(exports, "__esModule", { value: true });
48
+ exports.runCompare = runCompare;
49
+ exports.printHumanCompare = printHumanCompare;
50
+ exports.printJsonCompare = printJsonCompare;
51
+ exports.runCompareCLI = runCompareCLI;
52
+ const fs = __importStar(require("node:fs/promises"));
53
+ const path = __importStar(require("node:path"));
54
+ /**
55
+ * Load a run result from file
56
+ */
57
+ async function loadRunResult(filePath, projectRoot) {
58
+ const resolved = path.isAbsolute(filePath)
59
+ ? filePath
60
+ : path.join(projectRoot, filePath);
61
+ const content = await fs.readFile(resolved, "utf-8");
62
+ return JSON.parse(content);
63
+ }
64
+ /**
65
+ * Run the comparison
66
+ */
67
+ async function runCompare(options, projectRoot = process.cwd()) {
68
+ if (options.runs.length < 2) {
69
+ throw new Error("At least 2 run files are required for comparison.");
70
+ }
71
+ // Load all runs
72
+ const runs = [];
73
+ for (const runPath of options.runs) {
74
+ runs.push(await loadRunResult(runPath, projectRoot));
75
+ }
76
+ // Generate labels
77
+ const labels = options.labels?.length === runs.length
78
+ ? options.labels
79
+ : runs.map((r, i) => options.labels?.[i] ?? r.runId ?? `Run ${i + 1}`);
80
+ // Collect all unique spec IDs across all runs
81
+ const allSpecIds = new Map();
82
+ for (const run of runs) {
83
+ for (const spec of run.results) {
84
+ if (!allSpecIds.has(spec.specId)) {
85
+ allSpecIds.set(spec.specId, {
86
+ name: spec.name,
87
+ filePath: spec.filePath,
88
+ });
89
+ }
90
+ }
91
+ }
92
+ // Build comparison rows
93
+ const specs = [];
94
+ const wins = {};
95
+ let ties = 0;
96
+ for (const label of labels)
97
+ wins[label] = 0;
98
+ for (const [specId, meta] of allSpecIds) {
99
+ const results = runs.map((run, i) => {
100
+ const spec = run.results.find((r) => r.specId === specId);
101
+ if (!spec) {
102
+ return {
103
+ label: labels[i],
104
+ status: "missing",
105
+ score: undefined,
106
+ duration: 0,
107
+ };
108
+ }
109
+ return {
110
+ label: labels[i],
111
+ status: spec.result.status,
112
+ score: spec.result.score,
113
+ duration: spec.result.duration,
114
+ error: spec.result.error,
115
+ };
116
+ });
117
+ // Determine winner by score (higher is better), then by status
118
+ const scoredResults = results.filter((r) => r.score !== undefined && r.status !== "missing");
119
+ let winner = null;
120
+ if (scoredResults.length >= 2) {
121
+ const maxScore = Math.max(...scoredResults.map((r) => r.score ?? 0));
122
+ const topScorers = scoredResults.filter((r) => r.score === maxScore);
123
+ if (topScorers.length === 1) {
124
+ winner = topScorers[0].label;
125
+ wins[winner]++;
126
+ }
127
+ else {
128
+ ties++;
129
+ }
130
+ }
131
+ else {
132
+ // Compare by status: passed > failed > skipped > missing
133
+ const statusRank = { passed: 3, failed: 1, skipped: 0, missing: -1 };
134
+ const ranked = results
135
+ .filter((r) => r.status !== "missing")
136
+ .sort((a, b) => (statusRank[b.status] ?? 0) - (statusRank[a.status] ?? 0));
137
+ if (ranked.length >= 2 &&
138
+ statusRank[ranked[0].status] > statusRank[ranked[1].status]) {
139
+ winner = ranked[0].label;
140
+ wins[winner]++;
141
+ }
142
+ else if (ranked.length >= 2) {
143
+ ties++;
144
+ }
145
+ }
146
+ specs.push({
147
+ specId,
148
+ name: meta.name,
149
+ filePath: meta.filePath,
150
+ results,
151
+ winner,
152
+ });
153
+ }
154
+ // Sort
155
+ if (options.sortBy === "score") {
156
+ specs.sort((a, b) => {
157
+ const aMax = Math.max(...a.results.map((r) => r.score ?? 0));
158
+ const bMax = Math.max(...b.results.map((r) => r.score ?? 0));
159
+ return bMax - aMax;
160
+ });
161
+ }
162
+ else if (options.sortBy === "duration") {
163
+ specs.sort((a, b) => {
164
+ const aMax = Math.max(...a.results.map((r) => r.duration));
165
+ const bMax = Math.max(...b.results.map((r) => r.duration));
166
+ return bMax - aMax;
167
+ });
168
+ }
169
+ else {
170
+ specs.sort((a, b) => a.name.localeCompare(b.name));
171
+ }
172
+ // Build aggregates
173
+ const aggregates = runs.map((run, i) => {
174
+ const passed = run.results.filter((r) => r.result.status === "passed").length;
175
+ const failed = run.results.filter((r) => r.result.status === "failed").length;
176
+ const scores = run.results
177
+ .filter((r) => r.result.score !== undefined)
178
+ .map((r) => r.result.score);
179
+ const durations = run.results.map((r) => r.result.duration);
180
+ return {
181
+ label: labels[i],
182
+ runId: run.runId,
183
+ passed,
184
+ failed,
185
+ avgScore: scores.length > 0
186
+ ? Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 1000) / 1000
187
+ : 0,
188
+ avgDuration: durations.length > 0
189
+ ? Math.round(durations.reduce((a, b) => a + b, 0) / durations.length)
190
+ : 0,
191
+ totalDuration: durations.reduce((a, b) => a + b, 0),
192
+ };
193
+ });
194
+ return {
195
+ schemaVersion: 1,
196
+ labels,
197
+ runIds: runs.map((r) => r.runId),
198
+ specs,
199
+ summary: { wins, ties, aggregates },
200
+ };
201
+ }
202
+ /**
203
+ * Print human-readable comparison
204
+ */
205
+ function printHumanCompare(result) {
206
+ console.log("\n🔄 Run Comparison\n");
207
+ // Header
208
+ const labelHeader = result.labels.map((l) => l.padEnd(16)).join(" ");
209
+ console.log(` ${"Spec".padEnd(30)} ${labelHeader} Winner`);
210
+ console.log(` ${"─".repeat(30)} ${result.labels.map(() => "─".repeat(16)).join(" ")} ${"─".repeat(12)}`);
211
+ // Rows
212
+ for (const spec of result.specs) {
213
+ const name = spec.name.length > 28 ? `${spec.name.substring(0, 25)}...` : spec.name;
214
+ const cells = spec.results.map((r) => {
215
+ const icon = r.status === "passed"
216
+ ? "✅"
217
+ : r.status === "failed"
218
+ ? "❌"
219
+ : r.status === "skipped"
220
+ ? "⏭️"
221
+ : "➖";
222
+ const score = r.score !== undefined ? `${(r.score * 100).toFixed(0)}%` : "";
223
+ const dur = r.duration > 0 ? `${r.duration}ms` : "";
224
+ return `${icon} ${score} ${dur}`.padEnd(16);
225
+ });
226
+ const winner = spec.winner ?? "tie";
227
+ console.log(` ${name.padEnd(30)} ${cells.join(" ")} ${winner}`);
228
+ }
229
+ // Summary
230
+ console.log("\n📊 Summary:");
231
+ for (const agg of result.summary.aggregates) {
232
+ console.log(` ${agg.label}: ${agg.passed} passed, ${agg.failed} failed, avg score: ${(agg.avgScore * 100).toFixed(1)}%, avg latency: ${agg.avgDuration}ms`);
233
+ }
234
+ console.log("\n🏆 Wins:");
235
+ for (const [label, count] of Object.entries(result.summary.wins)) {
236
+ console.log(` ${label}: ${count} wins`);
237
+ }
238
+ if (result.summary.ties > 0) {
239
+ console.log(` Ties: ${result.summary.ties}`);
240
+ }
241
+ }
242
+ /**
243
+ * Print JSON comparison
244
+ */
245
+ function printJsonCompare(result) {
246
+ console.log(JSON.stringify(result, null, 2));
247
+ }
248
+ /**
249
+ * CLI entry point for compare
250
+ */
251
+ async function runCompareCLI(options) {
252
+ try {
253
+ const result = await runCompare(options);
254
+ if (options.format === "json") {
255
+ printJsonCompare(result);
256
+ }
257
+ else {
258
+ printHumanCompare(result);
259
+ }
260
+ process.exit(0);
261
+ }
262
+ catch (error) {
263
+ console.error("❌ Compare failed:", error instanceof Error ? error.message : String(error));
264
+ process.exit(1);
265
+ }
266
+ }