@evalgate/sdk 2.2.2 → 2.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +40 -1
- package/dist/assertions.d.ts +194 -10
- package/dist/assertions.js +525 -73
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +5 -1
- package/dist/cache.js +5 -1
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/upgrade.js +5 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/errors.js +7 -0
- package/dist/export.js +2 -2
- package/dist/index.d.ts +10 -9
- package/dist/index.js +24 -7
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +84 -61
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +8 -3
- package/dist/runtime/registry.js +15 -4
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/snapshot.d.ts +12 -0
- package/dist/snapshot.js +24 -1
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +8 -1
package/dist/batch.js
CHANGED
|
@@ -163,15 +163,15 @@ function canBatch(method, endpoint) {
|
|
|
163
163
|
*/
|
|
164
164
|
async function batchProcess(items, processor, concurrency = 5) {
|
|
165
165
|
const results = [];
|
|
166
|
-
const executing =
|
|
166
|
+
const executing = new Set();
|
|
167
167
|
for (const item of items) {
|
|
168
168
|
const promise = processor(item).then((result) => {
|
|
169
169
|
results.push(result);
|
|
170
170
|
});
|
|
171
|
-
executing.
|
|
172
|
-
|
|
171
|
+
const tracked = promise.finally(() => executing.delete(tracked));
|
|
172
|
+
executing.add(tracked);
|
|
173
|
+
if (executing.size >= concurrency) {
|
|
173
174
|
await Promise.race(executing);
|
|
174
|
-
executing.splice(executing.indexOf(promise), 1);
|
|
175
175
|
}
|
|
176
176
|
}
|
|
177
177
|
await Promise.all(executing);
|
package/dist/cache.d.ts
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
* Simple in-memory cache with TTL for SDK requests
|
|
3
3
|
* Reduces redundant API calls and improves performance
|
|
4
4
|
*/
|
|
5
|
+
/**
|
|
6
|
+
* @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
|
|
7
|
+
* Use {@link CacheTTL} to configure cache durations via client options.
|
|
8
|
+
*/
|
|
5
9
|
export declare class RequestCache {
|
|
6
10
|
private cache;
|
|
7
11
|
private maxSize;
|
|
@@ -21,7 +25,7 @@ export declare class RequestCache {
|
|
|
21
25
|
/**
|
|
22
26
|
* Store response in cache
|
|
23
27
|
*/
|
|
24
|
-
set<T>(method: string, url: string, data: T, ttl
|
|
28
|
+
set<T>(method: string, url: string, data: T, ttl?: number, params?: unknown): void;
|
|
25
29
|
/**
|
|
26
30
|
* Invalidate specific cache entry
|
|
27
31
|
*/
|
package/dist/cache.js
CHANGED
|
@@ -7,6 +7,10 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
7
7
|
exports.CacheTTL = exports.RequestCache = void 0;
|
|
8
8
|
exports.shouldCache = shouldCache;
|
|
9
9
|
exports.getTTL = getTTL;
|
|
10
|
+
/**
|
|
11
|
+
* @internal — HTTP request cache used by AIEvalClient. Not part of the public API.
|
|
12
|
+
* Use {@link CacheTTL} to configure cache durations via client options.
|
|
13
|
+
*/
|
|
10
14
|
class RequestCache {
|
|
11
15
|
constructor(maxSize = 1000) {
|
|
12
16
|
this.cache = new Map();
|
|
@@ -43,7 +47,7 @@ class RequestCache {
|
|
|
43
47
|
/**
|
|
44
48
|
* Store response in cache
|
|
45
49
|
*/
|
|
46
|
-
set(method, url, data, ttl, params) {
|
|
50
|
+
set(method, url, data, ttl = exports.CacheTTL.MEDIUM, params) {
|
|
47
51
|
// Enforce cache size limit (LRU-style)
|
|
48
52
|
if (this.cache.size >= this.maxSize) {
|
|
49
53
|
const firstKey = this.cache.keys().next().value;
|
package/dist/cli/baseline.d.ts
CHANGED
|
@@ -5,6 +5,20 @@
|
|
|
5
5
|
* evalgate baseline init — Create a starter evals/baseline.json
|
|
6
6
|
* evalgate baseline update — Run tests + update baseline with real scores
|
|
7
7
|
*/
|
|
8
|
+
/**
|
|
9
|
+
* Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
|
|
10
|
+
* This detects accidental corruption or manual tampering between runs.
|
|
11
|
+
*/
|
|
12
|
+
export declare function computeBaselineChecksum(data: Record<string, unknown>): string;
|
|
13
|
+
/**
|
|
14
|
+
* Verify the checksum stored in a baseline file matches its content.
|
|
15
|
+
* Returns { valid: true } if checksum matches or is absent (legacy files).
|
|
16
|
+
* Returns { valid: false, reason } if checksum is present but doesn't match.
|
|
17
|
+
*/
|
|
18
|
+
export declare function verifyBaselineChecksum(data: Record<string, unknown>): {
|
|
19
|
+
valid: boolean;
|
|
20
|
+
reason?: string;
|
|
21
|
+
};
|
|
8
22
|
export declare function runBaselineInit(cwd: string): number;
|
|
9
23
|
export declare function runBaselineUpdate(cwd: string): number;
|
|
10
24
|
export declare function runBaseline(argv: string[]): number;
|
package/dist/cli/baseline.js
CHANGED
|
@@ -40,12 +40,45 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
40
40
|
};
|
|
41
41
|
})();
|
|
42
42
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
43
|
+
exports.computeBaselineChecksum = computeBaselineChecksum;
|
|
44
|
+
exports.verifyBaselineChecksum = verifyBaselineChecksum;
|
|
43
45
|
exports.runBaselineInit = runBaselineInit;
|
|
44
46
|
exports.runBaselineUpdate = runBaselineUpdate;
|
|
45
47
|
exports.runBaseline = runBaseline;
|
|
46
48
|
const node_child_process_1 = require("node:child_process");
|
|
49
|
+
const crypto = __importStar(require("node:crypto"));
|
|
47
50
|
const fs = __importStar(require("node:fs"));
|
|
48
51
|
const path = __importStar(require("node:path"));
|
|
52
|
+
/**
 * Compute a SHA-256 checksum of the baseline data (excluding the _checksum field).
 * This detects accidental corruption or manual tampering between runs.
 *
 * Object keys are sorted at every nesting level before hashing, so the digest
 * does not depend on property order but still covers nested values.
 *
 * NOTE(review): for baselines that contain nested objects, this digest differs
 * from one produced by hashing with a top-level key allowlist; such files need
 * to be re-stamped once. Digests of flat objects are unchanged.
 *
 * @param data Parsed baseline object. A `_checksum` property, if present, is ignored.
 * @returns Lowercase hex SHA-256 digest of the canonical JSON serialization.
 */
function computeBaselineChecksum(data) {
    const copy = { ...data };
    delete copy._checksum;
    // Use a replacer *function*, not a key array: an array replacer is an
    // allowlist applied at every depth, so nested properties whose names are
    // not also top-level keys would be silently dropped from the hash input.
    const content = JSON.stringify(copy, (_key, value) => {
        if (value === null || typeof value !== "object" || Array.isArray(value)) {
            return value;
        }
        // Rebuild plain objects with their keys in sorted order.
        return Object.fromEntries(Object.keys(value)
            .sort()
            .map((key) => [key, value[key]]));
    });
    return crypto.createHash("sha256").update(content).digest("hex");
}
|
|
62
|
+
/**
|
|
63
|
+
* Verify the checksum stored in a baseline file matches its content.
|
|
64
|
+
* Returns { valid: true } if checksum matches or is absent (legacy files).
|
|
65
|
+
* Returns { valid: false, reason } if checksum is present but doesn't match.
|
|
66
|
+
*/
|
|
67
|
+
function verifyBaselineChecksum(data) {
|
|
68
|
+
const stored = data._checksum;
|
|
69
|
+
if (typeof stored !== "string") {
|
|
70
|
+
// Legacy baseline without checksum ā allow but warn
|
|
71
|
+
return { valid: true, reason: "no_checksum" };
|
|
72
|
+
}
|
|
73
|
+
const computed = computeBaselineChecksum(data);
|
|
74
|
+
if (computed !== stored) {
|
|
75
|
+
return {
|
|
76
|
+
valid: false,
|
|
77
|
+
reason: `Checksum mismatch: expected ${stored.slice(0, 12)}ā¦, got ${computed.slice(0, 12)}ā¦. Baseline may be corrupted or tampered with.`,
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
return { valid: true };
|
|
81
|
+
}
|
|
49
82
|
const BASELINE_REL = "evals/baseline.json";
|
|
50
83
|
/** Detect the package manager used in the project */
|
|
51
84
|
function detectPackageManager(cwd) {
|
|
@@ -116,8 +149,13 @@ function runBaselineInit(cwd) {
|
|
|
116
149
|
},
|
|
117
150
|
productMetrics: {},
|
|
118
151
|
};
|
|
119
|
-
|
|
120
|
-
|
|
152
|
+
// Stamp checksum
|
|
153
|
+
const withChecksum = {
|
|
154
|
+
...baseline,
|
|
155
|
+
_checksum: computeBaselineChecksum(baseline),
|
|
156
|
+
};
|
|
157
|
+
fs.writeFileSync(baselinePath, `${JSON.stringify(withChecksum, null, 2)}\n`);
|
|
158
|
+
console.log(`ā
Created ${BASELINE_REL} with sample values (checksum stamped)\n`);
|
|
121
159
|
console.log("Next steps:");
|
|
122
160
|
console.log(` 1. Commit ${BASELINE_REL} to your repo`);
|
|
123
161
|
console.log(" 2. Run 'evalgate baseline update' to populate with real scores");
|
|
@@ -164,8 +202,10 @@ function runBaselineUpdate(cwd) {
|
|
|
164
202
|
baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
|
|
165
203
|
baseline.confidenceTests = baseline.confidenceTests ?? {};
|
|
166
204
|
baseline.confidenceTests.unitPassed = testResult.status === 0;
|
|
205
|
+
// Re-stamp checksum
|
|
206
|
+
baseline._checksum = computeBaselineChecksum(baseline);
|
|
167
207
|
fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
|
|
168
|
-
console.log("\nā
Baseline updated successfully");
|
|
208
|
+
console.log("\nā
Baseline updated successfully (checksum stamped)");
|
|
169
209
|
}
|
|
170
210
|
catch {
|
|
171
211
|
console.error("ā Failed to update baseline file");
|
package/dist/cli/check.d.ts
CHANGED
|
@@ -16,12 +16,13 @@
|
|
|
16
16
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
17
17
|
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
18
18
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
19
|
-
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or
|
|
19
|
+
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
|
|
20
20
|
* --apiKey <key> API key (default: EVALGATE_API_KEY env var)
|
|
21
21
|
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
22
22
|
* fail = create public share link only when gate fails (CI-friendly)
|
|
23
23
|
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
24
24
|
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
25
|
+
* --dry-run Run all checks and print results, but always exit 0
|
|
25
26
|
*
|
|
26
27
|
* Exit codes:
|
|
27
28
|
* 0 — Gate passed
|
|
@@ -35,7 +36,7 @@
|
|
|
35
36
|
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
36
37
|
*
|
|
37
38
|
* Environment:
|
|
38
|
-
* EVALGATE_BASE_URL ā API base URL (default:
|
|
39
|
+
* EVALGATE_BASE_URL — API base URL (default: https://api.evalgate.com)
|
|
39
40
|
* EVALGATE_API_KEY — API key for authentication
|
|
40
41
|
*/
|
|
41
42
|
export { EXIT } from "./constants";
|
|
@@ -60,6 +61,8 @@ export interface CheckArgs {
|
|
|
60
61
|
maxCostUsd?: number;
|
|
61
62
|
maxLatencyMs?: number;
|
|
62
63
|
maxCostDeltaUsd?: number;
|
|
64
|
+
/** When true, run all checks and print results but always exit 0. */
|
|
65
|
+
dryRun?: boolean;
|
|
63
66
|
}
|
|
64
67
|
export type ParseArgsResult = {
|
|
65
68
|
ok: true;
|
package/dist/cli/check.js
CHANGED
|
@@ -17,12 +17,13 @@
|
|
|
17
17
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
18
18
|
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
19
19
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
20
|
-
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or
|
|
20
|
+
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or https://api.evalgate.com)
|
|
21
21
|
* --apiKey <key> API key (default: EVALGATE_API_KEY env var)
|
|
22
22
|
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
23
23
|
* fail = create public share link only when gate fails (CI-friendly)
|
|
24
24
|
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
25
25
|
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
26
|
+
* --dry-run Run all checks and print results, but always exit 0
|
|
26
27
|
*
|
|
27
28
|
* Exit codes:
|
|
28
29
|
* 0 — Gate passed
|
|
@@ -36,7 +37,7 @@
|
|
|
36
37
|
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
37
38
|
*
|
|
38
39
|
* Environment:
|
|
39
|
-
* EVALGATE_BASE_URL ā API base URL (default:
|
|
40
|
+
* EVALGATE_BASE_URL — API base URL (default: https://api.evalgate.com)
|
|
40
41
|
* EVALGATE_API_KEY — API key for authentication
|
|
41
42
|
*/
|
|
42
43
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
@@ -78,18 +79,19 @@ exports.parseArgs = parseArgs;
|
|
|
78
79
|
exports.runCheck = runCheck;
|
|
79
80
|
const fs = __importStar(require("node:fs"));
|
|
80
81
|
const path = __importStar(require("node:path"));
|
|
82
|
+
const constants_1 = require("../constants");
|
|
81
83
|
const api_1 = require("./api");
|
|
82
84
|
const ci_context_1 = require("./ci-context");
|
|
83
85
|
const config_1 = require("./config");
|
|
84
|
-
const
|
|
86
|
+
const constants_2 = require("./constants");
|
|
85
87
|
const github_1 = require("./formatters/github");
|
|
86
88
|
const human_1 = require("./formatters/human");
|
|
87
89
|
const json_1 = require("./formatters/json");
|
|
88
90
|
const pr_comment_1 = require("./formatters/pr-comment");
|
|
89
91
|
const gate_1 = require("./gate");
|
|
90
92
|
const build_check_report_1 = require("./report/build-check-report");
|
|
91
|
-
var
|
|
92
|
-
Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return
|
|
93
|
+
var constants_3 = require("./constants");
|
|
94
|
+
Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_3.EXIT; } });
|
|
93
95
|
function parseArgs(argv) {
|
|
94
96
|
const args = {};
|
|
95
97
|
for (let i = 0; i < argv.length; i++) {
|
|
@@ -106,7 +108,7 @@ function parseArgs(argv) {
|
|
|
106
108
|
}
|
|
107
109
|
}
|
|
108
110
|
}
|
|
109
|
-
let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL ||
|
|
111
|
+
let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || constants_1.DEFAULT_BASE_URL;
|
|
110
112
|
const apiKey = args.apiKey ||
|
|
111
113
|
process.env.EVALGATE_API_KEY ||
|
|
112
114
|
process.env.EVALAI_API_KEY ||
|
|
@@ -122,6 +124,7 @@ function parseArgs(argv) {
|
|
|
122
124
|
const format = formatRaw === "json" ? "json" : formatRaw === "github" ? "github" : "human";
|
|
123
125
|
const explain = args.explain === "true" || args.explain === "1";
|
|
124
126
|
const onFail = args.onFail === "import" ? "import" : undefined;
|
|
127
|
+
const dryRun = args["dry-run"] === "true" || args.dryRun === "true";
|
|
125
128
|
const shareRaw = args.share || "never";
|
|
126
129
|
const share = shareRaw === "always" ? "always" : shareRaw === "fail" ? "fail" : "never";
|
|
127
130
|
const prCommentOut = args["pr-comment-out"] || args.prCommentOut || undefined;
|
|
@@ -176,28 +179,28 @@ function parseArgs(argv) {
|
|
|
176
179
|
if (!apiKey) {
|
|
177
180
|
return {
|
|
178
181
|
ok: false,
|
|
179
|
-
exitCode:
|
|
182
|
+
exitCode: constants_2.EXIT.BAD_ARGS,
|
|
180
183
|
message: "Error: --apiKey or EVALGATE_API_KEY is required",
|
|
181
184
|
};
|
|
182
185
|
}
|
|
183
186
|
if (!evaluationId) {
|
|
184
187
|
return {
|
|
185
188
|
ok: false,
|
|
186
|
-
exitCode:
|
|
189
|
+
exitCode: constants_2.EXIT.BAD_ARGS,
|
|
187
190
|
message: "Run npx evalgate init and paste your evaluationId, or pass --evaluationId.",
|
|
188
191
|
};
|
|
189
192
|
}
|
|
190
193
|
if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
|
|
191
194
|
return {
|
|
192
195
|
ok: false,
|
|
193
|
-
exitCode:
|
|
196
|
+
exitCode: constants_2.EXIT.BAD_ARGS,
|
|
194
197
|
message: "Error: --minScore must be 0-100",
|
|
195
198
|
};
|
|
196
199
|
}
|
|
197
200
|
if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
|
|
198
201
|
return {
|
|
199
202
|
ok: false,
|
|
200
|
-
exitCode:
|
|
203
|
+
exitCode: constants_2.EXIT.BAD_ARGS,
|
|
201
204
|
message: "Error: --minN must be a positive number",
|
|
202
205
|
};
|
|
203
206
|
}
|
|
@@ -228,6 +231,7 @@ function parseArgs(argv) {
|
|
|
228
231
|
maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
|
|
229
232
|
? maxCostDeltaUsd
|
|
230
233
|
: undefined,
|
|
234
|
+
dryRun: dryRun || undefined,
|
|
231
235
|
},
|
|
232
236
|
};
|
|
233
237
|
}
|
|
@@ -240,7 +244,7 @@ async function runCheck(args) {
|
|
|
240
244
|
else {
|
|
241
245
|
console.error(`EvalGate gate ERROR: API returned ${qualityResult.status} ā ${qualityResult.body}`);
|
|
242
246
|
}
|
|
243
|
-
return
|
|
247
|
+
return constants_2.EXIT.API_ERROR;
|
|
244
248
|
}
|
|
245
249
|
const { data: quality, requestId } = qualityResult;
|
|
246
250
|
const evaluationRunId = quality?.evaluationRunId;
|
|
@@ -336,6 +340,10 @@ async function runCheck(args) {
|
|
|
336
340
|
}
|
|
337
341
|
}
|
|
338
342
|
}
|
|
343
|
+
if (args.dryRun) {
|
|
344
|
+
console.error(`\n[dry-run] Gate would have exited with code ${gateResult.exitCode}`);
|
|
345
|
+
return constants_2.EXIT.PASS;
|
|
346
|
+
}
|
|
339
347
|
return gateResult.exitCode;
|
|
340
348
|
}
|
|
341
349
|
// Main entry point
|
|
@@ -350,6 +358,6 @@ if (isDirectRun) {
|
|
|
350
358
|
.then((code) => process.exit(code))
|
|
351
359
|
.catch((err) => {
|
|
352
360
|
console.error(`EvalGate gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
353
|
-
process.exit(
|
|
361
|
+
process.exit(constants_2.EXIT.API_ERROR);
|
|
354
362
|
});
|
|
355
363
|
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalgate compare — Side-by-side result file comparison
|
|
3
|
+
*
|
|
4
|
+
* Compares two or more saved run result JSON files. Does NOT re-run anything.
|
|
5
|
+
* You run each model/config separately (evalgate run --write-results),
|
|
6
|
+
* then compare the saved artifacts. Shows wins/losses/ties per spec.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
|
|
10
|
+
* evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
|
|
11
|
+
* evalgate compare --runs run-a.json run-b.json run-c.json
|
|
12
|
+
*/
|
|
13
|
+
export interface CompareOptions {
|
|
14
|
+
/** Paths to run result files to compare */
|
|
15
|
+
runs: string[];
|
|
16
|
+
/** Human-readable labels for each run (e.g., model names) */
|
|
17
|
+
labels?: string[];
|
|
18
|
+
/** Output format */
|
|
19
|
+
format?: "human" | "json";
|
|
20
|
+
/** Sort by: name (default), score, or duration */
|
|
21
|
+
sortBy?: "name" | "score" | "duration";
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Per-spec comparison row
|
|
25
|
+
*/
|
|
26
|
+
export interface CompareRow {
|
|
27
|
+
specId: string;
|
|
28
|
+
name: string;
|
|
29
|
+
filePath: string;
|
|
30
|
+
results: Array<{
|
|
31
|
+
label: string;
|
|
32
|
+
status: "passed" | "failed" | "skipped" | "missing";
|
|
33
|
+
score?: number;
|
|
34
|
+
duration: number;
|
|
35
|
+
error?: string;
|
|
36
|
+
}>;
|
|
37
|
+
/** Which run "won" (highest score), or null if tied */
|
|
38
|
+
winner: string | null;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Overall comparison result
|
|
42
|
+
*/
|
|
43
|
+
export interface CompareResult {
|
|
44
|
+
schemaVersion: 1;
|
|
45
|
+
labels: string[];
|
|
46
|
+
runIds: string[];
|
|
47
|
+
specs: CompareRow[];
|
|
48
|
+
summary: {
|
|
49
|
+
/** Wins per label */
|
|
50
|
+
wins: Record<string, number>;
|
|
51
|
+
/** Ties count */
|
|
52
|
+
ties: number;
|
|
53
|
+
/** Per-label aggregates */
|
|
54
|
+
aggregates: Array<{
|
|
55
|
+
label: string;
|
|
56
|
+
runId: string;
|
|
57
|
+
passed: number;
|
|
58
|
+
failed: number;
|
|
59
|
+
avgScore: number;
|
|
60
|
+
avgDuration: number;
|
|
61
|
+
totalDuration: number;
|
|
62
|
+
}>;
|
|
63
|
+
};
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Run the comparison
|
|
67
|
+
*/
|
|
68
|
+
export declare function runCompare(options: CompareOptions, projectRoot?: string): Promise<CompareResult>;
|
|
69
|
+
/**
|
|
70
|
+
* Print human-readable comparison
|
|
71
|
+
*/
|
|
72
|
+
export declare function printHumanCompare(result: CompareResult): void;
|
|
73
|
+
/**
|
|
74
|
+
* Print JSON comparison
|
|
75
|
+
*/
|
|
76
|
+
export declare function printJsonCompare(result: CompareResult): void;
|
|
77
|
+
/**
|
|
78
|
+
* CLI entry point for compare
|
|
79
|
+
*/
|
|
80
|
+
export declare function runCompareCLI(options: CompareOptions): Promise<void>;
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate compare — Side-by-side result file comparison
|
|
4
|
+
*
|
|
5
|
+
* Compares two or more saved run result JSON files. Does NOT re-run anything.
|
|
6
|
+
* You run each model/config separately (evalgate run --write-results),
|
|
7
|
+
* then compare the saved artifacts. Shows wins/losses/ties per spec.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json
|
|
11
|
+
* evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"
|
|
12
|
+
* evalgate compare --runs run-a.json run-b.json run-c.json
|
|
13
|
+
*/
|
|
14
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
15
|
+
if (k2 === undefined) k2 = k;
|
|
16
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
17
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
18
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
19
|
+
}
|
|
20
|
+
Object.defineProperty(o, k2, desc);
|
|
21
|
+
}) : (function(o, m, k, k2) {
|
|
22
|
+
if (k2 === undefined) k2 = k;
|
|
23
|
+
o[k2] = m[k];
|
|
24
|
+
}));
|
|
25
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
26
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
27
|
+
}) : function(o, v) {
|
|
28
|
+
o["default"] = v;
|
|
29
|
+
});
|
|
30
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
31
|
+
var ownKeys = function(o) {
|
|
32
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
33
|
+
var ar = [];
|
|
34
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
35
|
+
return ar;
|
|
36
|
+
};
|
|
37
|
+
return ownKeys(o);
|
|
38
|
+
};
|
|
39
|
+
return function (mod) {
|
|
40
|
+
if (mod && mod.__esModule) return mod;
|
|
41
|
+
var result = {};
|
|
42
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
43
|
+
__setModuleDefault(result, mod);
|
|
44
|
+
return result;
|
|
45
|
+
};
|
|
46
|
+
})();
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.runCompare = runCompare;
|
|
49
|
+
exports.printHumanCompare = printHumanCompare;
|
|
50
|
+
exports.printJsonCompare = printJsonCompare;
|
|
51
|
+
exports.runCompareCLI = runCompareCLI;
|
|
52
|
+
const fs = __importStar(require("node:fs/promises"));
|
|
53
|
+
const path = __importStar(require("node:path"));
|
|
54
|
+
/**
|
|
55
|
+
* Load a run result from file
|
|
56
|
+
*/
|
|
57
|
+
async function loadRunResult(filePath, projectRoot) {
|
|
58
|
+
const resolved = path.isAbsolute(filePath)
|
|
59
|
+
? filePath
|
|
60
|
+
: path.join(projectRoot, filePath);
|
|
61
|
+
const content = await fs.readFile(resolved, "utf-8");
|
|
62
|
+
return JSON.parse(content);
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Run the comparison
|
|
66
|
+
*/
|
|
67
|
+
async function runCompare(options, projectRoot = process.cwd()) {
|
|
68
|
+
if (options.runs.length < 2) {
|
|
69
|
+
throw new Error("At least 2 run files are required for comparison.");
|
|
70
|
+
}
|
|
71
|
+
// Load all runs
|
|
72
|
+
const runs = [];
|
|
73
|
+
for (const runPath of options.runs) {
|
|
74
|
+
runs.push(await loadRunResult(runPath, projectRoot));
|
|
75
|
+
}
|
|
76
|
+
// Generate labels
|
|
77
|
+
const labels = options.labels?.length === runs.length
|
|
78
|
+
? options.labels
|
|
79
|
+
: runs.map((r, i) => options.labels?.[i] ?? r.runId ?? `Run ${i + 1}`);
|
|
80
|
+
// Collect all unique spec IDs across all runs
|
|
81
|
+
const allSpecIds = new Map();
|
|
82
|
+
for (const run of runs) {
|
|
83
|
+
for (const spec of run.results) {
|
|
84
|
+
if (!allSpecIds.has(spec.specId)) {
|
|
85
|
+
allSpecIds.set(spec.specId, {
|
|
86
|
+
name: spec.name,
|
|
87
|
+
filePath: spec.filePath,
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
// Build comparison rows
|
|
93
|
+
const specs = [];
|
|
94
|
+
const wins = {};
|
|
95
|
+
let ties = 0;
|
|
96
|
+
for (const label of labels)
|
|
97
|
+
wins[label] = 0;
|
|
98
|
+
for (const [specId, meta] of allSpecIds) {
|
|
99
|
+
const results = runs.map((run, i) => {
|
|
100
|
+
const spec = run.results.find((r) => r.specId === specId);
|
|
101
|
+
if (!spec) {
|
|
102
|
+
return {
|
|
103
|
+
label: labels[i],
|
|
104
|
+
status: "missing",
|
|
105
|
+
score: undefined,
|
|
106
|
+
duration: 0,
|
|
107
|
+
};
|
|
108
|
+
}
|
|
109
|
+
return {
|
|
110
|
+
label: labels[i],
|
|
111
|
+
status: spec.result.status,
|
|
112
|
+
score: spec.result.score,
|
|
113
|
+
duration: spec.result.duration,
|
|
114
|
+
error: spec.result.error,
|
|
115
|
+
};
|
|
116
|
+
});
|
|
117
|
+
// Determine winner by score (higher is better), then by status
|
|
118
|
+
const scoredResults = results.filter((r) => r.score !== undefined && r.status !== "missing");
|
|
119
|
+
let winner = null;
|
|
120
|
+
if (scoredResults.length >= 2) {
|
|
121
|
+
const maxScore = Math.max(...scoredResults.map((r) => r.score ?? 0));
|
|
122
|
+
const topScorers = scoredResults.filter((r) => r.score === maxScore);
|
|
123
|
+
if (topScorers.length === 1) {
|
|
124
|
+
winner = topScorers[0].label;
|
|
125
|
+
wins[winner]++;
|
|
126
|
+
}
|
|
127
|
+
else {
|
|
128
|
+
ties++;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
else {
|
|
132
|
+
// Compare by status: passed > failed > skipped > missing
|
|
133
|
+
const statusRank = { passed: 3, failed: 1, skipped: 0, missing: -1 };
|
|
134
|
+
const ranked = results
|
|
135
|
+
.filter((r) => r.status !== "missing")
|
|
136
|
+
.sort((a, b) => (statusRank[b.status] ?? 0) - (statusRank[a.status] ?? 0));
|
|
137
|
+
if (ranked.length >= 2 &&
|
|
138
|
+
statusRank[ranked[0].status] > statusRank[ranked[1].status]) {
|
|
139
|
+
winner = ranked[0].label;
|
|
140
|
+
wins[winner]++;
|
|
141
|
+
}
|
|
142
|
+
else if (ranked.length >= 2) {
|
|
143
|
+
ties++;
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
specs.push({
|
|
147
|
+
specId,
|
|
148
|
+
name: meta.name,
|
|
149
|
+
filePath: meta.filePath,
|
|
150
|
+
results,
|
|
151
|
+
winner,
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
// Sort
|
|
155
|
+
if (options.sortBy === "score") {
|
|
156
|
+
specs.sort((a, b) => {
|
|
157
|
+
const aMax = Math.max(...a.results.map((r) => r.score ?? 0));
|
|
158
|
+
const bMax = Math.max(...b.results.map((r) => r.score ?? 0));
|
|
159
|
+
return bMax - aMax;
|
|
160
|
+
});
|
|
161
|
+
}
|
|
162
|
+
else if (options.sortBy === "duration") {
|
|
163
|
+
specs.sort((a, b) => {
|
|
164
|
+
const aMax = Math.max(...a.results.map((r) => r.duration));
|
|
165
|
+
const bMax = Math.max(...b.results.map((r) => r.duration));
|
|
166
|
+
return bMax - aMax;
|
|
167
|
+
});
|
|
168
|
+
}
|
|
169
|
+
else {
|
|
170
|
+
specs.sort((a, b) => a.name.localeCompare(b.name));
|
|
171
|
+
}
|
|
172
|
+
// Build aggregates
|
|
173
|
+
const aggregates = runs.map((run, i) => {
|
|
174
|
+
const passed = run.results.filter((r) => r.result.status === "passed").length;
|
|
175
|
+
const failed = run.results.filter((r) => r.result.status === "failed").length;
|
|
176
|
+
const scores = run.results
|
|
177
|
+
.filter((r) => r.result.score !== undefined)
|
|
178
|
+
.map((r) => r.result.score);
|
|
179
|
+
const durations = run.results.map((r) => r.result.duration);
|
|
180
|
+
return {
|
|
181
|
+
label: labels[i],
|
|
182
|
+
runId: run.runId,
|
|
183
|
+
passed,
|
|
184
|
+
failed,
|
|
185
|
+
avgScore: scores.length > 0
|
|
186
|
+
? Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 1000) / 1000
|
|
187
|
+
: 0,
|
|
188
|
+
avgDuration: durations.length > 0
|
|
189
|
+
? Math.round(durations.reduce((a, b) => a + b, 0) / durations.length)
|
|
190
|
+
: 0,
|
|
191
|
+
totalDuration: durations.reduce((a, b) => a + b, 0),
|
|
192
|
+
};
|
|
193
|
+
});
|
|
194
|
+
return {
|
|
195
|
+
schemaVersion: 1,
|
|
196
|
+
labels,
|
|
197
|
+
runIds: runs.map((r) => r.runId),
|
|
198
|
+
specs,
|
|
199
|
+
summary: { wins, ties, aggregates },
|
|
200
|
+
};
|
|
201
|
+
}
|
|
202
|
+
/**
|
|
203
|
+
* Print human-readable comparison
|
|
204
|
+
*/
|
|
205
|
+
function printHumanCompare(result) {
|
|
206
|
+
console.log("\nš Run Comparison\n");
|
|
207
|
+
// Header
|
|
208
|
+
const labelHeader = result.labels.map((l) => l.padEnd(16)).join(" ");
|
|
209
|
+
console.log(` ${"Spec".padEnd(30)} ${labelHeader} Winner`);
|
|
210
|
+
console.log(` ${"ā".repeat(30)} ${result.labels.map(() => "ā".repeat(16)).join(" ")} ${"ā".repeat(12)}`);
|
|
211
|
+
// Rows
|
|
212
|
+
for (const spec of result.specs) {
|
|
213
|
+
const name = spec.name.length > 28 ? `${spec.name.substring(0, 25)}...` : spec.name;
|
|
214
|
+
const cells = spec.results.map((r) => {
|
|
215
|
+
const icon = r.status === "passed"
|
|
216
|
+
? "ā
"
|
|
217
|
+
: r.status === "failed"
|
|
218
|
+
? "ā"
|
|
219
|
+
: r.status === "skipped"
|
|
220
|
+
? "āļø"
|
|
221
|
+
: "ā";
|
|
222
|
+
const score = r.score !== undefined ? `${(r.score * 100).toFixed(0)}%` : "";
|
|
223
|
+
const dur = r.duration > 0 ? `${r.duration}ms` : "";
|
|
224
|
+
return `${icon} ${score} ${dur}`.padEnd(16);
|
|
225
|
+
});
|
|
226
|
+
const winner = spec.winner ?? "tie";
|
|
227
|
+
console.log(` ${name.padEnd(30)} ${cells.join(" ")} ${winner}`);
|
|
228
|
+
}
|
|
229
|
+
// Summary
|
|
230
|
+
console.log("\nš Summary:");
|
|
231
|
+
for (const agg of result.summary.aggregates) {
|
|
232
|
+
console.log(` ${agg.label}: ${agg.passed} passed, ${agg.failed} failed, avg score: ${(agg.avgScore * 100).toFixed(1)}%, avg latency: ${agg.avgDuration}ms`);
|
|
233
|
+
}
|
|
234
|
+
console.log("\nš Wins:");
|
|
235
|
+
for (const [label, count] of Object.entries(result.summary.wins)) {
|
|
236
|
+
console.log(` ${label}: ${count} wins`);
|
|
237
|
+
}
|
|
238
|
+
if (result.summary.ties > 0) {
|
|
239
|
+
console.log(` Ties: ${result.summary.ties}`);
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
/**
 * Print JSON comparison.
 *
 * Writes the comparison result to stdout as JSON indented by two spaces.
 *
 * @param result Comparison result to serialize.
 */
function printJsonCompare(result) {
    const serialized = JSON.stringify(result, null, 2);
    console.log(serialized);
}
|
|
248
|
+
/**
|
|
249
|
+
* CLI entry point for compare
|
|
250
|
+
*/
|
|
251
|
+
async function runCompareCLI(options) {
|
|
252
|
+
try {
|
|
253
|
+
const result = await runCompare(options);
|
|
254
|
+
if (options.format === "json") {
|
|
255
|
+
printJsonCompare(result);
|
|
256
|
+
}
|
|
257
|
+
else {
|
|
258
|
+
printHumanCompare(result);
|
|
259
|
+
}
|
|
260
|
+
process.exit(0);
|
|
261
|
+
}
|
|
262
|
+
catch (error) {
|
|
263
|
+
console.error("ā Compare failed:", error instanceof Error ? error.message : String(error));
|
|
264
|
+
process.exit(1);
|
|
265
|
+
}
|
|
266
|
+
}
|