@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
package/dist/cli/diff.js
ADDED
|
@@ -0,0 +1,685 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* TICKET 5 — Behavioral Diff CLI (EVAL-401)
|
|
4
|
+
*
|
|
5
|
+
* Goal: "Git diff for AI behavior" from two RunReports
|
|
6
|
+
*
|
|
7
|
+
* Command:
|
|
8
|
+
* evalgate diff --base main (default uses git to find baseline run)
|
|
9
|
+
* evalgate diff --a <runReportPath> --b <runReportPath>
|
|
10
|
+
* evalgate diff main..feature (nice-to-have alias)
|
|
11
|
+
*/
|
|
12
|
+
// TypeScript-emitted interop helper: re-exposes property `key` of `source` on
// `target` (optionally under `alias`). An existing helper on `this` is reused
// when present.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportedAs = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Keep an accessor that comes from a real ES module; in every other
        // case where the property could change, install a getter so the
        // re-export stays live.
        var needsLiveGetter = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportedAs, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback when Object.create is unavailable: plain value copy.
        target[alias === undefined ? key : alias] = source[key];
    });
|
|
23
|
+
// TypeScript-emitted interop helper: stores `original` as the `default`
// export of `namespace`. An existing helper on `this` is reused when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, original) {
        var descriptor = { enumerable: true, value: original };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, original) {
        // Fallback when Object.create is unavailable: plain assignment.
        namespace["default"] = original;
    });
|
|
28
|
+
// TypeScript-emitted interop helper backing `import * as x from "..."`.
// ES modules (flagged with `__esModule`) are returned untouched; anything else
// is wrapped in a namespace object whose `default` is the original module.
var __importStar = (this && this.__importStar) || (function () {
    var resolvedOwnKeys;
    // Resolved on first use: native getOwnPropertyNames when available,
    // otherwise a for-in scan restricted to own properties.
    function ownKeysOf(obj) {
        if (!resolvedOwnKeys) {
            resolvedOwnKeys = Object.getOwnPropertyNames || function (candidate) {
                var names = [];
                for (var name in candidate) {
                    if (Object.prototype.hasOwnProperty.call(candidate, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return resolvedOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = ownKeysOf(mod);
            for (var i = 0; i < names.length; i++) {
                // "default" is set separately below and always points at `mod`.
                if (names[i] !== "default") {
                    __createBinding(namespace, mod, names[i]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
45
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
+
exports.diffCore = exports.SUPPORTED_SCHEMA_VERSIONS = exports.DIFF_SCHEMA_VERSION = void 0;
|
|
47
|
+
exports.round = round;
|
|
48
|
+
exports.roundPct = roundPct;
|
|
49
|
+
exports.validateSchemaVersion = validateSchemaVersion;
|
|
50
|
+
exports.runDiff = runDiff;
|
|
51
|
+
exports.compareReports = compareReports;
|
|
52
|
+
exports.calculateDiffSummary = calculateDiffSummary;
|
|
53
|
+
exports.printHumanResults = printHumanResults;
|
|
54
|
+
exports.printJsonResults = printJsonResults;
|
|
55
|
+
exports.writeGitHubStepSummary = writeGitHubStepSummary;
|
|
56
|
+
exports.runDiffCLI = runDiffCLI;
|
|
57
|
+
exports.classifyDiff = classifyDiff;
|
|
58
|
+
exports.calculateDeltas = calculateDeltas;
|
|
59
|
+
const node_child_process_1 = require("node:child_process");
|
|
60
|
+
const fs = __importStar(require("node:fs/promises"));
|
|
61
|
+
const path = __importStar(require("node:path"));
|
|
62
|
+
const env_1 = require("./env");
|
|
63
|
+
const workspace_1 = require("./workspace");
|
|
64
|
+
/**
|
|
65
|
+
* Diff schema version
|
|
66
|
+
*/
|
|
67
|
+
exports.DIFF_SCHEMA_VERSION = 1;
|
|
68
|
+
/**
|
|
69
|
+
* Supported RunReport schema versions
|
|
70
|
+
*/
|
|
71
|
+
exports.SUPPORTED_SCHEMA_VERSIONS = [1];
|
|
72
|
+
/**
 * Round a number to a fixed number of decimal places.
 *
 * @param {number} value - Number to round.
 * @param {number} [precision=4] - Decimal places to keep.
 * @returns {number} The rounded value.
 */
function round(value, precision = 4) {
    const scaled = Math.round(value * 10 ** precision);
    return scaled / 10 ** precision;
}
/**
 * Convert a fraction to a percentage and round it.
 *
 * @param {number} value - Fraction, e.g. 0.25 for 25%.
 * @param {number} [precision=1] - Decimal places to keep in the percentage.
 * @returns {number} The percentage as a number (no "%" sign).
 */
function roundPct(value, precision = 1) {
    const percentage = value * 100;
    return round(percentage, precision);
}
|
|
81
|
+
/**
 * Ensure a RunReport carries a schema version this module can diff.
 *
 * @param {object} report - Parsed RunReport.
 * @throws {Error} When `schemaVersion` is missing/falsy, or is not listed in
 *   `exports.SUPPORTED_SCHEMA_VERSIONS`.
 */
function validateSchemaVersion(report) {
    const version = report.schemaVersion;
    if (!version) {
        const missingMessage = [
            "RunReport missing schemaVersion. This report was generated by an older version of EvalGate.",
            "Please regenerate the run report or upgrade to a compatible version.",
        ].join("\n");
        throw new Error(missingMessage);
    }
    const supported = exports.SUPPORTED_SCHEMA_VERSIONS;
    if (supported.includes(version)) {
        return;
    }
    const unsupportedMessage = [
        `Unsupported RunReport schema version: ${version}`,
        `Supported versions: ${supported.join(", ")}`,
        "This report was generated by a newer version of EvalGate. Please upgrade your EvalGate CLI.",
    ].join("\n");
    throw new Error(unsupportedMessage);
}
|
|
95
|
+
/**
 * Report whether the CLI is running in CI, as decided by the `isCI` helper
 * from the `./env` module.
 *
 * @returns {*} Whatever `isCI()` returns.
 */
function isCIEnvironment() {
    // Call through a local so the helper is invoked without a `this` binding,
    // exactly like the compiled `(0, env_1.isCI)()` form.
    const detectCI = env_1.isCI;
    return detectCI();
}
|
|
101
|
+
/**
 * Build the multi-line guidance shown when the base run report is missing.
 *
 * The first line is also matched by substring in `runDiffCLI`, so it must stay
 * stable.
 *
 * @param {string} baseRef - Base ref requested by the user; interpolated into step 1.
 * @returns {string} Newline-joined instructions.
 */
function generateCIBaseErrorMessage(baseRef) {
    const headline = "🚫 Base run report not found in CI environment";
    const fixSteps = [
        "To fix this, download the base run artifact from your base branch workflow:",
        "",
        `1. Download run artifact from ${baseRef} branch workflow`,
        "2. Save it as .evalgate/base-run.json",
        "3. Re-run: evalgate diff --base .evalgate/base-run.json --head .evalgate/last-run.json",
    ];
    const artifactHints = [
        "Expected artifact patterns:",
        " - .evalgate/runs/run-*.json",
        " - .evalgate/last-run.json",
    ];
    const baselineHint = [
        "Or set a baseline pointer:",
        " evalgate diff --base <runId> --head last --set-baseline",
    ];
    // Sections are separated by a single blank line.
    return [headline, "", ...fixSteps, "", ...artifactHints, "", ...baselineHint].join("\n");
}
|
|
123
|
+
/**
 * Resolve the base and head run reports, validate them, and diff them.
 *
 * @param {object} options - Diff options; `base` and `head` are read here.
 * @returns {Promise<object>} The result of `compareReports(base, head)`.
 * @throws {Error} When either report cannot be resolved (with extended
 *   guidance when running in CI against a git ref) or fails schema validation.
 */
async function runDiff(options) {
    const { base, head } = options;
    // Both sides are resolved (base first) before either is checked.
    const baseReport = await resolveBaseReport(base);
    const headReport = await resolveHeadReport(head);
    if (!baseReport) {
        const missingInCI = isCIEnvironment() && base && (0, env_1.isGitRef)(base);
        const message = missingInCI
            ? generateCIBaseErrorMessage(base)
            : "Base run report not found. Use --base to specify a report or branch.";
        throw new Error(message);
    }
    if (!headReport) {
        throw new Error("Head run report not found. Use --head to specify a report path.");
    }
    for (const report of [baseReport, headReport]) {
        validateSchemaVersion(report);
    }
    return compareReports(baseReport, headReport);
}
|
|
147
|
+
/**
 * Resolve the `--base` argument to a run report.
 *
 * Resolution order:
 *   1. nothing given      -> last run for "main"
 *   2. "last"             -> previous run from the run index
 *   3. "baseline"         -> baseline run
 *   4. "<a>..<b>" range   -> last run for the left-hand side
 *   5. a git-resolvable ref -> last run for that ref
 *   6. "run-*" id         -> `.evalgate/runs/<id>.json`
 *   7. anything else      -> treated as a file path
 *
 * A value whose ".." is a parent-directory path segment ("../base.json",
 * "reports/../base.json") is NOT a branch range; it falls through to the
 * later steps so the file the user named is actually loaded.
 *
 * @param {string|undefined} base - Raw `--base` value.
 * @returns {Promise<object|null>} Whatever the selected resolver returns.
 */
async function resolveBaseReport(base) {
    if (!base) {
        // Default: try to find last run for main branch
        return await findLastRunForBranch("main");
    }
    if (base === "last") {
        return await findPreviousRun();
    }
    if (base === "baseline") {
        return await findBaselineRun();
    }
    if (isBranchRange(base)) {
        // Branch range like "main..feature" - the base side is everything
        // before the first "..".
        const baseBranch = base.split("..")[0];
        return await findLastRunForBranch(baseBranch);
    }
    if (await isBranchName(base)) {
        return await findLastRunForBranch(base);
    }
    if (base.startsWith("run-")) {
        return await loadRunReport(`.evalgate/runs/${base}.json`);
    }
    // Treat as file path
    return await loadRunReport(base);
}
/**
 * Tell a branch range ("main..feature") apart from a path containing a
 * parent-directory segment.
 *
 * @param {string} ref - Raw `--base` value.
 * @returns {boolean} True when `ref` contains ".." that is not a path segment.
 */
function isBranchRange(ref) {
    if (!ref.includes("..")) {
        return false;
    }
    // ".." is a path segment when it is delimited on both sides by a path
    // separator or by the start/end of the string.
    const parentDirSegment = /(^|[\\/])\.\.([\\/]|$)/;
    return !parentDirSegment.test(ref);
}
|
|
179
|
+
/**
 * Locate the baseline run report.
 *
 * Lookup order:
 *   1. The workspace baseline file (`workspace.baselinePath`), parsed as JSON
 *      and returned as-is.
 *   2. The last entry of the run index (`workspace.indexPath`), loaded from
 *      `.evalgate/runs/<runId>.json`. The code treats the last entry as the
 *      oldest run.
 *
 * A baseline file that exists but is not valid JSON is reported as an error
 * instead of silently falling back to the index: comparing against a
 * different baseline than the one the user configured would produce a
 * misleading diff.
 *
 * @returns {Promise<object|null>} The baseline report (may be `null` when the
 *   indexed run file cannot be loaded by `loadRunReport`).
 * @throws {Error} When the baseline file is corrupt, or when neither source
 *   yields a baseline.
 */
async function findBaselineRun() {
    const workspace = (0, workspace_1.resolveEvalWorkspace)();
    // 1. Explicit baseline file. Failing to read it (typically: it does not
    //    exist) is expected and falls through to the index.
    let baselineText = null;
    try {
        baselineText = await fs.readFile(workspace.baselinePath, "utf-8");
    }
    catch (_error) {
        baselineText = null;
    }
    if (baselineText !== null) {
        try {
            return JSON.parse(baselineText);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            throw new Error(`Baseline run report at ${workspace.baselinePath} is not valid JSON: ${reason}`, { cause: error });
        }
    }
    // 2. Run index. Kept best-effort: an unreadable or malformed index simply
    //    means no baseline can be derived from it.
    try {
        const content = await fs.readFile(workspace.indexPath, "utf-8");
        const index = JSON.parse(content);
        // No run is explicitly marked as baseline yet, so use the last index entry.
        if (index.length > 0) {
            const oldestRunId = index[index.length - 1].runId;
            return await loadRunReport(`.evalgate/runs/${oldestRunId}.json`);
        }
    }
    catch (_error) {
        // Index doesn't exist or is unusable
    }
    throw new Error("No baseline run found. Set a baseline with 'evalgate diff --base <runId> --head last --set-baseline' or create .evalgate/baseline-run.json.");
}
|
|
208
|
+
/**
 * Resolve the `--head` argument to a run report.
 *
 * No value and the literal "last" both load `.evalgate/last-run.json`; a
 * "run-*" id loads `.evalgate/runs/<id>.json`; anything else is used as a
 * file path.
 *
 * @param {string|undefined} head - Raw `--head` value.
 * @returns {Promise<object|null>} Result of `loadRunReport` for the chosen path.
 */
async function resolveHeadReport(head) {
    if (!head || head === "last") {
        return await loadRunReport(".evalgate/last-run.json");
    }
    const reportPath = head.startsWith("run-") ? `.evalgate/runs/${head}.json` : head;
    return await loadRunReport(reportPath);
}
|
|
226
|
+
/**
 * Load the second entry of the run index under the current working directory
 * (`.evalgate/runs/index.json`). The code treats entry 0 as the most recent
 * run, so entry 1 is the one before it.
 *
 * @returns {Promise<object|null>} Result of `loadRunReport` for that run.
 * @throws {Error} "Need at least 2 runs…" when the index has fewer than two
 *   entries; "No run history found…" for any other failure (missing or
 *   unparsable index, malformed entry).
 */
async function findPreviousRun() {
    const historyFile = path.join(process.cwd(), ".evalgate", "runs", "index.json");
    try {
        const history = JSON.parse(await fs.readFile(historyFile, "utf-8"));
        if (history.length < 2) {
            throw new Error("Need at least 2 runs to use 'last' shortcut. Run 'evalgate run --write-results' at least twice.");
        }
        const { runId } = history[1];
        return await loadRunReport(`.evalgate/runs/${runId}.json`);
    }
    catch (error) {
        // The "too few runs" error raised above passes through unchanged;
        // everything else is reported as missing history.
        const isTooFewRuns = error instanceof Error && error.message.includes("Need at least 2 runs");
        if (isTooFewRuns) {
            throw error;
        }
        throw new Error("No run history found. Run 'evalgate run --write-results' first.");
    }
}
|
|
249
|
+
/**
 * Check whether `name` resolves as a git revision in the current repository
 * (`git rev-parse --verify <name>` exits with status 0).
 *
 * Never rejects: if git cannot be started (for example it is not installed)
 * the answer is `false`, so callers fall through to treating the value as a
 * run id or file path instead of crashing on an unhandled 'error' event.
 *
 * @param {string} name - Candidate ref name.
 * @returns {Promise<boolean>} True when git resolves the name.
 */
async function isBranchName(name) {
    // A value starting with "-" would be parsed by git as an option, not a ref.
    if (typeof name !== "string" || name === "" || name.startsWith("-")) {
        return false;
    }
    return new Promise((resolve) => {
        const git = (0, node_child_process_1.spawn)("git", ["rev-parse", "--verify", name], {
            // Only the exit status matters; discard all output.
            stdio: "ignore",
        });
        // Emitted when the process could not be spawned at all.
        git.on("error", () => {
            resolve(false);
        });
        git.on("close", (code) => {
            resolve(code === 0);
        });
    });
}
|
|
262
|
+
/**
 * Find the most recent run report for a branch.
 *
 * Current behavior is a placeholder: the branch argument is ignored and
 * `.evalgate/last-run.json` under the current working directory is returned.
 * The original notes list the intended lookup as: CI artifacts for the
 * branch, then branch-specific run files, then this local file.
 *
 * @param {string} _branch - Branch name (currently unused).
 * @returns {Promise<object|null>} Parsed report, or `null` when the file
 *   cannot be read or parsed.
 */
async function findLastRunForBranch(_branch) {
    const localLastRun = path.join(process.cwd(), ".evalgate", "last-run.json");
    let report = null;
    try {
        const raw = await fs.readFile(localLastRun, "utf-8");
        report = JSON.parse(raw);
    }
    catch (_error) {
        report = null;
    }
    return report;
}
|
|
280
|
+
/**
 * Read and parse a run report from disk.
 *
 * @param {string} filePath - Report path; resolved with `path.resolve`.
 * @returns {Promise<object|null>} Parsed JSON, or `null` on any failure
 *   (bad path argument, unreadable file, invalid JSON).
 */
async function loadRunReport(filePath) {
    let report;
    try {
        // Path resolution stays inside the try so a non-string argument also yields null.
        const absolutePath = path.resolve(filePath);
        const raw = await fs.readFile(absolutePath, "utf-8");
        report = JSON.parse(raw);
    }
    catch (_error) {
        report = null;
    }
    return report;
}
|
|
292
|
+
/**
 * Diff two run reports spec by spec.
 *
 * Specs are matched on `specId` across both reports. Every spec for which
 * `analyzeSpecDiff` returns a value is kept, then ordered by severity rank
 * and, within a rank, by `specId`.
 *
 * @param {object} base - Base RunReport (`results` array is read here).
 * @param {object} head - Head RunReport (`results` array is read here).
 * @returns {object} Diff result: schema version, both reports, summary,
 *   ordered changed specs, and generation metadata.
 */
function compareReports(base, head) {
    const indexBySpecId = (report) => new Map(report.results.map((entry) => [entry.specId, entry]));
    const baseById = indexBySpecId(base);
    const headById = indexBySpecId(head);
    // Union of ids: base ids first, then ids that only exist in head.
    const specIds = new Set([...baseById.keys(), ...headById.keys()]);
    const changedSpecs = [...specIds]
        .map((specId) => analyzeSpecDiff(specId, baseById.get(specId), headById.get(specId)))
        .filter(Boolean);
    changedSpecs.sort((left, right) => {
        const bySeverity = getSeverityOrder(left.classification) - getSeverityOrder(right.classification);
        return bySeverity !== 0 ? bySeverity : left.specId.localeCompare(right.specId);
    });
    const summary = calculateDiffSummary(base, head, changedSpecs);
    const metadata = {
        generatedAt: Date.now(),
        baseSource: "local",
        headSource: "local",
    };
    return {
        schemaVersion: exports.DIFF_SCHEMA_VERSION,
        base,
        head,
        summary,
        changedSpecs,
        metadata,
    };
}
|
|
332
|
+
/**
 * Build the diff entry for one spec, or `null` when nothing meaningful changed.
 *
 * `classifyDiff` returns "execution_error" as its catch-all when no specific
 * change was detected. In that case the spec is dropped when status, score
 * and error text are all equal. Duration is deliberately NOT part of that
 * comparison: wall-clock time differs between any two real runs, and treating
 * it as a change would report every untouched spec as "execution_error",
 * which the summary counts as a regression.
 *
 * @param {string} specId - Spec identifier; also the fallback display name.
 * @param {object|undefined} base - Spec entry from the base report, if any.
 * @param {object|undefined} head - Spec entry from the head report, if any.
 * @returns {object|null} Entry with classification, per-side snapshots and
 *   deltas, or `null` for an unchanged spec.
 */
function analyzeSpecDiff(specId, base, head) {
    const classification = classifyDiff(base, head);
    if (classification === "execution_error" && base && head) {
        const unchanged = base.result.status === head.result.status &&
            (base.result.score || 0) === (head.result.score || 0) &&
            base.result.error === head.result.error;
        if (unchanged) {
            return null;
        }
    }
    // Copy only the fields the diff output exposes for each side.
    const snapshot = (spec) => (spec
        ? {
            status: spec.result.status,
            score: spec.result.score,
            duration: spec.result.duration,
            error: spec.result.error,
        }
        : undefined);
    return {
        specId,
        // Prefer head metadata; fall back to base for removed specs.
        name: head?.name || base?.name || specId,
        filePath: head?.filePath || base?.filePath || "",
        classification,
        base: snapshot(base),
        head: snapshot(head),
        deltas: calculateDeltas(base, head),
    };
}
|
|
376
|
+
/**
 * Classify how a spec changed between the base and head runs.
 *
 * Checks, in order: presence (added / removed), pass <-> fail transitions,
 * transitions into or out of "skipped", then score movement of more than
 * 0.05 in either direction. Scores are compared whenever both are numbers,
 * so a score of exactly 0 takes part in the comparison.
 *
 * @param {object|undefined} base - Spec entry from the base report.
 * @param {object|undefined} head - Spec entry from the head report.
 * @returns {string} One of "added", "removed", "new_failure",
 *   "fixed_failure", "skipped_change", "score_drop", "score_improve", or
 *   "execution_error" (the catch-all when none of the above applies).
 */
function classifyDiff(base, head) {
    if (!base && head) {
        return "added";
    }
    if (base && !head) {
        return "removed";
    }
    if (!base || !head) {
        // Neither side exists.
        return "execution_error";
    }
    const baseStatus = base.result.status;
    const headStatus = head.result.status;
    if (baseStatus === "passed" && headStatus === "failed") {
        return "new_failure";
    }
    if (baseStatus === "failed" && headStatus === "passed") {
        return "fixed_failure";
    }
    // Exactly one side skipped.
    if ((baseStatus === "skipped") !== (headStatus === "skipped")) {
        return "skipped_change";
    }
    const baseScore = base.result.score;
    const headScore = head.result.score;
    if (typeof baseScore === "number" && typeof headScore === "number") {
        const delta = headScore - baseScore;
        if (delta < -0.05) {
            return "score_drop";
        }
        if (delta > 0.05) {
            return "score_improve";
        }
    }
    // No specific change detected.
    return "execution_error";
}
|
|
413
|
+
/**
 * Compute per-spec deltas between the base and head results.
 *
 * @param {object|undefined} base - Spec entry from the base report.
 * @param {object|undefined} head - Spec entry from the head report.
 * @returns {object} Empty when either side is missing. Otherwise:
 *   - `scoreDelta` (head - base, rounded to 4 places) when both scores are
 *     numbers, including a score of 0;
 *   - `durationDelta` (head - base), always set;
 *   - `statusChange` ("<base> → <head>") only when the status differs.
 */
function calculateDeltas(base, head) {
    const deltas = {};
    if (!base || !head) {
        return deltas;
    }
    const baseResult = base.result;
    const headResult = head.result;
    if (typeof baseResult.score === "number" && typeof headResult.score === "number") {
        deltas.scoreDelta = round(headResult.score - baseResult.score, 4);
    }
    deltas.durationDelta = headResult.duration - baseResult.duration;
    if (baseResult.status !== headResult.status) {
        deltas.statusChange = `${baseResult.status} → ${headResult.status}`;
    }
    return deltas;
}
|
|
429
|
+
/**
 * Rank a classification for sorting; lower numbers sort first.
 *
 * @param {string} classification - Diff classification.
 * @returns {number} 1 (new_failure) through 8 (score_improve); 9 for any
 *   classification without a rank.
 */
function getSeverityOrder(classification) {
    const rankByClassification = {
        new_failure: 1,
        score_drop: 2,
        execution_error: 3,
        skipped_change: 4,
        removed: 5,
        added: 6,
        fixed_failure: 7,
        score_improve: 8,
    };
    const rank = rankByClassification[classification];
    return rank ? rank : 9;
}
|
|
445
|
+
/**
 * Compute aggregate statistics for a diff.
 *
 * Average scores only consider results whose score is a number. A missing
 * score can appear as `null` once a report has been round-tripped through
 * JSON; such entries are skipped rather than being averaged in as 0.
 *
 * @param {object} base - Base RunReport (`results`, `summary.passRate` are read).
 * @param {object} head - Head RunReport (`results`, `summary.passRate` are read).
 * @param {Array<object>} changedSpecs - Entries carrying a `classification`.
 * @returns {object} `baseTotal`, `headTotal`, `passRateDelta`, `scoreDelta`
 *   (both head - base, rounded to 4 places) and the counts `regressions`,
 *   `improvements`, `added`, `removed`.
 */
function calculateDiffSummary(base, head, changedSpecs) {
    const averageScore = (results) => {
        const scores = results
            .map((entry) => entry.result.score)
            .filter((score) => typeof score === "number");
        if (scores.length === 0) {
            return 0;
        }
        return scores.reduce((sum, score) => sum + score, 0) / scores.length;
    };
    // Single pass over the changed specs instead of one filter per bucket.
    const counts = { regressions: 0, improvements: 0, added: 0, removed: 0 };
    for (const { classification } of changedSpecs) {
        switch (classification) {
            case "new_failure":
            case "score_drop":
            case "execution_error":
                counts.regressions += 1;
                break;
            case "fixed_failure":
            case "score_improve":
                counts.improvements += 1;
                break;
            case "added":
                counts.added += 1;
                break;
            case "removed":
                counts.removed += 1;
                break;
            default:
                // Other classifications (e.g. skipped_change) are not counted.
                break;
        }
    }
    return {
        baseTotal: base.results.length,
        headTotal: head.results.length,
        passRateDelta: round(head.summary.passRate - base.summary.passRate, 4),
        scoreDelta: round(averageScore(head.results) - averageScore(base.results), 4),
        regressions: counts.regressions,
        improvements: counts.improvements,
        added: counts.added,
        removed: counts.removed,
    };
}
|
|
484
|
+
/**
 * Print a diff result to stdout in human-readable form: a header, the
 * summary metrics, then one line per changed spec (plus the head error, when
 * there is one) or a "no changes" notice.
 *
 * @param {object} result - Diff result as returned by `compareReports`.
 */
function printHumanResults(result) {
    const { summary, metadata } = result;
    const asPercent = (fraction) => `${roundPct(fraction, 1).toFixed(1)}%`;
    const signFor = (amount) => (amount > 0 ? "+" : "");
    const overview = [
        "\n🔄 Behavioral Diff Results",
        `📊 Base: ${metadata.baseSource} (${summary.baseTotal} specs)`,
        `📈 Head: ${metadata.headSource} (${summary.headTotal} specs)`,
        "\n📈 Summary:",
        ` 📊 Pass Rate Delta: ${asPercent(summary.passRateDelta)}`,
        ` 🎯 Score Delta: ${asPercent(summary.scoreDelta)}`,
        ` 📉 Regressions: ${summary.regressions}`,
        ` 📈 Improvements: ${summary.improvements}`,
        ` ➕ Added: ${summary.added}`,
        ` ➖ Removed: ${summary.removed}`,
    ];
    for (const line of overview) {
        console.log(line);
    }
    if (!(result.changedSpecs.length > 0)) {
        console.log("\n✅ No changes detected");
        return;
    }
    console.log("\n🔍 Changed Specifications:");
    for (const spec of result.changedSpecs) {
        const { scoreDelta, durationDelta } = spec.deltas;
        // Zero or missing deltas are omitted from the line.
        const scoreInfo = scoreDelta ? ` (${signFor(scoreDelta)}${asPercent(scoreDelta)})` : "";
        const durationInfo = durationDelta ? ` (${signFor(durationDelta)}${durationDelta}ms)` : "";
        console.log(` ${getClassificationIcon(spec.classification)} ${spec.name}${scoreInfo}${durationInfo}`);
        if (spec.head?.error) {
            console.log(` ❌ ${spec.head.error}`);
        }
    }
}
|
|
518
|
+
/**
 * Map a diff classification to the emoji shown next to it in reports.
 *
 * @param {string} classification - Diff classification.
 * @returns {string} The matching icon, or "❓" when there is none.
 */
function getClassificationIcon(classification) {
    const iconByClassification = {
        new_failure: "🆘",
        fixed_failure: "✅",
        score_drop: "📉",
        score_improve: "📈",
        execution_error: "❌",
        skipped_change: "⏭️",
        added: "➕",
        removed: "➖",
    };
    const icon = iconByClassification[classification];
    return icon ? icon : "❓";
}
|
|
534
|
+
/**
 * Print a diff result to stdout as JSON indented with two spaces.
 *
 * @param {object} result - Diff result to serialize.
 */
function printJsonResults(result) {
    const serialized = JSON.stringify(result, null, 2);
    console.log(serialized);
}
|
|
540
|
+
/**
 * Append the markdown diff summary to the file named by the
 * `GITHUB_STEP_SUMMARY` environment variable.
 *
 * Does nothing when the variable is unset or empty. A failure while building
 * or appending the summary is logged as a warning and never thrown.
 *
 * @param {object} result - Diff result passed to `generateGitHubSummary`.
 * @returns {Promise<void>}
 */
async function writeGitHubStepSummary(result) {
    const summaryPath = process.env.GITHUB_STEP_SUMMARY;
    if (!summaryPath) {
        return; // Not in GitHub Actions
    }
    try {
        const markdown = generateGitHubSummary(result);
        await fs.appendFile(summaryPath, `${markdown}\n`, "utf-8");
    }
    catch (error) {
        console.warn("Warning: Could not write GitHub Step Summary:", error);
    }
}
|
|
556
|
+
/**
 * Render a diff result as markdown for the GitHub Actions step summary.
 *
 * Sections, in order: title and summary metrics, a regression verdict, up to
 * five regressed specs (with a count of the remainder), artifact paths, and a
 * collapsible JSON block with the raw numbers.
 *
 * @param {object} result - Diff result as returned by `compareReports`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function generateGitHubSummary(result) {
    const { summary } = result;
    const asPercent = (fraction) => `${roundPct(fraction, 1).toFixed(1)}%`;
    const regressionKinds = ["new_failure", "execution_error", "score_drop"];
    const maxListed = 5;
    const metrics = [
        "## 🤖 EvalGate Diff Results\n",
        "### 📊 Summary Metrics",
        `- **Pass Rate Delta**: ${asPercent(summary.passRateDelta)}`,
        `- **Score Delta**: ${asPercent(summary.scoreDelta)}`,
        `- **🚨 Regressions**: ${summary.regressions}`,
        `- **📈 Improvements**: ${summary.improvements}`,
        `- **➕ Added**: ${summary.added}`,
        `- **➖ Removed**: ${summary.removed}`,
        "",
    ];
    const verdict = summary.regressions > 0
        ? [
            "### 🚨 Regressions Detected\n",
            "**⚠️ This PR contains regressions that should be reviewed.**\n",
        ]
        : [
            "### ✅ No Regressions Detected\n",
            "**🎉 All tests passed! No regressions found.**\n",
        ];
    const regressed = result.changedSpecs.filter((spec) => regressionKinds.includes(spec.classification));
    const regressionSection = [];
    if (regressed.length > 0) {
        regressionSection.push("### 🔍 Top Regressions\n");
        for (const spec of regressed.slice(0, maxListed)) {
            const delta = spec.deltas.scoreDelta;
            // A zero or missing score delta is left out of the heading line.
            const scoreInfo = delta ? ` (${delta > 0 ? "+" : ""}${asPercent(delta)})` : "";
            regressionSection.push(`${getClassificationIcon(spec.classification)} **${spec.name}**${scoreInfo}`);
            regressionSection.push(` Classification: \`${spec.classification}\``);
            if (spec.head?.error) {
                regressionSection.push(` Error: \`${spec.head.error}\``);
            }
            regressionSection.push("");
        }
        if (regressed.length > maxListed) {
            regressionSection.push(`... and ${regressed.length - maxListed} more regressions\n`);
        }
    }
    const artifacts = [
        "### 📁 Artifacts\n",
        `- **Run Index**: \`.evalgate/runs/index.json\``,
        `- **Latest Run**: \`.evalgate/runs/${result.head.runId}.json\``,
        `- **Last Run**: \`.evalgate/last-run.json\``,
        "",
    ];
    const technicalDetails = JSON.stringify({
        baseRunId: result.base.runId,
        headRunId: result.head.runId,
        baseTotal: summary.baseTotal,
        headTotal: summary.headTotal,
        passRateDelta: summary.passRateDelta,
        scoreDelta: summary.scoreDelta,
        regressions: summary.regressions,
        improvements: summary.improvements,
        added: summary.added,
        removed: summary.removed,
    }, null, 2);
    const footer = [
        "<details>",
        "<summary>🔧 Technical Details</summary>",
        "",
        "```json",
        technicalDetails,
        "```",
        "",
        "</details>",
    ];
    return [...metrics, ...verdict, ...regressionSection, ...artifacts, ...footer].join("\n");
}
|
|
630
|
+
/**
 * CLI entry point for the diff command: run the diff, print it, write the
 * GitHub step summary, and terminate the process.
 *
 * Exit codes:
 *   0 - no regressions
 *   1 - at least one regression
 *   2 - any error (missing report, unsupported schema, configuration or
 *       infrastructure problem)
 *
 * @param {object} options - CLI options; `format === "json"` selects JSON
 *   output, anything else the human-readable output. Also passed to `runDiff`.
 * @returns {Promise<void>}
 */
async function runDiffCLI(options) {
    try {
        const result = await runDiff(options);
        if (options.format === "json") {
            printJsonResults(result);
        }
        else {
            printHumanResults(result);
        }
        // No-op outside GitHub Actions.
        await writeGitHubStepSummary(result);
        process.exit(result.summary.regressions > 0 ? 1 : 0);
    }
    catch (error) {
        console.error(`EvalGate ERROR: ${error instanceof Error ? error.message : String(error)}`);
        // Every failure maps to the same exit code, so the error is not
        // inspected any further.
        process.exit(2);
    }
}
|
|
667
|
+
// Public diff core API surface
|
|
668
|
+
exports.diffCore = {
|
|
669
|
+
/**
|
|
670
|
+
* Compare two run reports and return diff result
|
|
671
|
+
*/
|
|
672
|
+
diffRunReports: compareReports,
|
|
673
|
+
/**
|
|
674
|
+
* Classify the type of change between two specs
|
|
675
|
+
*/
|
|
676
|
+
classifyChange: classifyDiff,
|
|
677
|
+
/**
|
|
678
|
+
* Calculate summary statistics for a diff
|
|
679
|
+
*/
|
|
680
|
+
summarizeDiff: calculateDiffSummary,
|
|
681
|
+
/**
|
|
682
|
+
* Calculate deltas between two spec results
|
|
683
|
+
*/
|
|
684
|
+
calculateDeltas,
|
|
685
|
+
};
|