@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate gate — Run the regression gate
|
|
4
|
+
*
|
|
5
|
+
* Two modes:
|
|
6
|
+
* 1. Project mode: delegates to eval:regression-gate npm script (full gate)
|
|
7
|
+
* 2. Built-in mode: runs the `test` script with the detected package manager (pnpm, yarn or npm), compares against evals/baseline.json
|
|
8
|
+
*
|
|
9
|
+
* Built-in mode activates when no eval:regression-gate script is defined,
|
|
10
|
+
* making `npx evalgate gate` work for any project after `npx evalgate init`.
|
|
11
|
+
*/
|
|
12
|
+
// TypeScript-emitted CommonJS interop helper; an existing `this.__createBinding` is reused when present.
// Exposes property `k` of module `m` on object `o` under the name `k2` (`k2` defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Use a forwarding getter when the property has no descriptor, when it is an accessor on a
    // module without `__esModule`, or when it is a writable/configurable data property.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    // Fallback when `Object.create` is unavailable: copy the current value.
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
|
|
23
|
+
// TypeScript-emitted CommonJS interop helper; an existing `this.__setModuleDefault` is reused when present.
// Stores `v` as the `default` property of `o` (enumerable when defined via `Object.defineProperty`).
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    // Fallback when `Object.create` is unavailable: plain assignment.
    o["default"] = v;
});
|
|
28
|
+
// TypeScript-emitted CommonJS interop helper; an existing `this.__importStar` is reused when present.
// Wraps a required module in a namespace-like object: modules flagged `__esModule` are returned
// as-is, otherwise every own key except "default" is bound onto a fresh object and the module
// itself is stored as that object's `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: the first call picks the key-enumeration strategy and
    // overwrites `ownKeys` with it, so later calls go straight to the chosen strategy.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        // "default" is skipped here because it is set explicitly just below.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
45
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
+
exports.parseGateArgs = parseGateArgs;
|
|
47
|
+
exports.runGate = runGate;
|
|
48
|
+
const node_child_process_1 = require("node:child_process");
|
|
49
|
+
const fs = __importStar(require("node:fs"));
|
|
50
|
+
const path = __importStar(require("node:path"));
|
|
51
|
+
const REPORT_REL = "evals/regression-report.json";
|
|
52
|
+
const BASELINE_REL = "evals/baseline.json";
|
|
53
|
+
/**
 * Infer which package manager a project uses from the lockfile found in `cwd`.
 *
 * Lockfiles are probed in priority order: `pnpm-lock.yaml` wins over `yarn.lock`.
 *
 * @param cwd Directory to inspect.
 * @returns "pnpm", "yarn", or "npm" when neither lockfile exists.
 */
function detectPackageManager(cwd) {
    const lockfileToManager = [
        ["pnpm-lock.yaml", "pnpm"],
        ["yarn.lock", "yarn"],
    ];
    for (const [lockfile, manager] of lockfileToManager) {
        const candidate = path.join(cwd, lockfile);
        if (fs.existsSync(candidate)) {
            return manager;
        }
    }
    return "npm";
}
|
|
61
|
+
/**
 * Parse the arguments of `evalgate gate`.
 *
 * Recognised flags:
 *   --format <value>   (value is consumed even when it is not a known format)
 *   --format=<value>
 * where <value> is one of "human", "json" or "github". Unknown values and
 * unrelated arguments are ignored; the last valid occurrence wins.
 *
 * @param argv Raw argument list (without the command name).
 * @returns `{ format }`, defaulting to "human".
 */
function parseGateArgs(argv) {
    const args = { format: "human" };
    const EQ_PREFIX = "--format=";
    const isKnownFormat = (value) => value === "json" || value === "github" || value === "human";
    for (let i = 0; i < argv.length; i++) {
        const token = argv[i];
        if (token === "--format" && argv[i + 1]) {
            const fmt = argv[i + 1];
            if (isKnownFormat(fmt)) {
                args.format = fmt;
            }
            // Skip the value so it is not re-examined as a flag.
            i++;
        }
        else if (typeof token === "string" && token.startsWith(EQ_PREFIX)) {
            // `--format=json` form, previously ignored silently.
            const fmt = token.slice(EQ_PREFIX.length);
            if (isKnownFormat(fmt)) {
                args.format = fmt;
            }
        }
    }
    return args;
}
|
|
74
|
+
/**
 * Guess the test runner from the `test` script in `<cwd>/package.json`.
 *
 * Runner names are matched as whole words, so unrelated commands that merely
 * contain a runner name as a substring (e.g. `java` containing `ava`, or
 * `tape` containing `tap`) are not misreported.
 *
 * @param cwd Project directory containing package.json.
 * @returns "vitest", "jest", "mocha", "node:test", "ava", "tap", or "unknown"
 *          when package.json is missing/unreadable or no runner is recognised.
 */
function detectRunner(cwd) {
    const pkgPath = path.join(cwd, "package.json");
    let testCmd = "";
    try {
        const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
        const script = pkg.scripts?.test;
        if (typeof script === "string") {
            testCmd = script;
        }
    }
    catch {
        // Missing or malformed package.json: the runner simply cannot be determined.
        return "unknown";
    }
    // Checked in order; the first match wins (same priority as before).
    const runnerPatterns = [
        ["vitest", /\bvitest\b/],
        ["jest", /\bjest\b/],
        ["mocha", /\bmocha\b/],
        ["node:test", /\bnode\s+--test\b/],
        ["ava", /\bava\b/],
        ["tap", /\btap\b/],
    ];
    for (const [runner, pattern] of runnerPatterns) {
        if (pattern.test(testCmd)) {
            return runner;
        }
    }
    return "unknown";
}
|
|
97
|
+
/**
 * Built-in gate: run the project's `test` script with the detected package
 * manager and compare the outcome with evals/baseline.json.
 *
 * Report categories:
 *   - "infra_error" (exitCode 2): baseline missing/unparseable/not an object,
 *     or the test command could not be run to completion (spawn failure,
 *     timeout, output buffer overflow).
 *   - "regression" (exitCode 1): tests passed in the baseline but fail now, or
 *     the detected passing-test count dropped below the baseline total.
 *   - "pass" (exitCode 0): otherwise.
 *
 * @param cwd Project root.
 * @returns The gate report object (schemaVersion 1).
 */
function runBuiltinGate(cwd) {
    const t0 = Date.now();
    const baselinePath = path.join(cwd, BASELINE_REL);
    const now = new Date().toISOString();
    const pm = detectPackageManager(cwd);
    const command = `${pm} test`;
    const runner = detectRunner(cwd);
    // Every infrastructure failure shares one report shape; only the message
    // (and, once loaded, the baseline metadata) differs.
    const infraError = (message, baseline = null) => ({
        schemaVersion: 1,
        timestamp: now,
        exitCode: 2,
        category: "infra_error",
        passed: false,
        failures: [message],
        deltas: [],
        baseline,
        durationMs: Date.now() - t0,
        command,
        runner,
    });
    // Load baseline
    if (!fs.existsSync(baselinePath)) {
        return infraError("Baseline file not found. Run: npx evalgate init");
    }
    let baselineData;
    try {
        baselineData = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
    }
    catch {
        return infraError("Failed to parse evals/baseline.json");
    }
    // Valid JSON that is not an object (e.g. `null`) would otherwise crash on
    // the property reads below.
    if (baselineData === null || typeof baselineData !== "object") {
        return infraError("evals/baseline.json must contain a JSON object");
    }
    const baselineMeta = baselineData.updatedAt
        ? {
            updatedAt: baselineData.updatedAt,
            updatedBy: baselineData.updatedBy ?? "unknown",
        }
        : null;
    // Run tests
    const isWin = process.platform === "win32";
    const result = (0, node_child_process_1.spawnSync)(pm, ["test"], {
        cwd,
        stdio: "pipe",
        shell: isWin,
        timeout: 300000,
        // The 1 MiB default terminates suites with verbose output.
        maxBuffer: 64 * 1024 * 1024,
    });
    // `error` is set when the process could not be spawned, timed out, or
    // overflowed the buffer. None of these is a test regression.
    if (result.error) {
        return infraError(`Failed to run "${command}": ${result.error.message}`, baselineMeta);
    }
    const testsPassed = result.status === 0;
    const output = (result.stdout?.toString() ?? "") + (result.stderr?.toString() ?? "");
    // Try to extract test count (first matching reporter format wins)
    let testCount = 0;
    const countMatch = output.match(/(\d+)\s+(?:tests?|specs?)\s+(?:passed|completed)/i) ??
        output.match(/Tests:\s+(\d+)\s+passed/i) ??
        output.match(/(\d+)\s+passing/i) ??
        output.match(/Test Files\s+\d+\s+passed.*\n\s+Tests\s+(\d+)\s+passed/i);
    if (countMatch)
        testCount = parseInt(countMatch[1], 10);
    // Compare against baseline
    const baselinePassed = baselineData.confidenceTests?.passed ?? true;
    const baselineTotal = baselineData.confidenceTests?.total ?? 0;
    const failures = [];
    const deltas = [];
    // Delta: tests passing
    deltas.push({
        metric: "tests_passing",
        baseline: baselinePassed,
        current: testsPassed,
        delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
        status: testsPassed ? "pass" : "fail",
    });
    if (!testsPassed && baselinePassed) {
        failures.push("Tests were passing in baseline but are now failing");
    }
    // Delta: test count.
    // NOTE(review): this also runs when no count could be parsed from the
    // output but the baseline has one, which reports a drop to 0 — confirm
    // that is intended.
    if (testCount > 0 || baselineTotal > 0) {
        const countDelta = testCount - baselineTotal;
        deltas.push({
            metric: "test_count",
            baseline: baselineTotal,
            current: testCount,
            delta: countDelta >= 0 ? `+${countDelta}` : `${countDelta}`,
            status: testCount >= baselineTotal ? "pass" : "fail",
        });
        if (testCount < baselineTotal) {
            failures.push(`Test count dropped from ${baselineTotal} to ${testCount} (${countDelta})`);
        }
    }
    const hasRegression = failures.length > 0;
    return {
        schemaVersion: 1,
        timestamp: now,
        exitCode: hasRegression ? 1 : 0,
        category: hasRegression ? "regression" : "pass",
        passed: !hasRegression,
        failures,
        deltas,
        baseline: baselineMeta,
        durationMs: Date.now() - t0,
        command,
        runner,
    };
}
|
|
208
|
+
// ── Format helpers ──
|
|
209
|
+
/**
 * Print a gate report to stdout as a plain-text summary: a headline with the
 * upper-cased category, a fixed-width table of deltas (when there are any),
 * a bulleted list of failures (when there are any), and a trailing blank line.
 *
 * @param report Gate report with `passed`, `category`, `deltas` and `failures`.
 */
function formatHuman(report) {
    const headlineIcon = report.passed ? "✅" : "❌";
    console.log(`\n${headlineIcon} EvalGate Gate: ${report.category.toUpperCase()}\n`);
    // One table line: four padded columns followed by an unpadded trailing cell.
    const row = (metric, baseline, current, delta, tail) => ` ${metric.padEnd(16)} ${baseline.padEnd(10)} ${current.padEnd(10)} ${delta.padEnd(8)} ${tail}`;
    if (report.deltas.length > 0) {
        console.log(row("Metric", "Baseline", "Current", "Delta", "Status"));
        console.log(row("-".repeat(16), "-".repeat(10), "-".repeat(10), "-".repeat(8), "------"));
        for (const entry of report.deltas) {
            const mark = entry.status === "pass" ? "✔" : "✖";
            console.log(row(entry.metric, String(entry.baseline), String(entry.current), entry.delta, mark));
        }
    }
    if (report.failures.length > 0) {
        console.log("\n Failures:");
        for (const failure of report.failures) {
            console.log(` • ${failure}`);
        }
    }
    console.log("");
}
|
|
229
|
+
/**
 * Render a gate report as GitHub-flavoured Markdown (heading, delta table,
 * optional failure list, schema version footer).
 *
 * The Markdown is appended to the file named by `$GITHUB_STEP_SUMMARY` when
 * that variable is set (write errors are ignored), and is always printed to
 * stdout.
 *
 * @param report Gate report with `passed`, `category`, `deltas`, `failures`
 *               and `schemaVersion`.
 */
function formatGithub(report) {
    const emoji = (ok) => (ok ? "✅" : "❌");
    const out = [];
    out.push(`## ${emoji(report.passed)} EvalGate Gate: ${report.category}`);
    out.push("");
    out.push("| Metric | Baseline | Current | Delta | Status |");
    out.push("|--------|----------|---------|-------|--------|");
    for (const entry of report.deltas) {
        const cell = emoji(entry.status === "pass");
        out.push(`| ${entry.metric} | ${entry.baseline} | ${entry.current} | ${entry.delta} | ${cell} |`);
    }
    if (report.failures.length > 0) {
        out.push("");
        out.push("### Failures");
        out.push("");
        for (const failure of report.failures) {
            out.push(`- ${failure}`);
        }
    }
    out.push("");
    out.push(`Schema version: ${report.schemaVersion}`);
    const markdown = out.join("\n");
    // Mirror the summary into the job summary file when running under GitHub Actions.
    const stepSummaryFile = process.env.GITHUB_STEP_SUMMARY;
    if (stepSummaryFile) {
        try {
            fs.appendFileSync(stepSummaryFile, `${markdown}\n`);
        }
        catch {
            // best effort: an unwritable summary file must not fail the gate
        }
    }
    console.log(markdown);
}
|
|
261
|
+
/**
 * Emit a gate report in the format selected by `args.format`:
 * "json" writes pretty-printed JSON to stdout (no trailing newline),
 * "github" renders Markdown, anything else falls back to the human summary.
 *
 * @param report Gate report to print.
 * @param args   Parsed gate arguments (`format`).
 */
function formatReport(report, args) {
    switch (args.format) {
        case "json":
            process.stdout.write(JSON.stringify(report, null, 2));
            return;
        case "github":
            formatGithub(report);
            return;
        default:
            formatHuman(report);
    }
}
|
|
272
|
+
// ── Main ──
|
|
273
|
+
/**
 * Entry point of `evalgate gate`.
 *
 * Project mode: when package.json defines an `eval:regression-gate` script it
 * is run through the detected package manager and its exit status is returned
 * (1 when no status is available). For `--format json` the script output is
 * captured rather than inherited and evals/regression-report.json is echoed
 * to stdout; for `--format github` that report is re-rendered as Markdown.
 *
 * Built-in mode: otherwise the built-in gate runs, its report is written to
 * evals/regression-report.json, printed in the requested format, and the
 * report's exit code is returned.
 *
 * @param argv Arguments following the `gate` command.
 * @returns Process exit code (1 when package.json is missing or unparseable).
 */
function runGate(argv) {
    const cwd = process.cwd();
    const args = parseGateArgs(argv);
    // Check for package.json
    const manifestPath = path.join(cwd, "package.json");
    if (!fs.existsSync(manifestPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        const manifestText = fs.readFileSync(manifestPath, "utf-8");
        pkg = JSON.parse(manifestText);
    }
    catch {
        console.error("❌ Failed to parse package.json");
        return 1;
    }
    const reportFile = path.join(cwd, REPORT_REL);
    const hasProjectGate = Boolean(pkg.scripts?.["eval:regression-gate"]);
    // ── Built-in mode: run tests + compare against baseline ──
    if (!hasProjectGate) {
        if (args.format === "human") {
            console.log("\n Running EvalGate regression gate (built-in mode)...\n");
        }
        const builtinReport = runBuiltinGate(cwd);
        // Write report artifact
        const evalsDir = path.join(cwd, "evals");
        if (!fs.existsSync(evalsDir)) {
            fs.mkdirSync(evalsDir, { recursive: true });
        }
        fs.writeFileSync(reportFile, `${JSON.stringify(builtinReport, null, 2)}\n`);
        formatReport(builtinReport, args);
        return builtinReport.exitCode;
    }
    // ── Project mode: delegate to eval:regression-gate ──
    const wantsJson = args.format === "json";
    const child = (0, node_child_process_1.spawnSync)(detectPackageManager(cwd), ["run", "eval:regression-gate"], {
        cwd,
        // Keep the script's own output off stdout when the caller expects pure JSON.
        stdio: wantsJson ? "pipe" : "inherit",
        shell: process.platform === "win32",
    });
    const exitCode = child.status ?? 1;
    // Post-process report for json/github formats
    if (wantsJson) {
        if (fs.existsSync(reportFile)) {
            process.stdout.write(fs.readFileSync(reportFile, "utf-8"));
        }
        else {
            console.error(JSON.stringify({
                error: "regression-report.json not found",
                exitCode,
            }));
        }
    }
    else if (args.format === "github" && fs.existsSync(reportFile)) {
        try {
            formatGithub(JSON.parse(fs.readFileSync(reportFile, "utf-8")));
        }
        catch {
            // human output already printed
        }
    }
    return exitCode;
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Truncate a string for deterministic output.
|
|
4
|
+
* Collapses every run of whitespace (including newlines) into a single space, trims, and caps length.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.truncateSnippet = truncateSnippet;
|
|
8
|
+
/**
 * Produce a single-line, length-capped snippet of `s`.
 *
 * Whitespace runs (including newlines) collapse to one space and the result is
 * trimmed. Text longer than `maxLen` UTF-16 code units is cut and suffixed with
 * "…". The cut never lands inside a surrogate pair, so the snippet may be one
 * unit shorter than `maxLen` rather than end in a lone surrogate.
 *
 * @param s      Text to shorten; `null`/`undefined` yield "".
 * @param maxLen Maximum number of code units kept before the ellipsis (default 140).
 * @returns The normalized, possibly truncated text.
 */
function truncateSnippet(s, maxLen = 140) {
    if (s == null)
        return "";
    const normalized = s.replace(/\s+/g, " ").trim();
    if (normalized.length <= maxLen)
        return normalized;
    let cut = maxLen;
    // If the last kept unit is a high surrogate, its low half would be cut off:
    // drop the dangling half instead of emitting invalid text.
    const lastKept = normalized.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        cut -= 1;
    }
    return `${normalized.slice(0, cut)}…`;
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic ordering for failed cases.
|
|
3
|
+
* Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Minimal shape a case needs in order to be ordered by `sortFailedCases`.
 * Both sort keys are optional; any other properties are allowed.
 */
export interface SortableCase {
    /** Status label; primary sort key (failed > error > skipped > passed). */
    status?: string;
    /** Test case id; ascending tie-breaker within the same status. */
    testCaseId?: number;
    /** Any additional properties carried along with the case. */
    [key: string]: unknown;
}
|
|
10
|
+
/**
 * Order cases by status severity, then by `testCaseId` ascending.
 *
 * @param cases Cases to order.
 * @returns The ordered cases, with the element type preserved.
 */
export declare function sortFailedCases<T extends SortableCase>(cases: T[]): T[];
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Deterministic ordering for failed cases.
|
|
4
|
+
* Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.sortFailedCases = sortFailedCases;
|
|
8
|
+
/** Sort rank per known status; lower ranks sort first. */
const STATUS_SEVERITY = {
    failed: 0,
    error: 1,
    skipped: 2,
    passed: 3,
};
/**
 * Return a sorted copy of `cases`: by status severity
 * (failed > error > skipped > passed > anything else), then by `testCaseId`
 * ascending. The input array is not modified.
 *
 * Status matching is case-insensitive. Missing, non-string or unrecognised
 * statuses rank last; a missing `testCaseId` is treated as 0.
 *
 * @param cases Cases to order.
 * @returns A new, ordered array.
 */
function sortFailedCases(cases) {
    const UNKNOWN_RANK = 4;
    const rankOf = (status) => {
        if (typeof status !== "string")
            return UNKNOWN_RANK;
        const key = status.toLowerCase();
        // Own-property check: a status such as "constructor" must not resolve to
        // an inherited Object.prototype member (which made the comparator NaN).
        return Object.prototype.hasOwnProperty.call(STATUS_SEVERITY, key)
            ? STATUS_SEVERITY[key]
            : UNKNOWN_RANK;
    };
    return [...cases].sort((a, b) => {
        const bySeverity = rankOf(a.status) - rankOf(b.status);
        if (bySeverity !== 0)
            return bySeverity;
        return (a.testCaseId ?? 0) - (b.testCaseId ?? 0);
    });
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build CheckReport from API data and gate result.
|
|
3
|
+
* Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
|
|
4
|
+
*/
|
|
5
|
+
import type { QualityLatestData, RunDetailsData } from "../api";
|
|
6
|
+
import type { CheckArgs } from "../check";
|
|
7
|
+
import { type CheckReport } from "../formatters/types";
|
|
8
|
+
import type { GateResult } from "../gate";
|
|
9
|
+
/** Everything `buildCheckReport` needs to assemble a `CheckReport`. */
export type BuildReportInput = {
    /** Parsed `check` arguments. */
    args: CheckArgs;
    /** Latest quality data returned by the API. */
    quality: QualityLatestData;
    /** Per-result run details; optional, may be null. */
    runDetails?: RunDetailsData | null;
    /** Outcome of the gate evaluation. */
    gateResult: GateResult;
    /** Optional request identifier. */
    requestId?: string;
    /** Optional share URL. */
    shareUrl?: string;
    /** Optional baseline run id; may be null. */
    baselineRunId?: number | null;
    /** Optional URL of the CI run; may be null. */
    ciRunUrl?: string | null;
};
|
|
19
|
+
/**
 * Build a `CheckReport` from API data and the gate result.
 *
 * @param input Arguments, API data and gate outcome (see `BuildReportInput`).
 * @returns The assembled report.
 */
export declare function buildCheckReport(input: BuildReportInput): CheckReport;
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Build CheckReport from API data and gate result.
|
|
4
|
+
* Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.buildCheckReport = buildCheckReport;
|
|
8
|
+
const types_1 = require("../formatters/types");
|
|
9
|
+
const snippet_1 = require("../render/snippet");
|
|
10
|
+
const sort_1 = require("../render/sort");
|
|
11
|
+
const TOP_N = 3;
|
|
12
|
+
/**
 * Convert a score breakdown into weighted contribution points, each rounded
 * to one decimal place:
 *   passRatePts    = passRate * 50
 *   safetyPts      = safety * 25
 *   compliancePts  = (0.6 * judge + 0.4 * schema) * 15
 *   performancePts = (0.6 * latency + 0.4 * cost) * 10
 * Missing (null/undefined) components count as 0.
 *
 * @param b Breakdown with optional `passRate`, `safety`, `judge`, `schema`,
 *          `latency` and `cost`.
 * @returns `{ passRatePts, safetyPts, compliancePts, performancePts }`.
 */
function computeContribPts(b) {
    const roundToTenth = (points) => Math.round(points * 10) / 10;
    const passRate = b.passRate ?? 0;
    const safety = b.safety ?? 0;
    const judge = b.judge ?? 0;
    const schema = b.schema ?? 0;
    const latency = b.latency ?? 0;
    const cost = b.cost ?? 0;
    const compliance = 0.6 * judge + 0.4 * schema;
    const performance = 0.6 * latency + 0.4 * cost;
    return {
        passRatePts: roundToTenth(passRate * 50),
        safetyPts: roundToTenth(safety * 25),
        compliancePts: roundToTenth(compliance * 15),
        performancePts: roundToTenth(performance * 10),
    };
}
|
|
27
|
+
const SNIPPET_MAX = 50;
|
|
28
|
+
/**
 * Assemble the CheckReport for a `check` run.
 *
 * - Scalar fields are read from `quality`, with a score of 0 and otherwise
 *   `undefined` when absent.
 * - The dashboard URL is only built when the quality data carries a run id.
 * - Failed cases come from `runDetails.results` entries whose status is
 *   exactly "failed"; they are sorted and given truncated input / expected /
 *   output snippets. `failedCasesShown` is capped at TOP_N and
 *   `failedCasesMore` counts the remainder.
 * - Contribution points and policy evidence are only included with `args.explain`.
 *
 * @param input Arguments, API data and gate outcome.
 * @returns The report object.
 */
function buildCheckReport(input) {
    const { args, quality, runDetails, gateResult, requestId } = input;
    const score = quality?.score ?? 0;
    const total = quality?.total ?? null;
    const baselineScore = quality?.baselineScore ?? null;
    const regressionDelta = quality?.regressionDelta ?? null;
    const evaluationRunId = quality?.evaluationRunId;
    const breakdown = quality?.breakdown ?? {};
    const flags = quality?.flags ?? [];
    const hasRunId = evaluationRunId != null;
    // Strip a single trailing slash so the joined URL has no "//".
    const baseUrl = args.baseUrl.replace(/\/$/, "");
    let dashboardUrl;
    if (hasRunId) {
        dashboardUrl = `${baseUrl}/evaluations/${args.evaluationId}/runs/${evaluationRunId}`;
    }
    // Build failed cases from run details
    const toFailedCase = (r) => ({
        testCaseId: r.testCaseId,
        status: "failed",
        name: r.test_cases?.name,
        input: r.test_cases?.input,
        expectedOutput: r.test_cases?.expectedOutput,
        output: r.output,
    });
    const withSnippets = (fc) => ({
        ...fc,
        inputSnippet: (0, snippet_1.truncateSnippet)(fc.input, SNIPPET_MAX),
        expectedSnippet: (0, snippet_1.truncateSnippet)(fc.expectedOutput, SNIPPET_MAX),
        outputSnippet: (0, snippet_1.truncateSnippet)(fc.output, SNIPPET_MAX),
    });
    let failedCases = [];
    if (runDetails?.results && hasRunId) {
        const onlyFailed = runDetails.results
            .filter((r) => r.status === "failed")
            .map((r) => toFailedCase(r));
        failedCases = (0, sort_1.sortFailedCases)(onlyFailed).map((fc) => withSnippets(fc));
    }
    const failedCasesShown = Math.min(failedCases.length, TOP_N);
    const failedCasesMore = failedCases.length - failedCasesShown;
    // An empty breakdown object is reported as absent.
    const breakdown01 = Object.keys(breakdown).length > 0 ? breakdown : undefined;
    const contribPts = args.explain && breakdown01 ? computeContribPts(breakdown01) : undefined;
    const gateSkipped = gateResult.gateSkipped === true;
    const gateApplied = !gateSkipped;
    const gateMode = gateSkipped ? "neutral" : "enforced";
    let actionableMessage;
    if (gateSkipped) {
        actionableMessage = "Gate not applied: baseline missing. Publish a baseline from the dashboard, or run with --baseline previous once you have runs.";
    }
    else {
        actionableMessage = gateResult.reasonMessage ?? undefined;
    }
    // A warn-level regression takes precedence over the pass/fail flag.
    let verdict;
    if (gateResult.reasonCode === "WARN_REGRESSION") {
        verdict = "warn";
    }
    else {
        verdict = gateResult.passed ? "pass" : "fail";
    }
    const baselineMissing = quality?.baselineMissing === true;
    let baselineStatus;
    if (baselineMissing) {
        baselineStatus = "missing";
    }
    else if (baselineScore !== null) {
        baselineStatus = "found";
    }
    let policyEvidence;
    if (args.explain && gateResult.policyEvidence) {
        policyEvidence = {
            failedCheck: gateResult.policyEvidence.failedCheck,
            remediation: gateResult.policyEvidence.remediation,
            snapshot: gateResult.policyEvidence.snapshot,
        };
    }
    // Property order below is kept as-is so serialized reports stay stable.
    return {
        schemaVersion: types_1.CHECK_REPORT_SCHEMA_VERSION,
        evaluationId: args.evaluationId,
        runId: evaluationRunId,
        verdict,
        gateApplied,
        gateMode,
        actionableMessage,
        shareUrl: input.shareUrl,
        policy: args.policy,
        baselineRunId: input.baselineRunId ?? quality?.baselineRunId ?? undefined,
        ciRunUrl: input.ciRunUrl ?? undefined,
        reasonCode: gateResult.reasonCode,
        reasonMessage: gateResult.reasonMessage ?? undefined,
        score,
        baselineScore: baselineScore ?? undefined,
        delta: regressionDelta ?? undefined,
        n: total ?? undefined,
        evidenceLevel: quality?.evidenceLevel ?? undefined,
        baselineMissing,
        baselineStatus,
        flags: flags.length > 0 ? [...flags].sort() : undefined,
        breakdown01,
        contribPts,
        thresholds: {
            minScore: args.minScore,
            maxDrop: args.maxDrop,
            warnDrop: args.warnDrop,
            minN: args.minN,
            allowWeakEvidence: args.allowWeakEvidence,
            baseline: args.baseline,
            maxCostUsd: args.maxCostUsd,
            maxLatencyMs: args.maxLatencyMs,
            maxCostDeltaUsd: args.maxCostDeltaUsd,
        },
        dashboardUrl,
        failedCases,
        failedCasesShown: failedCases.length > 0 ? failedCasesShown : undefined,
        failedCasesMore: failedCasesMore > 0 ? failedCasesMore : undefined,
        requestId,
        explain: args.explain,
        policyEvidence,
    };
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TICKET 4 — Unified evalgate run CLI Command
|
|
3
|
+
*
|
|
4
|
+
* Goal: Consolidated execution interface that consumes manifest
|
|
5
|
+
*
|
|
6
|
+
* Features:
|
|
7
|
+
* - Manifest loading and spec filtering
|
|
8
|
+
* - --impacted-only integration with impact analysis
|
|
9
|
+
* - Local executor integration
|
|
10
|
+
* - .evalgate/last-run.json output
|
|
11
|
+
* - Legacy mode compatibility
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Run execution options accepted by `runEvaluations` and `runEvaluationsCLI`.
 * Every field is optional.
 */
export interface RunOptions {
    /** Filter to specific spec IDs */
    specIds?: string[];
    /** Run only impacted specs (requires base branch) */
    impactedOnly?: boolean;
    /** Base branch for impact analysis */
    baseBranch?: string;
    /** Output format: "human" or "json" */
    format?: "human" | "json";
    /** Write run results to file */
    writeResults?: boolean;
}
|
|
28
|
+
/**
 * Run execution result: metadata about the run, one entry per spec, and
 * aggregate counts.
 */
export interface RunResult {
    /** Schema version for compatibility checking */
    schemaVersion: number;
    /** Unique run identifier */
    runId: string;
    /** Execution metadata */
    metadata: {
        /** Start time. NOTE(review): presumably an epoch timestamp in ms — confirm. */
        startedAt: number;
        /** Completion time. NOTE(review): presumably an epoch timestamp in ms — confirm. */
        completedAt: number;
        /** Run duration. NOTE(review): unit not stated here, presumably ms — confirm. */
        duration: number;
        /** Total number of specs. */
        totalSpecs: number;
        /** Number of specs that were executed. */
        executedSpecs: number;
        /** Execution mode: "spec" or "legacy". */
        mode: "spec" | "legacy";
    };
    /** Individual spec results */
    results: SpecResult[];
    /** Summary statistics */
    summary: {
        /** Count of passed specs. */
        passed: number;
        /** Count of failed specs. */
        failed: number;
        /** Count of skipped specs. */
        skipped: number;
        /** Pass rate. NOTE(review): scale (0–1 vs. percent) is not visible here — confirm. */
        passRate: number;
    };
}
|
|
55
|
+
/**
 * Individual spec result: identifies the spec and records how its execution ended.
 */
export interface SpecResult {
    /** Spec identifier */
    specId: string;
    /** Spec name */
    name: string;
    /** File path */
    filePath: string;
    /** Execution result */
    result: {
        /** Final status of the spec. */
        status: "passed" | "failed" | "skipped";
        /** Optional score. NOTE(review): scale is not visible here — confirm. */
        score?: number;
        /** Optional error text. */
        error?: string;
        /** Spec duration. NOTE(review): unit not stated here, presumably ms — confirm. */
        duration: number;
    };
}
|
|
73
|
+
/**
 * Run evaluation specifications
 *
 * @param options     Run options (spec filter, impacted-only mode, format, result writing).
 * @param projectRoot Optional project root directory.
 * @returns A promise resolving to the run result.
 */
export declare function runEvaluations(options: RunOptions, projectRoot?: string): Promise<RunResult>;
|
|
77
|
+
/**
 * Run index entry: a compact summary record of one run.
 */
export interface RunIndexEntry {
    /** Identifier of the run this entry describes. */
    runId: string;
    /** Creation time. NOTE(review): presumably an epoch timestamp in ms — confirm. */
    createdAt: number;
    /** Optional git commit SHA. */
    gitSha?: string;
    /** Optional git branch name. */
    branch?: string;
    /** Execution mode: "spec" or "legacy". */
    mode: "spec" | "legacy";
    /** Number of specs in the run. */
    specCount: number;
    /** Pass rate. NOTE(review): scale (0–1 vs. percent) is not visible here — confirm. */
    passRate: number;
    /** Average score. NOTE(review): scale is not visible here — confirm. */
    avgScore: number;
}
|
|
90
|
+
/**
 * Print human-readable results
 *
 * @param result Run result to print.
 */
export declare function printHumanResults(result: RunResult): void;
|
|
94
|
+
/**
 * Print JSON results
 *
 * @param result Run result to print.
 */
export declare function printJsonResults(result: RunResult): void;
|
|
98
|
+
/**
 * CLI entry point
 *
 * @param options Run options parsed from the command line.
 * @returns A promise that resolves with no value.
 */
export declare function runEvaluationsCLI(options: RunOptions): Promise<void>;
|