@pauly4010/evalai-sdk 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +172 -251
- package/dist/cli/baseline.js +1 -1
- package/dist/cli/index.js +6 -0
- package/dist/cli/init.d.ts +11 -2
- package/dist/cli/init.js +227 -16
- package/dist/cli/regression-gate.d.ts +6 -2
- package/dist/cli/regression-gate.js +246 -61
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +491 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +7 -7
- package/package.json +1 -1
package/dist/cli/init.js
CHANGED
|
@@ -1,9 +1,18 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
"use strict";
|
|
3
3
|
/**
|
|
4
|
-
* evalai init —
|
|
4
|
+
* evalai init — Full project scaffolder
|
|
5
5
|
*
|
|
6
|
-
*
|
|
6
|
+
* Zero-to-gate in under 5 minutes:
|
|
7
|
+
* npx evalai init
|
|
8
|
+
* git push
|
|
9
|
+
* …CI starts blocking regressions.
|
|
10
|
+
*
|
|
11
|
+
* What it does:
|
|
12
|
+
* 1. Detects Node repo + package manager
|
|
13
|
+
* 2. Creates evals/ directory + baseline.json
|
|
14
|
+
* 3. Installs .github/workflows/evalai-gate.yml
|
|
15
|
+
* 4. Prints next steps (no docs required)
|
|
7
16
|
*/
|
|
8
17
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
18
|
if (k2 === undefined) k2 = k;
|
|
@@ -40,30 +49,232 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
40
49
|
})();
|
|
41
50
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
51
|
exports.runInit = runInit;
|
|
52
|
+
const node_child_process_1 = require("node:child_process");
|
|
43
53
|
const fs = __importStar(require("node:fs"));
|
|
44
54
|
const path = __importStar(require("node:path"));
|
|
45
|
-
|
|
46
|
-
|
|
55
|
+
/**
 * Inspect `cwd` for a Node.js project and gather what the scaffolder needs.
 *
 * @param cwd Directory expected to contain a package.json.
 * @returns `null` when package.json is missing, unreadable, not valid JSON, or
 *   valid JSON that is not an object; otherwise
 *   `{ cwd, pm, hasTestScript, testScript, name }`.
 */
function detectProject(cwd) {
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
    }
    catch {
        // Missing file, unreadable file and malformed JSON all mean "not a usable project".
        return null;
    }
    // JSON.parse happily returns null, arrays and primitives; none of them is a manifest,
    // and reading `.scripts` off null would throw.
    if (pkg === null || typeof pkg !== "object" || Array.isArray(pkg))
        return null;
    // The lockfile decides the package manager: pnpm is checked first, then yarn, npm is the fallback.
    let pm = "npm";
    if (fs.existsSync(path.join(cwd, "pnpm-lock.yaml")))
        pm = "pnpm";
    else if (fs.existsSync(path.join(cwd, "yarn.lock")))
        pm = "yarn";
    // Only a string is a runnable script; anything else is treated as "no test script".
    const rawTest = pkg.scripts?.test;
    const testScript = typeof rawTest === "string" ? rawTest : "";
    // This exact placeholder string is not counted as a real test script.
    const hasTestScript = testScript !== "" && testScript !== 'echo "Error: no test specified" && exit 1';
    // Fall back to the directory name when the manifest has no usable name.
    const name = typeof pkg.name === "string" && pkg.name !== "" ? pkg.name : path.basename(cwd);
    return {
        cwd,
        pm,
        hasTestScript,
        testScript,
        name,
    };
}
|
|
81
|
+
// ── Step helpers ──
|
|
82
|
+
function ok(msg) {
|
|
83
|
+
console.log(` ✔ ${msg}`);
|
|
84
|
+
}
|
|
85
|
+
function skip(msg) {
|
|
86
|
+
console.log(` – ${msg}`);
|
|
87
|
+
}
|
|
88
|
+
// ── 1. Create evals/ + baseline.json ──
|
|
89
|
+
/**
 * Extract a passed-test count from combined test-runner output.
 *
 * Patterns are tried in order and the first match wins. The last pattern
 * covers a two-line "Test Files N passed / Tests N passed" summary so that
 * the count written at init time is parsed the same way the gate parses it.
 *
 * @param output stdout + stderr of the test command.
 * @returns The captured count, or 0 when no pattern matches.
 */
function parseInitTestCount(output) {
    const patterns = [
        /(\d+)\s+(?:tests?|specs?)\s+(?:passed|completed)/i,
        /Tests:\s+(\d+)\s+passed/i,
        /(\d+)\s+passing/i,
        /Test Files\s+\d+\s+passed.*\n\s+Tests\s+(\d+)\s+passed/i,
    ];
    for (const pattern of patterns) {
        const match = output.match(pattern);
        if (match)
            return parseInt(match[1], 10);
    }
    return 0;
}
/**
 * Create `evals/baseline.json` under `cwd` unless it already exists.
 *
 * When the project has a test script, the tests are run once so the baseline
 * records whether they pass and, when the output can be parsed, how many ran.
 *
 * @param cwd Project root.
 * @param project Result of project detection; `pm`, `hasTestScript` and
 *   `name` are read here.
 * @returns Always `true` (both the "already exists" and "created" paths).
 */
function createBaseline(cwd, project) {
    const evalsDir = path.join(cwd, "evals");
    const baselinePath = path.join(evalsDir, "baseline.json");
    if (fs.existsSync(baselinePath)) {
        skip("evals/baseline.json already exists");
        return true;
    }
    // `recursive: true` makes this a no-op when the directory is already there.
    fs.mkdirSync(evalsDir, { recursive: true });
    const user = process.env.USER || process.env.USERNAME || "unknown";
    const now = new Date().toISOString();
    // Run tests to capture a real count if possible.
    let testTotal = 0;
    let testsPassed = true;
    if (project.hasTestScript) {
        const result = node_child_process_1.spawnSync(project.pm, ["test"], {
            cwd,
            stdio: "pipe",
            // NOTE(review): shell is enabled on Windows only — presumably because the
            // package managers are .cmd shims there; confirm.
            shell: process.platform === "win32",
            timeout: 120000,
            // The default 1 MiB cap terminates the child when a verbose suite
            // exceeds it, which would record a passing suite as failing.
            maxBuffer: 64 * 1024 * 1024,
        });
        testsPassed = result.status === 0;
        const output = (result.stdout?.toString() ?? "") + (result.stderr?.toString() ?? "");
        testTotal = parseInitTestCount(output);
    }
    const baseline = {
        schemaVersion: 1,
        description: `Regression gate baseline for ${project.name}`,
        generatedAt: now,
        generatedBy: user,
        commitSha: getHeadSha(cwd),
        updatedAt: now,
        updatedBy: user,
        tolerance: {
            scoreDrop: 5,
            passRateDrop: 5,
            maxLatencyIncreaseMs: 200,
            maxCostIncreaseUsd: 0.05,
        },
        goldenEval: {
            score: 100,
            passRate: 100,
            totalCases: 3,
            passedCases: 3,
        },
        confidenceTests: {
            passed: testsPassed,
            total: testTotal,
        },
        productMetrics: {},
    };
    fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
    ok("Created evals/baseline.json");
    return true;
}
|
|
151
|
+
/**
 * Ask git for the abbreviated hash of HEAD in `cwd`.
 *
 * @param cwd Directory in which `git rev-parse --short HEAD` is run.
 * @returns The trimmed stdout of git, or "0000000" when git produced no
 *   output or the call threw.
 */
function getHeadSha(cwd) {
    const FALLBACK_SHA = "0000000";
    let sha;
    try {
        const git = node_child_process_1.spawnSync("git", ["rev-parse", "--short", "HEAD"], { cwd, stdio: "pipe" });
        // stdout is absent when the process could not be spawned at all.
        sha = git.stdout == null ? "" : git.stdout.toString().trim();
    }
    catch {
        return FALLBACK_SHA;
    }
    return sha === "" ? FALLBACK_SHA : sha;
}
|
|
163
|
+
// ── 2. Install GitHub Actions workflow ──
|
|
164
|
+
/**
 * Write `.github/workflows/evalai-gate.yml` under `cwd` unless it already exists.
 *
 * The workflow is assembled line by line with explicit indentation so the
 * YAML nesting (on → pull_request, jobs → regression-gate → steps, …) cannot
 * be lost to template-literal whitespace.
 *
 * @param cwd Project root.
 * @param project Result of project detection; only `pm` is read here.
 * @returns Always `true` (both the "already exists" and "created" paths).
 */
function installWorkflow(cwd, project) {
    const workflowDir = path.join(cwd, ".github", "workflows");
    const workflowPath = path.join(workflowDir, "evalai-gate.yml");
    if (fs.existsSync(workflowPath)) {
        skip(".github/workflows/evalai-gate.yml already exists");
        return true;
    }
    // `recursive: true` makes this a no-op when the directory is already there.
    fs.mkdirSync(workflowDir, { recursive: true });
    const installCmd = project.pm === "pnpm"
        ? "pnpm install --frozen-lockfile"
        : project.pm === "yarn"
            ? "yarn install --frozen-lockfile"
            : "npm ci";
    // pnpm needs its own setup action before setup-node; the remaining setup
    // steps are identical for every package manager.
    const setupSteps = [];
    if (project.pm === "pnpm") {
        setupSteps.push("      - uses: pnpm/action-setup@v4");
    }
    setupSteps.push(
        "      - uses: actions/setup-node@v4",
        "        with:",
        "          node-version: '20'",
        `          cache: ${project.pm}`,
        `      - run: ${installCmd}`
    );
    const lines = [
        "# EvalAI Regression Gate",
        "# Auto-generated by: npx evalai init",
        "# Blocks PRs that regress test health.",
        "name: EvalAI Gate",
        "",
        "on:",
        "  pull_request:",
        "    branches: [main]",
        "",
        "concurrency:",
        // Plain string: `${{ … }}` is a GitHub Actions expression, not JS interpolation.
        "  group: evalai-${{ github.ref }}",
        "  cancel-in-progress: true",
        "",
        "jobs:",
        "  regression-gate:",
        "    runs-on: ubuntu-latest",
        "    steps:",
        "      - uses: actions/checkout@v4",
        ...setupSteps,
        "      - name: EvalAI Regression Gate",
        "        run: npx -y @pauly4010/evalai-sdk@^1 gate --format github",
        "",
        "      - name: Upload report",
        "        if: always()",
        "        uses: actions/upload-artifact@v4",
        "        with:",
        "          name: regression-report",
        "          path: evals/regression-report.json",
        "          if-no-files-found: ignore",
    ];
    fs.writeFileSync(workflowPath, `${lines.join("\n")}\n`);
    ok("Created .github/workflows/evalai-gate.yml");
    return true;
}
|
|
225
|
+
// ── 3. Create evalai.config.json ──
|
|
226
|
+
/**
 * Write a starter `evalai.config.json` into `cwd` unless one is already present.
 *
 * The generated config has an empty `evaluationId` and points the gate at
 * `evals/baseline.json` and `evals/regression-report.json`.
 *
 * @param cwd Project root.
 * @returns Always `true` (both the "already exists" and "created" paths).
 */
function createConfig(cwd) {
    const configPath = path.join(cwd, "evalai.config.json");
    if (!fs.existsSync(configPath)) {
        const serialized = JSON.stringify({
            evaluationId: "",
            gate: {
                baseline: "evals/baseline.json",
                report: "evals/regression-report.json",
            },
        }, null, 2);
        fs.writeFileSync(configPath, `${serialized}\n`);
        ok("Created evalai.config.json");
    }
    else {
        skip("evalai.config.json already exists");
    }
    return true;
}
|
|
243
|
+
// ── Main ──
|
|
244
|
+
/**
 * Entry point of `evalai init`: detect the project, scaffold the baseline,
 * the GitHub Actions workflow and the config file, then print next steps.
 *
 * @param cwd Project root; defaults to the current working directory.
 * @returns `false` when no usable project is detected in `cwd`, otherwise `true`.
 */
function runInit(cwd = process.cwd()) {
    console.log("");
    console.log(" evalai init — setting up regression gate\n");
    // Detect
    const project = detectProject(cwd);
    if (project) {
        ok(`Detected ${project.pm} project: ${project.name}`);
    }
    else {
        console.error(" ✖ No package.json found. Run this from a Node.js project root.");
        return false;
    }
    if (!project.hasTestScript) {
        console.log(` ⚠ No test script found in package.json`);
        console.log(` The gate will still work — add a "test" script later for full coverage.\n`);
    }
    // Scaffold
    createBaseline(cwd, project);
    installWorkflow(cwd, project);
    createConfig(cwd);
    // Next steps: one console.log call per entry, in this exact order.
    const nextSteps = [
        "",
        " Done! Next:",
        "",
        " git add evals/ .github/workflows/evalai-gate.yml evalai.config.json",
        " git commit -m 'chore: add EvalAI regression gate'",
        " git push",
        "",
        " That's it. Open a PR and the gate runs automatically.",
        "",
        " Commands:",
        " npx evalai gate Run gate locally",
        " npx evalai gate --format json Machine-readable output",
        " npx evalai baseline update Update baseline after intentional changes",
        "",
        " To remove: delete evals/, evalai.config.json, and .github/workflows/evalai-gate.yml",
        "",
    ];
    for (const line of nextSteps) {
        console.log(line);
    }
    return true;
}
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* evalai gate — Run the regression gate
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* Two modes:
|
|
5
|
+
* 1. Project mode: delegates to eval:regression-gate npm script (full gate)
|
|
6
|
+
* 2. Built-in mode: runs `npm test`, compares against evals/baseline.json
|
|
7
|
+
*
|
|
8
|
+
* Built-in mode activates when no eval:regression-gate script is defined,
|
|
9
|
+
* making `npx evalai gate` work for any project after `npx evalai init`.
|
|
6
10
|
*/
|
|
7
11
|
export interface GateArgs {
|
|
8
12
|
format: "human" | "json" | "github";
|
|
@@ -2,8 +2,12 @@
|
|
|
2
2
|
/**
|
|
3
3
|
* evalai gate — Run the regression gate
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
* Two modes:
|
|
6
|
+
* 1. Project mode: delegates to eval:regression-gate npm script (full gate)
|
|
7
|
+
* 2. Built-in mode: runs `npm test`, compares against evals/baseline.json
|
|
8
|
+
*
|
|
9
|
+
* Built-in mode activates when no eval:regression-gate script is defined,
|
|
10
|
+
* making `npx evalai gate` work for any project after `npx evalai init`.
|
|
7
11
|
*/
|
|
8
12
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
13
|
if (k2 === undefined) k2 = k;
|
|
@@ -45,6 +49,7 @@ const node_child_process_1 = require("node:child_process");
|
|
|
45
49
|
const fs = __importStar(require("node:fs"));
|
|
46
50
|
const path = __importStar(require("node:path"));
|
|
47
51
|
const REPORT_REL = "evals/regression-report.json";
|
|
52
|
+
const BASELINE_REL = "evals/baseline.json";
|
|
48
53
|
/** Detect the package manager used in the project */
|
|
49
54
|
function detectPackageManager(cwd) {
|
|
50
55
|
if (fs.existsSync(path.join(cwd, "pnpm-lock.yaml")))
|
|
@@ -66,10 +71,206 @@ function parseGateArgs(argv) {
|
|
|
66
71
|
}
|
|
67
72
|
return args;
|
|
68
73
|
}
|
|
74
|
+
function detectRunner(cwd) {
|
|
75
|
+
const pkgPath = path.join(cwd, "package.json");
|
|
76
|
+
try {
|
|
77
|
+
const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
|
|
78
|
+
const testCmd = pkg.scripts?.test ?? "";
|
|
79
|
+
if (testCmd.includes("vitest"))
|
|
80
|
+
return "vitest";
|
|
81
|
+
if (testCmd.includes("jest"))
|
|
82
|
+
return "jest";
|
|
83
|
+
if (testCmd.includes("mocha"))
|
|
84
|
+
return "mocha";
|
|
85
|
+
if (testCmd.includes("node --test"))
|
|
86
|
+
return "node:test";
|
|
87
|
+
if (testCmd.includes("ava"))
|
|
88
|
+
return "ava";
|
|
89
|
+
if (testCmd.includes("tap"))
|
|
90
|
+
return "tap";
|
|
91
|
+
}
|
|
92
|
+
catch {
|
|
93
|
+
// ignore
|
|
94
|
+
}
|
|
95
|
+
return "unknown";
|
|
96
|
+
}
|
|
97
|
+
/**
 * Build the report returned when the gate itself could not do its job
 * (as opposed to the project having regressed).
 *
 * @param message Single failure line placed in `failures`.
 * @param ctx `{ t0, now, command, runner, baseline? }` collected by the caller.
 * @returns A report with `exitCode: 2` and `category: "infra_error"`.
 */
function builtinInfraError(message, ctx) {
    return {
        schemaVersion: 1,
        timestamp: ctx.now,
        exitCode: 2,
        category: "infra_error",
        passed: false,
        failures: [message],
        deltas: [],
        baseline: ctx.baseline ?? null,
        durationMs: Date.now() - ctx.t0,
        command: ctx.command,
        runner: ctx.runner,
    };
}
/**
 * Built-in gate: run `<pm> test` in `cwd` and compare the outcome with
 * `evals/baseline.json`.
 *
 * Exit codes in the returned report: 0 = pass, 1 = regression,
 * 2 = infra_error (baseline missing/unparseable, or the test command could
 * not be run to completion).
 *
 * @param cwd Project root.
 * @returns The report object; nothing is written or printed here.
 */
function runBuiltinGate(cwd) {
    const t0 = Date.now();
    const now = new Date().toISOString();
    const baselinePath = path.join(cwd, BASELINE_REL);
    const pm = detectPackageManager(cwd);
    const command = `${pm} test`;
    const runner = detectRunner(cwd);
    const ctx = { t0, now, command, runner };
    // Load baseline
    if (!fs.existsSync(baselinePath)) {
        return builtinInfraError("Baseline file not found. Run: npx evalai init", ctx);
    }
    let baselineData;
    try {
        baselineData = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
    }
    catch {
        return builtinInfraError("Failed to parse evals/baseline.json", ctx);
    }
    // Valid JSON that is not an object (e.g. `null`) cannot be a baseline and
    // would otherwise crash on the property reads below.
    if (baselineData === null || typeof baselineData !== "object") {
        return builtinInfraError("Failed to parse evals/baseline.json", ctx);
    }
    const baselineMeta = baselineData.updatedAt
        ? { updatedAt: baselineData.updatedAt, updatedBy: baselineData.updatedBy ?? "unknown" }
        : null;
    // Run tests
    const result = node_child_process_1.spawnSync(pm, ["test"], {
        cwd,
        stdio: "pipe",
        // NOTE(review): shell is enabled on Windows only — presumably because the
        // package managers are .cmd shims there; confirm.
        shell: process.platform === "win32",
        timeout: 300000,
        // The default 1 MiB cap terminates the child when a verbose suite exceeds it.
        maxBuffer: 64 * 1024 * 1024,
    });
    // `error` is set when the process could not be spawned, timed out, or
    // overflowed maxBuffer. None of these says anything about test health, so
    // they are reported as an infrastructure problem rather than a regression.
    if (result.error) {
        return builtinInfraError(`Failed to run "${command}": ${result.error.message}`, { ...ctx, baseline: baselineMeta });
    }
    const testsPassed = result.status === 0;
    const output = (result.stdout?.toString() ?? "") + (result.stderr?.toString() ?? "");
    // Try to extract test count; the first matching pattern wins.
    const countMatch = output.match(/(\d+)\s+(?:tests?|specs?)\s+(?:passed|completed)/i) ??
        output.match(/Tests:\s+(\d+)\s+passed/i) ??
        output.match(/(\d+)\s+passing/i) ??
        output.match(/Test Files\s+\d+\s+passed.*\n\s+Tests\s+(\d+)\s+passed/i);
    const testCount = countMatch ? parseInt(countMatch[1], 10) : 0;
    // Compare against baseline; a baseline without confidenceTests counts as
    // "passing, 0 tests".
    const baselinePassed = baselineData.confidenceTests?.passed ?? true;
    const baselineTotal = baselineData.confidenceTests?.total ?? 0;
    const failures = [];
    const deltas = [];
    // Delta: tests passing
    deltas.push({
        metric: "tests_passing",
        baseline: baselinePassed,
        current: testsPassed,
        delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
        status: testsPassed ? "pass" : "fail",
    });
    // Only a pass → fail transition is a regression; a baseline that was
    // already failing does not add a failure here.
    if (!testsPassed && baselinePassed) {
        failures.push("Tests were passing in baseline but are now failing");
    }
    // Delta: test count (skipped when neither side has a count)
    if (testCount > 0 || baselineTotal > 0) {
        const countDelta = testCount - baselineTotal;
        deltas.push({
            metric: "test_count",
            baseline: baselineTotal,
            current: testCount,
            delta: countDelta >= 0 ? `+${countDelta}` : `${countDelta}`,
            status: testCount >= baselineTotal ? "pass" : "fail",
        });
        if (testCount < baselineTotal) {
            failures.push(`Test count dropped from ${baselineTotal} to ${testCount} (${countDelta})`);
        }
    }
    const hasRegression = failures.length > 0;
    return {
        schemaVersion: 1,
        timestamp: now,
        exitCode: hasRegression ? 1 : 0,
        category: hasRegression ? "regression" : "pass",
        passed: !hasRegression,
        failures,
        deltas,
        baseline: baselineMeta,
        durationMs: Date.now() - t0,
        command,
        runner,
    };
}
|
|
205
|
+
// ── Format helpers ──
|
|
206
|
+
function formatHuman(report) {
|
|
207
|
+
const icon = report.passed ? "✅" : "❌";
|
|
208
|
+
console.log(`\n${icon} EvalAI Gate: ${report.category.toUpperCase()}\n`);
|
|
209
|
+
if (report.deltas.length > 0) {
|
|
210
|
+
const pad = (s, n) => s.padEnd(n);
|
|
211
|
+
console.log(` ${pad("Metric", 16)} ${pad("Baseline", 10)} ${pad("Current", 10)} ${pad("Delta", 8)} Status`);
|
|
212
|
+
console.log(` ${"-".repeat(16)} ${"-".repeat(10)} ${"-".repeat(10)} ${"-".repeat(8)} ------`);
|
|
213
|
+
for (const d of report.deltas) {
|
|
214
|
+
const si = d.status === "pass" ? "✔" : "✖";
|
|
215
|
+
console.log(` ${pad(d.metric, 16)} ${pad(String(d.baseline), 10)} ${pad(String(d.current), 10)} ${pad(d.delta, 8)} ${si}`);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
if (report.failures.length > 0) {
|
|
219
|
+
console.log("\n Failures:");
|
|
220
|
+
for (const f of report.failures) {
|
|
221
|
+
console.log(` • ${f}`);
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
console.log("");
|
|
225
|
+
}
|
|
226
|
+
function formatGithub(report) {
|
|
227
|
+
const icon = report.passed ? "✅" : "❌";
|
|
228
|
+
const lines = [
|
|
229
|
+
`## ${icon} EvalAI Gate: ${report.category}`,
|
|
230
|
+
"",
|
|
231
|
+
"| Metric | Baseline | Current | Delta | Status |",
|
|
232
|
+
"|--------|----------|---------|-------|--------|",
|
|
233
|
+
];
|
|
234
|
+
for (const d of report.deltas) {
|
|
235
|
+
const si = d.status === "pass" ? "✅" : "❌";
|
|
236
|
+
lines.push(`| ${d.metric} | ${d.baseline} | ${d.current} | ${d.delta} | ${si} |`);
|
|
237
|
+
}
|
|
238
|
+
if (report.failures.length > 0) {
|
|
239
|
+
lines.push("", "### Failures", "");
|
|
240
|
+
for (const f of report.failures) {
|
|
241
|
+
lines.push(`- ${f}`);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
lines.push("", `Schema version: ${report.schemaVersion}`);
|
|
245
|
+
const md = lines.join("\n");
|
|
246
|
+
// Write to $GITHUB_STEP_SUMMARY if available
|
|
247
|
+
const summaryPath = process.env.GITHUB_STEP_SUMMARY;
|
|
248
|
+
if (summaryPath) {
|
|
249
|
+
try {
|
|
250
|
+
fs.appendFileSync(summaryPath, `${md}\n`);
|
|
251
|
+
}
|
|
252
|
+
catch {
|
|
253
|
+
// ignore if not writable
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
console.log(md);
|
|
257
|
+
}
|
|
258
|
+
/**
 * Emit a gate report in the format selected on the command line.
 *
 * @param report Gate report to output.
 * @param args Parsed gate arguments; only `format` is read. "json" writes
 *   the pretty-printed report to stdout, "github" renders markdown, and any
 *   other value falls back to the human-readable table.
 */
function formatReport(report, args) {
    switch (args.format) {
        case "json":
            process.stdout.write(JSON.stringify(report, null, 2));
            break;
        case "github":
            formatGithub(report);
            break;
        default:
            formatHuman(report);
            break;
    }
}
|
|
269
|
+
// ── Main ──
|
|
69
270
|
function runGate(argv) {
|
|
70
271
|
const cwd = process.cwd();
|
|
71
272
|
const args = parseGateArgs(argv);
|
|
72
|
-
// Check
|
|
273
|
+
// Check for package.json
|
|
73
274
|
const pkgPath = path.join(cwd, "package.json");
|
|
74
275
|
if (!fs.existsSync(pkgPath)) {
|
|
75
276
|
console.error("❌ No package.json found. Run this from your project root.");
|
|
@@ -83,68 +284,52 @@ function runGate(argv) {
|
|
|
83
284
|
console.error("❌ Failed to parse package.json");
|
|
84
285
|
return 1;
|
|
85
286
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
process.stdout.write(report);
|
|
107
|
-
}
|
|
108
|
-
else {
|
|
109
|
-
console.error(JSON.stringify({ error: "regression-report.json not found", exitCode }));
|
|
287
|
+
// ── Project mode: delegate to eval:regression-gate if it exists ──
|
|
288
|
+
if (pkg.scripts?.["eval:regression-gate"]) {
|
|
289
|
+
const pm = detectPackageManager(cwd);
|
|
290
|
+
const isWin = process.platform === "win32";
|
|
291
|
+
const stdio = args.format === "json" ? "pipe" : "inherit";
|
|
292
|
+
const result = (0, node_child_process_1.spawnSync)(pm, ["run", "eval:regression-gate"], {
|
|
293
|
+
cwd,
|
|
294
|
+
stdio: stdio,
|
|
295
|
+
shell: isWin,
|
|
296
|
+
});
|
|
297
|
+
const exitCode = result.status ?? 1;
|
|
298
|
+
// Post-process report for json/github formats
|
|
299
|
+
if (args.format === "json") {
|
|
300
|
+
const reportPath = path.join(cwd, REPORT_REL);
|
|
301
|
+
if (fs.existsSync(reportPath)) {
|
|
302
|
+
process.stdout.write(fs.readFileSync(reportPath, "utf-8"));
|
|
303
|
+
}
|
|
304
|
+
else {
|
|
305
|
+
console.error(JSON.stringify({ error: "regression-report.json not found", exitCode }));
|
|
306
|
+
}
|
|
110
307
|
}
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
const report = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
|
|
118
|
-
const icon = report.passed ? "✅" : "❌";
|
|
119
|
-
const lines = [
|
|
120
|
-
`## ${icon} Regression Gate: ${report.category}`,
|
|
121
|
-
"",
|
|
122
|
-
"| Metric | Baseline | Current | Delta | Status |",
|
|
123
|
-
"|--------|----------|---------|-------|--------|",
|
|
124
|
-
];
|
|
125
|
-
for (const d of report.deltas ?? []) {
|
|
126
|
-
const statusIcon = d.status === "pass" ? "✅" : "❌";
|
|
127
|
-
lines.push(`| ${d.metric} | ${d.baseline} | ${d.current} | ${d.delta} | ${statusIcon} |`);
|
|
128
|
-
}
|
|
129
|
-
if (report.failures?.length > 0) {
|
|
130
|
-
lines.push("", "### Failures", "");
|
|
131
|
-
for (const f of report.failures) {
|
|
132
|
-
lines.push(`- ${f}`);
|
|
133
|
-
}
|
|
308
|
+
else if (args.format === "github") {
|
|
309
|
+
const reportPath = path.join(cwd, REPORT_REL);
|
|
310
|
+
if (fs.existsSync(reportPath)) {
|
|
311
|
+
try {
|
|
312
|
+
const report = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
|
|
313
|
+
formatGithub(report);
|
|
134
314
|
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
// Write to $GITHUB_STEP_SUMMARY if available
|
|
138
|
-
const summaryPath = process.env.GITHUB_STEP_SUMMARY;
|
|
139
|
-
if (summaryPath) {
|
|
140
|
-
fs.appendFileSync(summaryPath, `${md}\n`);
|
|
315
|
+
catch {
|
|
316
|
+
// human output already printed
|
|
141
317
|
}
|
|
142
|
-
console.log(md);
|
|
143
|
-
}
|
|
144
|
-
catch {
|
|
145
|
-
// Fall through — human output already printed
|
|
146
318
|
}
|
|
147
319
|
}
|
|
320
|
+
return exitCode;
|
|
321
|
+
}
|
|
322
|
+
// ── Built-in mode: run tests + compare against baseline ──
|
|
323
|
+
if (args.format === "human") {
|
|
324
|
+
console.log("\n Running EvalAI regression gate (built-in mode)...\n");
|
|
325
|
+
}
|
|
326
|
+
const report = runBuiltinGate(cwd);
|
|
327
|
+
// Write report artifact
|
|
328
|
+
const evalsDir = path.join(cwd, "evals");
|
|
329
|
+
if (!fs.existsSync(evalsDir)) {
|
|
330
|
+
fs.mkdirSync(evalsDir, { recursive: true });
|
|
148
331
|
}
|
|
149
|
-
|
|
332
|
+
fs.writeFileSync(path.join(cwd, REPORT_REL), `${JSON.stringify(report, null, 2)}\n`);
|
|
333
|
+
formatReport(report, args);
|
|
334
|
+
return report.exitCode;
|
|
150
335
|
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalai upgrade --full — Upgrade from Tier 1 (built-in gate) to Tier 2 (full gate)
|
|
3
|
+
*
|
|
4
|
+
* What it does:
|
|
5
|
+
* 1. Adds full regression gate script (scripts/regression-gate.ts)
|
|
6
|
+
* 2. Adds baseline governance workflow (.github/workflows/baseline-governance.yml)
|
|
7
|
+
* 3. Updates package.json with eval:regression-gate + eval:baseline-update scripts
|
|
8
|
+
* 4. Updates .github/workflows/evalai-gate.yml to use project mode
|
|
9
|
+
* 5. Prints next steps
|
|
10
|
+
*/
|
|
11
|
+
/** Parsed command-line options for `evalai upgrade`. */
export interface UpgradeArgs {
    /** NOTE(review): presumably set by the `--full` flag — confirm against parseUpgradeArgs. */
    full: boolean;
}
/**
 * Turn raw CLI arguments into {@link UpgradeArgs}.
 *
 * @param argv Argument strings for the `upgrade` sub-command.
 */
export declare function parseUpgradeArgs(argv: string[]): UpgradeArgs;
/**
 * Run the `upgrade` sub-command.
 *
 * @param argv Argument strings for the `upgrade` sub-command.
 * @returns A number — NOTE(review): presumably the process exit code; confirm.
 */
export declare function runUpgrade(argv: string[]): number;
|