@pauly4010/evalai-sdk 1.5.8 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,491 @@
1
+ "use strict";
2
+ /**
3
+ * evalai upgrade --full — Upgrade from Tier 1 (built-in gate) to Tier 2 (full gate)
4
+ *
5
+ * What it does:
6
+ * 1. Adds full regression gate script (scripts/regression-gate.ts)
7
+ * 2. Adds baseline governance workflow (.github/workflows/baseline-governance.yml)
8
+ * 3. Updates package.json with eval:regression-gate + eval:baseline-update scripts
9
+ * 4. Updates .github/workflows/evalai-gate.yml to use project mode
10
+ * 5. Prints next steps
11
+ */
12
// Module-interop helpers. Their shape matches what the TypeScript compiler emits for
// `import * as x from "..."` when targeting CommonJS (presumably generated, not hand-written — confirm against the build).
// Each helper reuses an existing `this.__<name>` when one is already defined.
//
// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (k2 defaults to k).
// When m[k] has no usable descriptor, or is a writable/configurable data property, or is an
// accessor on a non-__esModule object, it is re-exposed through an enumerable getter so the
// binding stays live. Without Object.create it falls back to a plain copy.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
13
+ if (k2 === undefined) k2 = k;
14
+ var desc = Object.getOwnPropertyDescriptor(m, k);
15
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
16
+ desc = { enumerable: true, get: function() { return m[k]; } };
17
+ }
18
+ Object.defineProperty(o, k2, desc);
19
+ }) : (function(o, m, k, k2) {
20
+ if (k2 === undefined) k2 = k;
21
+ o[k2] = m[k];
22
+ }));
23
// __setModuleDefault(o, v): sets o.default = v (as an enumerable property when Object.create exists).
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
24
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
25
+ }) : function(o, v) {
26
+ o["default"] = v;
27
+ });
28
// __importStar(mod): returns mod unchanged when mod.__esModule is set. Otherwise it builds a new
// object with a binding for every own property name of mod except "default", and sets
// result.default to mod itself.
// ownKeys replaces itself on first call with Object.getOwnPropertyNames (or a for-in fallback).
+ var __importStar = (this && this.__importStar) || (function () {
29
+ var ownKeys = function(o) {
30
+ ownKeys = Object.getOwnPropertyNames || function (o) {
31
+ var ar = [];
32
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
33
+ return ar;
34
+ };
35
+ return ownKeys(o);
36
+ };
37
+ return function (mod) {
38
+ if (mod && mod.__esModule) return mod;
39
+ var result = {};
40
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
41
+ __setModuleDefault(result, mod);
42
+ return result;
43
+ };
44
+ })();
45
+ Object.defineProperty(exports, "__esModule", { value: true });
46
+ exports.parseUpgradeArgs = parseUpgradeArgs;
47
+ exports.runUpgrade = runUpgrade;
48
+ const fs = __importStar(require("node:fs"));
49
+ const path = __importStar(require("node:path"));
50
+ // ── Detect environment ──
51
/**
 * Guesses the package manager of the project in `cwd` from its lockfile.
 *
 * Lockfiles are checked in priority order: pnpm-lock.yaml first, then yarn.lock.
 * When neither exists the answer is "npm".
 *
 * @param cwd Directory to inspect.
 * @returns "pnpm", "yarn" or "npm".
 */
function detectPackageManager(cwd) {
    const lockfileTable = [
        ["pnpm-lock.yaml", "pnpm"],
        ["yarn.lock", "yarn"],
    ];
    for (const [lockfile, manager] of lockfileTable) {
        const candidate = path.join(cwd, lockfile);
        if (fs.existsSync(candidate)) {
            return manager;
        }
    }
    return "npm";
}
58
+ function ok(msg) {
59
+ console.log(` ✔ ${msg}`);
60
+ }
61
+ function skip(msg) {
62
+ console.log(` – ${msg}`);
63
+ }
64
+ // ── 1. Create scripts/regression-gate.ts ──
65
/**
 * Step 1 of the upgrade: writes scripts/regression-gate.ts under `cwd`.
 *
 * If the file already exists it is left untouched and a "skip" line is printed.
 * Otherwise the scripts/ directory is created when missing and the template below is written as-is.
 *
 * What the generated script does (all of it lives in the template literal):
 *   - loads evals/baseline.json; when it cannot be read or parsed it writes an
 *     "infra_error" report and exits with code 2;
 *   - runs `npm test` through spawnSync with a 300000 ms timeout and scrapes a test
 *     count out of the combined stdout/stderr with three regexes (0 when none match);
 *   - with --update-baseline: merges `passed`/`total` into baseline.confidenceTests,
 *     rewrites the baseline and exits 0 (this also happens when the tests failed;
 *     the script only prints "FAILING" in that case);
 *   - otherwise: records `tests_passing` and `test_count` deltas, reports a regression
 *     when baseline-passing tests now fail or the test count dropped, writes
 *     evals/regression-report.json and exits 1 on regression, 0 otherwise.
 *
 * NOTE(review): the template declares CONFIDENCE_PATH but nothing in the template uses it.
 *
 * @param cwd Project root in which scripts/regression-gate.ts is created.
 * @returns Always true (both when the file is skipped and when it is created).
 */
+ function createGateScript(cwd) {
66
+ const scriptPath = path.join(cwd, "scripts", "regression-gate.ts");
67
+ if (fs.existsSync(scriptPath)) {
68
+ skip("scripts/regression-gate.ts already exists");
69
+ return true;
70
+ }
71
+ const scriptsDir = path.join(cwd, "scripts");
72
+ if (!fs.existsSync(scriptsDir)) {
73
+ fs.mkdirSync(scriptsDir, { recursive: true });
74
+ }
// Template for the generated TypeScript file. Backslash-escaped sequences in it (an escaped
// newline, escaped dollar-brace and escaped backticks) are there so the generated file
// contains those characters literally instead of being interpolated here.
75
+ const content = `#!/usr/bin/env npx tsx
76
+ /**
77
+ * Full regression gate — compares current test results against baseline.
78
+ *
79
+ * Usage:
80
+ * npx tsx scripts/regression-gate.ts # run gate
81
+ * npx tsx scripts/regression-gate.ts --update-baseline # update baseline with current values
82
+ *
83
+ * Generated by: npx evalai upgrade --full
84
+ */
85
+ import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
86
+ import { execSync, spawnSync } from "node:child_process";
87
+ import { resolve } from "node:path";
88
+
89
+ const BASELINE_PATH = resolve("evals/baseline.json");
90
+ const REPORT_PATH = resolve("evals/regression-report.json");
91
+ const CONFIDENCE_PATH = resolve("evals/confidence-summary.json");
92
+
93
+ const isUpdateBaseline = process.argv.includes("--update-baseline");
94
+
95
+ // ── Helpers ──
96
+
97
+ function loadJSON(p: string): Record<string, unknown> | null {
98
+ try {
99
+ return JSON.parse(readFileSync(p, "utf-8"));
100
+ } catch {
101
+ return null;
102
+ }
103
+ }
104
+
105
+ function getHeadSha(): string {
106
+ try {
107
+ return execSync("git rev-parse --short HEAD").toString().trim();
108
+ } catch {
109
+ return "0000000";
110
+ }
111
+ }
112
+
113
+ function writeReport(report: Record<string, unknown>): void {
114
+ const dir = resolve("evals");
115
+ if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
116
+ writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2) + "\\n");
117
+ }
118
+
119
+ // ── Run tests ──
120
+
121
+ function runTests(): { passed: boolean; total: number; durationMs: number } {
122
+ const t0 = Date.now();
123
+ const result = spawnSync("npm", ["test"], {
124
+ stdio: "pipe",
125
+ shell: process.platform === "win32",
126
+ timeout: 300_000,
127
+ });
128
+ const durationMs = Date.now() - t0;
129
+ const passed = result.status === 0;
130
+ const output = (result.stdout?.toString() ?? "") + (result.stderr?.toString() ?? "");
131
+
132
+ let total = 0;
133
+ const m =
134
+ output.match(/(\\d+)\\s+(?:tests?|specs?)\\s+(?:passed|completed)/i) ??
135
+ output.match(/Tests:\\s+(\\d+)\\s+passed/i) ??
136
+ output.match(/(\\d+)\\s+passing/i);
137
+ if (m) total = parseInt(m[1], 10);
138
+
139
+ return { passed, total, durationMs };
140
+ }
141
+
142
+ // ── Main ──
143
+
144
+ const baseline = loadJSON(BASELINE_PATH);
145
+ if (!baseline) {
146
+ console.error("❌ Baseline not found. Run: npx evalai init");
147
+ const report = {
148
+ schemaVersion: 1,
149
+ timestamp: new Date().toISOString(),
150
+ exitCode: 2,
151
+ category: "infra_error",
152
+ passed: false,
153
+ failures: ["Baseline file not found"],
154
+ deltas: [],
155
+ baseline: null,
156
+ durationMs: 0,
157
+ command: "npm test",
158
+ runner: "unknown",
159
+ };
160
+ writeReport(report);
161
+ process.exit(2);
162
+ }
163
+
164
+ const tests = runTests();
165
+
166
+ if (isUpdateBaseline) {
167
+ const user = process.env.USER || process.env.USERNAME || "unknown";
168
+ const now = new Date().toISOString();
169
+ const updated = {
170
+ ...baseline,
171
+ updatedAt: now,
172
+ updatedBy: user,
173
+ commitSha: getHeadSha(),
174
+ confidenceTests: {
175
+ ...(baseline.confidenceTests as Record<string, unknown> ?? {}),
176
+ passed: tests.passed,
177
+ total: tests.total,
178
+ },
179
+ };
180
+ writeFileSync(BASELINE_PATH, JSON.stringify(updated, null, 2) + "\\n");
181
+ console.log("✅ Baseline updated with current test results");
182
+ console.log(\` Tests: \${tests.total} (\${tests.passed ? "passing" : "FAILING"})\`);
183
+ process.exit(0);
184
+ }
185
+
186
+ // ── Compare ──
187
+
188
+ const bConf = baseline.confidenceTests as { passed?: boolean; total?: number } | undefined;
189
+ const baselinePassed = bConf?.passed ?? true;
190
+ const baselineTotal = bConf?.total ?? 0;
191
+
192
+ const failures: string[] = [];
193
+ const deltas: Array<Record<string, unknown>> = [];
194
+
195
+ deltas.push({
196
+ metric: "tests_passing",
197
+ baseline: baselinePassed,
198
+ current: tests.passed,
199
+ delta: tests.passed === baselinePassed ? "0" : tests.passed ? "+1" : "-1",
200
+ status: tests.passed ? "pass" : "fail",
201
+ });
202
+
203
+ if (!tests.passed && baselinePassed) {
204
+ failures.push("Tests were passing in baseline but are now failing");
205
+ }
206
+
207
+ if (tests.total > 0 || baselineTotal > 0) {
208
+ const d = tests.total - baselineTotal;
209
+ deltas.push({
210
+ metric: "test_count",
211
+ baseline: baselineTotal,
212
+ current: tests.total,
213
+ delta: d >= 0 ? \`+\${d}\` : \`\${d}\`,
214
+ status: tests.total >= baselineTotal ? "pass" : "fail",
215
+ });
216
+ if (tests.total < baselineTotal) {
217
+ failures.push(\`Test count dropped from \${baselineTotal} to \${tests.total} (\${d})\`);
218
+ }
219
+ }
220
+
221
+ const hasRegression = failures.length > 0;
222
+ const report = {
223
+ schemaVersion: 1,
224
+ timestamp: new Date().toISOString(),
225
+ exitCode: hasRegression ? 1 : 0,
226
+ category: hasRegression ? "regression" : "pass",
227
+ passed: !hasRegression,
228
+ failures,
229
+ deltas,
230
+ baseline: {
231
+ updatedAt: (baseline.updatedAt as string) ?? "unknown",
232
+ updatedBy: (baseline.updatedBy as string) ?? "unknown",
233
+ },
234
+ durationMs: tests.durationMs,
235
+ command: "npm test",
236
+ runner: "unknown",
237
+ };
238
+
239
+ writeReport(report);
240
+
241
+ if (hasRegression) {
242
+ console.error("❌ REGRESSION DETECTED");
243
+ for (const f of failures) console.error(\` \${f}\`);
244
+ } else {
245
+ console.log("✅ NO REGRESSION — gate passed");
246
+ }
247
+
248
+ for (const d of deltas) {
249
+ const icon = d.status === "pass" ? "✔" : "✖";
250
+ console.log(\` \${icon} \${d.metric}: \${d.baseline} → \${d.current} (\${d.delta})\`);
251
+ }
252
+
253
+ process.exit(report.exitCode);
254
+ `;
// Write the template to disk; no file mode is set, so the script is run via `npx tsx <path>`.
255
+ fs.writeFileSync(scriptPath, content);
256
+ ok("Created scripts/regression-gate.ts");
257
+ return true;
258
+ }
259
+ // ── 2. Add npm scripts to package.json ──
260
+ function addNpmScripts(cwd) {
261
+ const pkgPath = path.join(cwd, "package.json");
262
+ if (!fs.existsSync(pkgPath))
263
+ return false;
264
+ let pkg;
265
+ try {
266
+ pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
267
+ }
268
+ catch {
269
+ return false;
270
+ }
271
+ const scripts = (pkg.scripts ?? {});
272
+ let changed = false;
273
+ if (!scripts["eval:regression-gate"]) {
274
+ scripts["eval:regression-gate"] = "npx tsx scripts/regression-gate.ts";
275
+ changed = true;
276
+ }
277
+ if (!scripts["eval:baseline-update"]) {
278
+ scripts["eval:baseline-update"] = "npx tsx scripts/regression-gate.ts --update-baseline";
279
+ changed = true;
280
+ }
281
+ if (changed) {
282
+ pkg.scripts = scripts;
283
+ fs.writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
284
+ ok("Added eval:regression-gate and eval:baseline-update scripts to package.json");
285
+ }
286
+ else {
287
+ skip("eval:regression-gate and eval:baseline-update scripts already exist");
288
+ }
289
+ return true;
290
+ }
291
+ // ── 3. Create baseline governance workflow ──
292
+ function createGovernanceWorkflow(cwd) {
293
+ const workflowDir = path.join(cwd, ".github", "workflows");
294
+ const workflowPath = path.join(workflowDir, "baseline-governance.yml");
295
+ if (fs.existsSync(workflowPath)) {
296
+ skip(".github/workflows/baseline-governance.yml already exists");
297
+ return true;
298
+ }
299
+ if (!fs.existsSync(workflowDir)) {
300
+ fs.mkdirSync(workflowDir, { recursive: true });
301
+ }
302
+ const workflow = `# Baseline Governance — requires label + approval for baseline changes
303
+ # Auto-generated by: npx evalai upgrade --full
304
+ name: Baseline Governance
305
+
306
+ on:
307
+ pull_request:
308
+ paths:
309
+ - 'evals/baseline.json'
310
+
311
+ jobs:
312
+ governance:
313
+ runs-on: ubuntu-latest
314
+ steps:
315
+ - uses: actions/checkout@v4
316
+
317
+ - name: Check label
318
+ run: |
319
+ LABELS=\${{ toJSON(github.event.pull_request.labels.*.name) }}
320
+ if echo "$LABELS" | grep -q "baseline-update"; then
321
+ echo "✅ baseline-update label found"
322
+ elif echo "$LABELS" | grep -q "baseline-exception"; then
323
+ echo "⚠️ baseline-exception label found — bypassing delta checks"
324
+ else
325
+ echo "❌ Missing 'baseline-update' label"
326
+ echo "Add the 'baseline-update' label to this PR to update the baseline."
327
+ exit 1
328
+ fi
329
+
330
+ - name: Show baseline diff
331
+ run: |
332
+ echo "## Baseline Changes" >> "$GITHUB_STEP_SUMMARY"
333
+ echo "" >> "$GITHUB_STEP_SUMMARY"
334
+ echo "\\\`\\\`\\\`diff" >> "$GITHUB_STEP_SUMMARY"
335
+ git diff HEAD~1 -- evals/baseline.json >> "$GITHUB_STEP_SUMMARY" || echo "No previous baseline" >> "$GITHUB_STEP_SUMMARY"
336
+ echo "\\\`\\\`\\\`" >> "$GITHUB_STEP_SUMMARY"
337
+ `;
338
+ fs.writeFileSync(workflowPath, workflow);
339
+ ok("Created .github/workflows/baseline-governance.yml");
340
+ return true;
341
+ }
342
+ // ── 4. Upgrade evalai-gate.yml to project mode ──
343
/**
 * Step 4 of the upgrade: replaces .github/workflows/evalai-gate.yml with the Tier 2 workflow.
 *
 * Outcomes:
 *   - workflow file missing: prints a skip line and returns false;
 *   - file already mentions "eval:regression-gate": prints a skip line, returns true, nothing is written;
 *   - otherwise: the WHOLE file is overwritten with the template below (prior contents are not
 *     kept or backed up) and true is returned.
 *
 * The install command and setup steps depend on detectPackageManager(cwd): pnpm gets an extra
 * pnpm/action-setup step and `pnpm install --frozen-lockfile`; yarn and npm share the second
 * branch, with the package manager name used as the setup-node cache key.
 *
 * @param cwd Project root.
 * @returns false only when the workflow file does not exist; true otherwise.
 */
+ function upgradeGateWorkflow(cwd) {
344
+ const pm = detectPackageManager(cwd);
345
+ const workflowPath = path.join(cwd, ".github", "workflows", "evalai-gate.yml");
346
+ if (!fs.existsSync(workflowPath)) {
347
+ skip("No .github/workflows/evalai-gate.yml found — run evalai init first");
348
+ return false;
349
+ }
350
+ const content = fs.readFileSync(workflowPath, "utf-8");
351
+ // Already upgraded?
352
+ if (content.includes("eval:regression-gate")) {
353
+ skip("evalai-gate.yml already uses project mode");
354
+ return true;
355
+ }
// Lockfile-respecting install command for the detected package manager.
356
+ const installCmd = pm === "pnpm"
357
+ ? "pnpm install --frozen-lockfile"
358
+ : pm === "yarn"
359
+ ? "yarn install --frozen-lockfile"
360
+ : "npm ci";
// YAML fragment spliced into the job's `steps:` list in the workflow template.
361
+ const setupSteps = pm === "pnpm"
362
+ ? ` - uses: pnpm/action-setup@v4
363
+ - uses: actions/setup-node@v4
364
+ with:
365
+ node-version: '20'
366
+ cache: pnpm
367
+ - run: ${installCmd}`
368
+ : ` - uses: actions/setup-node@v4
369
+ with:
370
+ node-version: '20'
371
+ cache: ${pm}
372
+ - run: ${installCmd}`;
// Full replacement workflow. The escaped dollar-brace keeps the GitHub expression literal in
// the output, while the unescaped ones interpolate setupSteps and pm here.
373
+ const workflow = `# EvalAI Regression Gate (Full / Tier 2)
374
+ # Upgraded by: npx evalai upgrade --full
375
+ name: EvalAI Gate
376
+
377
+ on:
378
+ pull_request:
379
+ branches: [main]
380
+
381
+ concurrency:
382
+ group: evalai-\${{ github.ref }}
383
+ cancel-in-progress: true
384
+
385
+ jobs:
386
+ regression-gate:
387
+ runs-on: ubuntu-latest
388
+ steps:
389
+ - uses: actions/checkout@v4
390
+ ${setupSteps}
391
+ - name: Run regression gate
392
+ run: ${pm} run eval:regression-gate
393
+
394
+ - name: Gate summary
395
+ if: always()
396
+ run: npx -y @pauly4010/evalai-sdk@^1 gate --format github
397
+
398
+ - name: Upload report
399
+ if: always()
400
+ uses: actions/upload-artifact@v4
401
+ with:
402
+ name: regression-report
403
+ path: evals/regression-report.json
404
+ if-no-files-found: ignore
405
+ `;
406
+ fs.writeFileSync(workflowPath, workflow);
407
+ ok("Upgraded .github/workflows/evalai-gate.yml to project mode (Tier 2)");
408
+ return true;
409
+ }
410
+ // ── 5. Add CODEOWNERS entry ──
411
+ function addCodeowners(cwd) {
412
+ const codeownersPath = path.join(cwd, ".github", "CODEOWNERS");
413
+ const entry = "evals/baseline.json";
414
+ if (fs.existsSync(codeownersPath)) {
415
+ const content = fs.readFileSync(codeownersPath, "utf-8");
416
+ if (content.includes(entry)) {
417
+ skip("CODEOWNERS already has evals/baseline.json entry");
418
+ return true;
419
+ }
420
+ fs.appendFileSync(codeownersPath, `\n# EvalAI baseline — requires approval\n${entry} @YOUR_TEAM\n`);
421
+ }
422
+ else {
423
+ const dir = path.join(cwd, ".github");
424
+ if (!fs.existsSync(dir))
425
+ fs.mkdirSync(dir, { recursive: true });
426
+ fs.writeFileSync(codeownersPath, `# EvalAI baseline — requires approval\n${entry} @YOUR_TEAM\n`);
427
+ }
428
+ ok("Added evals/baseline.json to .github/CODEOWNERS (edit @YOUR_TEAM)");
429
+ return true;
430
+ }
431
/**
 * Extracts the options understood by `evalai upgrade` from the raw argument list.
 *
 * @param argv Command-line arguments passed to the upgrade command.
 * @returns An object whose `full` flag is true when "--full" appears in `argv`.
 */
function parseUpgradeArgs(argv) {
    const full = argv.includes("--full");
    return { full };
}
434
+ // ── Main ──
435
+ function runUpgrade(argv) {
436
+ const args = parseUpgradeArgs(argv);
437
+ const cwd = process.cwd();
438
+ if (!args.full) {
439
+ console.log(`evalai upgrade — Upgrade regression gate
440
+
441
+ Usage:
442
+ evalai upgrade --full Upgrade from Tier 1 (built-in) to Tier 2 (full gate)
443
+
444
+ What --full does:
445
+ 1. Creates scripts/regression-gate.ts (full gate script)
446
+ 2. Adds eval:regression-gate + eval:baseline-update npm scripts
447
+ 3. Creates baseline governance workflow
448
+ 4. Upgrades CI workflow to project mode
449
+ 5. Adds CODEOWNERS entry for baseline
450
+
451
+ After upgrading:
452
+ - evalai gate delegates to your eval:regression-gate script
453
+ - Baseline changes require PR label + approval
454
+ - Full metric comparison: golden eval, confidence, latency, cost
455
+ `);
456
+ return argv.includes("--help") || argv.includes("-h") ? 0 : 1;
457
+ }
458
+ console.log("");
459
+ console.log(" evalai upgrade --full — upgrading to Tier 2\n");
460
+ // Check preconditions
461
+ const pkgPath = path.join(cwd, "package.json");
462
+ if (!fs.existsSync(pkgPath)) {
463
+ console.error(" ✖ No package.json found. Run this from a Node.js project root.");
464
+ return 1;
465
+ }
466
+ if (!fs.existsSync(path.join(cwd, "evals", "baseline.json"))) {
467
+ console.error(" ✖ No evals/baseline.json found. Run 'npx evalai init' first.");
468
+ return 1;
469
+ }
470
+ createGateScript(cwd);
471
+ addNpmScripts(cwd);
472
+ createGovernanceWorkflow(cwd);
473
+ upgradeGateWorkflow(cwd);
474
+ addCodeowners(cwd);
475
+ console.log("");
476
+ console.log(" Done! Your repo is now Tier 2.\n");
477
+ console.log(" What changed:");
478
+ console.log(" - scripts/regression-gate.ts Full gate script");
479
+ console.log(" - package.json eval:regression-gate + eval:baseline-update");
480
+ console.log(" - .github/workflows/ Gate + governance workflows");
481
+ console.log(" - .github/CODEOWNERS Baseline requires approval\n");
482
+ console.log(" Next:");
483
+ console.log(" git add -A");
484
+ console.log(" git commit -m 'chore: upgrade EvalAI gate to Tier 2'");
485
+ console.log(" git push\n");
486
+ console.log(" Commands:");
487
+ console.log(" npx evalai gate Run full gate locally");
488
+ console.log(" npx evalai baseline update Update baseline with real scores");
489
+ console.log("");
490
+ return 0;
491
+ }
@@ -1 +1 @@
1
- export {};
1
+ export {};