@pauly4010/evalai-sdk 1.6.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/init.js CHANGED
@@ -1,9 +1,18 @@
1
1
  #!/usr/bin/env node
2
2
  "use strict";
3
3
  /**
4
- * evalai init — Create evalai.config.json
4
+ * evalai init — Full project scaffolder
5
5
  *
6
- * Creates the smallest possible config file. Defaults belong in code.
6
+ * Zero-to-gate in under 5 minutes:
7
+ * npx evalai init
8
+ * git push
9
+ * …CI starts blocking regressions.
10
+ *
11
+ * What it does:
12
+ * 1. Detects Node repo + package manager
13
+ * 2. Creates evals/ directory + baseline.json
14
+ * 3. Installs .github/workflows/evalai-gate.yml
15
+ * 4. Prints next steps (no docs required)
7
16
  */
8
17
  var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
9
18
  if (k2 === undefined) k2 = k;
@@ -40,30 +49,244 @@ var __importStar = (this && this.__importStar) || (function () {
40
49
  })();
41
50
  Object.defineProperty(exports, "__esModule", { value: true });
42
51
  exports.runInit = runInit;
52
+ const node_child_process_1 = require("node:child_process");
43
53
  const fs = __importStar(require("node:fs"));
44
54
  const path = __importStar(require("node:path"));
45
- const CONFIG_CONTENT = `{
46
- "evaluationId": ""
55
/**
 * Inspect `cwd` for a Node.js project and work out how to drive it.
 *
 * The package manager is inferred from lockfiles: pnpm-lock.yaml wins over
 * yarn.lock, and npm is the fallback.
 *
 * @param {string} cwd - Directory expected to contain package.json.
 * @returns {{cwd: string, pm: string, hasTestScript: boolean, testScript: string, name: string} | null}
 *   Project description, or null when package.json is missing, unreadable,
 *   not valid JSON, or not a JSON object.
 */
function detectProject(cwd) {
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath))
        return null;
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch {
        return null;
    }
    // JSON.parse happily returns null, arrays and scalars; only a plain
    // object can be a manifest, and `null.scripts` would throw below.
    if (pkg === null || typeof pkg !== "object" || Array.isArray(pkg))
        return null;
    let pm = "npm";
    if (fs.existsSync(path.join(cwd, "pnpm-lock.yaml")))
        pm = "pnpm";
    else if (fs.existsSync(path.join(cwd, "yarn.lock")))
        pm = "yarn";
    // Only a string can be a runnable script; anything else counts as absent.
    const declaredTest = pkg.scripts?.test;
    const testScript = typeof declaredTest === "string" ? declaredTest : "";
    // The second operand is the placeholder script that `npm init` generates.
    const hasTestScript = testScript !== "" && testScript !== 'echo "Error: no test specified" && exit 1';
    return {
        cwd,
        pm,
        hasTestScript,
        testScript,
        name: pkg.name ?? path.basename(cwd),
    };
}
81
+ // ── Step helpers ──
82
/**
 * Report a scaffold step that completed.
 *
 * @param {string} msg - Text printed after the check mark.
 */
function ok(msg) {
    const line = " ✔ ".concat(msg);
    console.log(line);
}
85
/**
 * Report a scaffold step that was skipped because nothing needed doing.
 *
 * @param {string} msg - Text printed after the dash.
 */
function skip(msg) {
    const line = " – ".concat(msg);
    console.log(line);
}
88
+ // ── 1. Create evals/ + baseline.json ──
89
/**
 * Scaffold step: make sure evals/baseline.json exists.
 *
 * An existing baseline is left untouched. Otherwise the evals/ directory is
 * created if needed, the project's test script (when it has one) is run once
 * through the detected package manager, and a baseline document is written.
 *
 * @param {string} cwd - Project root.
 * @param {{pm: string, hasTestScript: boolean, name: string}} project - Result of detectProject.
 * @returns {boolean} Always true.
 */
function createBaseline(cwd, project) {
    const evalsDir = path.join(cwd, "evals");
    const baselineFile = path.join(evalsDir, "baseline.json");
    if (fs.existsSync(baselineFile)) {
        skip("evals/baseline.json already exists");
        return true;
    }
    if (!fs.existsSync(evalsDir)) {
        fs.mkdirSync(evalsDir, { recursive: true });
    }
    const author = process.env.USER || process.env.USERNAME || "unknown";
    // Captured before the test run, so it marks when init started.
    const timestamp = new Date().toISOString();
    // Defaults describe "no test script": nothing failed, nothing counted.
    const confidence = { passed: true, total: 0 };
    if (project.hasTestScript) {
        const run = (0, node_child_process_1.spawnSync)(project.pm, ["test"], {
            cwd,
            stdio: "pipe",
            shell: process.platform === "win32",
            timeout: 120000,
        });
        confidence.passed = run.status === 0;
        const asText = (buf) => buf?.toString() ?? "";
        const combined = asText(run.stdout) + asText(run.stderr);
        // Tried in order; the first pattern that matches supplies the count.
        const countPatterns = [
            /(\d+)\s+(?:tests?|specs?)\s+(?:passed|completed)/i,
            /Tests:\s+(\d+)\s+passed/i,
            /(\d+)\s+passing/i,
        ];
        let hit = null;
        for (const pattern of countPatterns) {
            hit = combined.match(pattern);
            if (hit)
                break;
        }
        if (hit) {
            confidence.total = parseInt(hit[1], 10);
        }
    }
    const baseline = {
        schemaVersion: 1,
        description: `Regression gate baseline for ${project.name}`,
        generatedAt: timestamp,
        generatedBy: author,
        commitSha: getHeadSha(cwd),
        updatedAt: timestamp,
        updatedBy: author,
        tolerance: {
            scoreDrop: 5,
            passRateDrop: 5,
            maxLatencyIncreaseMs: 200,
            maxCostIncreaseUsd: 0.05,
        },
        goldenEval: {
            score: 100,
            passRate: 100,
            totalCases: 3,
            passedCases: 3,
        },
        confidenceTests: confidence,
        productMetrics: {},
    };
    fs.writeFileSync(baselineFile, JSON.stringify(baseline, null, 2) + "\n");
    ok("Created evals/baseline.json");
    return true;
}
151
/**
 * Resolve the abbreviated commit hash of HEAD for the repository at `cwd`.
 *
 * spawnSync reports a failed or missing git through `error` / `status`
 * instead of throwing, so the result is checked explicitly. The output must
 * also look like a hash, so that whatever git leaves on stdout when HEAD
 * cannot be resolved is never recorded as a commit.
 *
 * @param {string} cwd - Directory to run git in.
 * @returns {string} Short SHA, or "0000000" when none can be determined.
 */
function getHeadSha(cwd) {
    const fallback = "0000000";
    try {
        const result = (0, node_child_process_1.spawnSync)("git", ["rev-parse", "--short", "HEAD"], {
            cwd,
            stdio: "pipe",
        });
        if (result.error || result.status !== 0) {
            return fallback;
        }
        const sha = result.stdout?.toString().trim() ?? "";
        return /^[0-9a-f]+$/i.test(sha) ? sha : fallback;
    }
    catch {
        // Kept for argument-validation errors, which spawnSync does throw.
        return fallback;
    }
}
163
+ // ── 2. Install GitHub Actions workflow ──
164
/**
 * Scaffold step: write .github/workflows/evalai-gate.yml under `cwd`.
 *
 * An existing workflow file is left untouched. Otherwise the workflows
 * directory is created if needed and a workflow is rendered from the
 * template below, using an install command and setup steps chosen from
 * `project.pm`.
 *
 * @param {string} cwd - Project root.
 * @param {{pm: string}} project - Only `project.pm` is read here.
 * @returns {boolean} true on both the skip path and the write path.
 *
 * NOTE(review): the YAML inside the template literals is whitespace-
 * significant; this listing may not show the original indentation, so
 * confirm nesting against the shipped file before editing the templates.
 */
+ function installWorkflow(cwd, project) {
165
+ const workflowDir = path.join(cwd, ".github", "workflows");
166
+ const workflowPath = path.join(workflowDir, "evalai-gate.yml");
167
+ if (fs.existsSync(workflowPath)) {
168
+ skip(".github/workflows/evalai-gate.yml already exists");
169
+ return true;
170
+ }
171
+ if (!fs.existsSync(workflowDir)) {
172
+ fs.mkdirSync(workflowDir, { recursive: true });
173
+ }
// Dependency install command per package manager; npm is the fallback.
174
+ const installCmd = project.pm === "pnpm"
175
+ ? "pnpm install --frozen-lockfile"
176
+ : project.pm === "yarn"
177
+ ? "yarn install --frozen-lockfile"
178
+ : "npm ci";
// The pnpm variant adds a pnpm/action-setup step ahead of setup-node; the
// other variant passes project.pm straight through as the cache key.
179
+ const setupSteps = project.pm === "pnpm"
180
+ ? ` - uses: pnpm/action-setup@v4
181
+ - uses: actions/setup-node@v4
182
+ with:
183
+ node-version: '20'
184
+ cache: pnpm
185
+ - run: ${installCmd}`
186
+ : ` - uses: actions/setup-node@v4
187
+ with:
188
+ node-version: '20'
189
+ cache: ${project.pm}
190
+ - run: ${installCmd}`;
// Full workflow text. `\${{` is escaped so the GitHub expression reaches the
// file literally instead of being interpolated by this template literal.
191
+ const workflow = `# EvalAI Regression Gate
192
+ # Auto-generated by: npx evalai init
193
+ # Blocks PRs that regress test health.
194
+ name: EvalAI Gate
195
+
196
+ on:
197
+ pull_request:
198
+ branches: [main]
199
+
200
+ concurrency:
201
+ group: evalai-\${{ github.ref }}
202
+ cancel-in-progress: true
203
+
204
+ jobs:
205
+ regression-gate:
206
+ runs-on: ubuntu-latest
207
+ steps:
208
+ - uses: actions/checkout@v4
209
+ ${setupSteps}
210
+ - name: EvalAI Doctor (preflight)
211
+ continue-on-error: true # Strict: set to false, or use: evalai doctor --strict
212
+ run: npx -y @pauly4010/evalai-sdk@^1 doctor
213
+
214
+ - name: EvalAI Regression Gate
215
+ run: npx -y @pauly4010/evalai-sdk@^1 gate --format github
216
+
217
+ - name: Upload report
218
+ if: always()
219
+ uses: actions/upload-artifact@v4
220
+ with:
221
+ name: evalai-report
222
+ path: |
223
+ evals/regression-report.json
224
+ .evalai/last-report.json
225
+ if-no-files-found: ignore
48
226
  `;
49
- function runInit(cwd = process.cwd()) {
// Persist the rendered workflow and report the step.
227
+ fs.writeFileSync(workflowPath, workflow);
228
+ ok("Created .github/workflows/evalai-gate.yml");
229
+ return true;
230
+ }
231
+ // ── 3. Create evalai.config.json ──
232
/**
 * Scaffold step: make sure evalai.config.json exists in `cwd`.
 *
 * An existing file is never overwritten. A new file gets an empty
 * evaluationId and gate paths pointing into evals/.
 *
 * @param {string} cwd - Project root.
 * @returns {boolean} Always true.
 */
function createConfig(cwd) {
    const target = path.join(cwd, "evalai.config.json");
    if (!fs.existsSync(target)) {
        const contents = {
            evaluationId: "",
            gate: {
                baseline: "evals/baseline.json",
                report: "evals/regression-report.json",
            },
        };
        fs.writeFileSync(target, JSON.stringify(contents, null, 2) + "\n");
        ok("Created evalai.config.json");
    }
    else {
        skip("evalai.config.json already exists");
    }
    return true;
}
249
+ // ── Main ──
250
/**
 * Entry point for `evalai init`: detect the project, scaffold the baseline,
 * workflow and config, then print next steps.
 *
 * @param {string} [cwd=process.cwd()] - Project root to scaffold into.
 * @returns {boolean} false when no usable package.json is found, otherwise true.
 */
function runInit(cwd = process.cwd()) {
    console.log("");
    console.log(" evalai init — setting up regression gate\n");
    // Detect
    const project = detectProject(cwd);
    if (!project) {
        console.error(" ✖ No package.json found. Run this from a Node.js project root.");
        return false;
    }
    ok(`Detected ${project.pm} project: ${project.name}`);
    if (!project.hasTestScript) {
        console.log(` No test script found in package.json`);
        console.log(` The gate will still work — add a "test" script later for full coverage.\n`);
    }
    // Scaffold: each step reports its own outcome through ok()/skip().
    createBaseline(cwd, project);
    installWorkflow(cwd, project);
    createConfig(cwd);
    // Next steps, one console.log call per entry so output matches line for line.
    const closingLines = [
        "",
        " Done! Next:",
        "",
        " npx evalai doctor Verify your setup is complete",
        "",
        " Then commit:",
        "",
        " git add evals/ .github/workflows/evalai-gate.yml evalai.config.json",
        " git commit -m 'chore: add EvalAI regression gate'",
        " git push",
        "",
        " That's it. Open a PR and the gate runs automatically.",
        "",
        " Commands:",
        " npx evalai doctor Preflight check — verify config, baseline, CI",
        " npx evalai gate Run regression gate locally",
        " npx evalai check API-based gate (requires account)",
        " npx evalai explain Explain last failure with root causes + fixes",
        " npx evalai baseline update Update baseline after intentional changes",
        "",
        " To remove: delete evals/, evalai.config.json, and .github/workflows/evalai-gate.yml",
        "",
    ];
    for (const line of closingLines) {
        console.log(line);
    }
    return true;
}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * evalai print-config — Show resolved configuration with source-of-truth annotations.
3
+ *
4
+ * Prints every config field, where it came from (file, env, default, CLI arg),
5
+ * and redacts secrets. Useful for debugging "why is it using this baseUrl?"
6
+ *
7
+ * Usage:
8
+ * evalai print-config
9
+ * evalai print-config --format json
10
+ *
11
+ * Exit codes:
12
+ * 0 — Always (informational only)
13
+ */
14
/** Label for where a resolved value was taken from. */
+ type Source = "file" | "env" | "default" | "profile" | "arg";
15
/** One row of the resolved-configuration listing. */
+ interface ResolvedField {
16
/** Configuration field name. */
+ key: string;
17
/** Value to display; null is rendered as "not set" by the human format. */
+ value: string | number | boolean | null;
18
/** Origin of `value`. */
+ source: Source;
19
/** Optional extra marker for the row (presumably only used for secrets — confirm against the implementation). */
+ raw?: string;
20
+ }
21
/** Complete payload produced by `evalai print-config`. */
+ export interface PrintConfigOutput {
22
/** Version string of the CLI that produced the output. */
+ cliVersion: string;
23
/** Path of the config file that was found, or null when none was. */
+ configFile: string | null;
24
/** Working directory the command ran in. */
+ cwd: string;
25
/** Resolved fields, one entry per configuration key. */
+ resolved: ResolvedField[];
26
/** Environment variable summary; null marks a variable that is not set. */
+ env: Record<string, string | null>;
27
+ }
28
/**
 * Run `evalai print-config`.
 *
 * @param argv - Command-line arguments for the subcommand.
 * @returns Process exit code; the header above documents it as always 0.
 */
+ export declare function runPrintConfig(argv: string[]): number;
29
+ export {};
@@ -0,0 +1,251 @@
1
+ "use strict";
2
+ /**
3
+ * evalai print-config — Show resolved configuration with source-of-truth annotations.
4
+ *
5
+ * Prints every config field, where it came from (file, env, default, CLI arg),
6
+ * and redacts secrets. Useful for debugging "why is it using this baseUrl?"
7
+ *
8
+ * Usage:
9
+ * evalai print-config
10
+ * evalai print-config --format json
11
+ *
12
+ * Exit codes:
13
+ * 0 — Always (informational only)
14
+ */
15
// Module-interop helpers (this looks like compiler-emitted boilerplate — do
// not hand-edit; confirm against the build output). Each helper reuses an
// existing definition found on `this` before falling back to its own.
//
// __createBinding(o, m, k, k2): expose m[k] on o under the name k2 (k when
// k2 is omitted). With Object.create available it defines a property,
// substituting an enumerable getter when m[k] has no usable descriptor;
// otherwise it copies the value.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ var desc = Object.getOwnPropertyDescriptor(m, k);
18
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
19
+ desc = { enumerable: true, get: function() { return m[k]; } };
20
+ }
21
+ Object.defineProperty(o, k2, desc);
22
+ }) : (function(o, m, k, k2) {
23
+ if (k2 === undefined) k2 = k;
24
+ o[k2] = m[k];
25
+ }));
// __setModuleDefault(o, v): attach v to o as its "default" property.
26
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
27
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
28
+ }) : function(o, v) {
29
+ o["default"] = v;
30
+ });
// __importStar(mod): return mod unchanged when it is flagged __esModule;
// otherwise build a new object carrying every own property of mod except
// "default", and set mod itself as that object's "default".
31
+ var __importStar = (this && this.__importStar) || (function () {
// ownKeys replaces itself on first call with Object.getOwnPropertyNames,
// or with a for-in/hasOwnProperty fallback when that is unavailable.
32
+ var ownKeys = function(o) {
33
+ ownKeys = Object.getOwnPropertyNames || function (o) {
34
+ var ar = [];
35
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
36
+ return ar;
37
+ };
38
+ return ownKeys(o);
39
+ };
40
+ return function (mod) {
41
+ if (mod && mod.__esModule) return mod;
42
+ var result = {};
43
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
44
+ __setModuleDefault(result, mod);
45
+ return result;
46
+ };
47
+ })();
48
+ Object.defineProperty(exports, "__esModule", { value: true });
49
+ exports.runPrintConfig = runPrintConfig;
50
+ const path = __importStar(require("node:path"));
51
+ const version_1 = require("../version");
52
+ const config_1 = require("./config");
53
+ const profiles_1 = require("./profiles");
54
/**
 * Parse `--flag value`, `--flag=value` and bare `--flag` arguments.
 *
 * Tokens that do not start with "--" are ignored unless they are consumed as
 * the value of the preceding flag. A bare flag is recorded as the string
 * "true". Every recognised value is returned as a string; flags that were
 * not passed are undefined.
 *
 * @param {string[]} argv - Arguments following the subcommand name.
 * @returns {{format: "json" | "human", evaluationId?: string, baseUrl?: string,
 *   apiKey?: string, baseline?: string, profile?: string, minScore?: string,
 *   maxDrop?: string, warnDrop?: string, minN?: string}}
 */
function parseFlags(argv) {
    // No prototype, so a flag named like an Object.prototype member cannot
    // collide with or read through to inherited properties.
    const raw = Object.create(null);
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith("--")) {
            continue;
        }
        const body = arg.slice(2);
        // `--key=value`: split on the first "=" so values may contain "=".
        const eq = body.indexOf("=");
        if (eq > 0) {
            raw[body.slice(0, eq)] = body.slice(eq + 1);
            continue;
        }
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith("--")) {
            raw[body] = next;
            i++;
        }
        else {
            raw[body] = "true";
        }
    }
    return {
        format: raw.format === "json" ? "json" : "human",
        evaluationId: raw.evaluationId,
        baseUrl: raw.baseUrl,
        apiKey: raw.apiKey,
        baseline: raw.baseline,
        profile: raw.profile,
        minScore: raw.minScore,
        maxDrop: raw.maxDrop,
        warnDrop: raw.warnDrop,
        minN: raw.minN,
    };
}
83
+ // ── Helpers ──
84
/**
 * Mask a secret for display.
 *
 * Only secrets of at least 16 characters show their first and last four
 * characters, so at most half of the value is revealed. Anything shorter is
 * fully masked, because showing eight characters of a nine-character secret
 * would disclose almost all of it.
 *
 * @param {string | undefined | null} value - Secret to mask.
 * @returns {string | null} null when the value is empty or unset, otherwise the masked form.
 */
function redact(value) {
    const minLengthForPreview = 16;
    if (!value)
        return null;
    if (value.length >= minLengthForPreview)
        return `${value.slice(0, 4)}...${value.slice(-4)}`;
    return "****";
}
91
+ // ── Build resolved config ──
92
/**
 * Resolve the effective configuration and record where each value came from.
 *
 * @param {string} cwd - Directory used to locate and load the config file.
 * @param {object} flags - Parsed command-line flags (see parseFlags).
 * @returns {{cliVersion: string, configFile: string | null, cwd: string,
 *   resolved: object[], env: object}} Payload for print-config.
 */
function buildResolvedConfig(cwd, flags) {
    const configPath = (0, config_1.findConfigPath)(cwd);
    const fileConfig = (0, config_1.loadConfig)(cwd);
    // Forward only flags that were actually supplied, in a fixed order.
    const forwarded = ["evaluationId", "baseUrl", "baseline", "profile", "minScore", "maxDrop", "warnDrop", "minN"];
    const cliArgs = {};
    for (const name of forwarded) {
        if (flags[name]) {
            cliArgs[name] = flags[name];
        }
    }
    const merged = (0, config_1.mergeConfigWithArgs)(fileConfig, cliArgs);
    const fields = [];
    // evaluationId: flag, then config file.
    let evalIdSource = "default";
    if (flags.evaluationId) {
        evalIdSource = "arg";
    }
    else if (fileConfig?.evaluationId) {
        evalIdSource = "file";
    }
    fields.push({
        key: "evaluationId",
        value: merged.evaluationId ?? null,
        source: evalIdSource,
    });
    // baseUrl: flag, then environment, then config file, then built-in default.
    const envBaseUrl = process.env.EVALAI_BASE_URL;
    let baseUrlSource = "default";
    if (flags.baseUrl) {
        baseUrlSource = "arg";
    }
    else if (envBaseUrl) {
        baseUrlSource = "env";
    }
    else if (fileConfig?.baseUrl) {
        baseUrlSource = "file";
    }
    fields.push({
        key: "baseUrl",
        value: flags.baseUrl || envBaseUrl || fileConfig?.baseUrl || "http://localhost:3000",
        source: baseUrlSource,
    });
    // apiKey: never shown in full.
    const envApiKey = process.env.EVALAI_API_KEY;
    const rawApiKey = flags.apiKey || envApiKey || "";
    let apiKeySource = "default";
    if (flags.apiKey) {
        apiKeySource = "arg";
    }
    else if (envApiKey) {
        apiKeySource = "env";
    }
    fields.push({
        key: "apiKey",
        value: redact(rawApiKey) ?? "(not set)",
        source: apiKeySource,
        raw: rawApiKey ? "(redacted)" : undefined,
    });
    // profile: flag, then config file.
    const profileName = (flags.profile || fileConfig?.profile);
    let profileSource = "default";
    if (flags.profile) {
        profileSource = "arg";
    }
    else if (fileConfig?.profile) {
        profileSource = "file";
    }
    fields.push({
        key: "profile",
        value: profileName ?? null,
        source: profileSource,
    });
    // Gate fields: flag, then config file, then the selected profile.
    const gateKeys = ["minScore", "maxDrop", "warnDrop", "minN", "allowWeakEvidence"];
    gateKeys.forEach((key) => {
        const argVal = cliArgs[key];
        const fileVal = fileConfig?.[key];
        const profileVal = profileName && profileName in profiles_1.PROFILES
            ? profiles_1.PROFILES[profileName][key]
            : undefined;
        let source = "default";
        if (argVal !== undefined) {
            source = "arg";
        }
        else if (fileVal !== undefined) {
            source = "file";
        }
        else if (profileVal !== undefined) {
            source = "profile";
        }
        fields.push({
            key,
            value: merged[key] ?? null,
            source,
        });
    });
    // baseline: flag, then config file.
    let baselineSource = "default";
    if (flags.baseline) {
        baselineSource = "arg";
    }
    else if (fileConfig?.baseline) {
        baselineSource = "file";
    }
    fields.push({
        key: "baseline",
        value: merged.baseline ?? "published",
        source: baselineSource,
    });
    // Environment summary; credentials go through redact().
    const env = {
        EVALAI_API_KEY: redact(envApiKey),
        EVALAI_BASE_URL: envBaseUrl ?? null,
        OPENAI_API_KEY: redact(process.env.OPENAI_API_KEY),
        ANTHROPIC_API_KEY: redact(process.env.ANTHROPIC_API_KEY),
        AZURE_OPENAI_API_KEY: redact(process.env.AZURE_OPENAI_API_KEY),
        GITHUB_ACTIONS: process.env.GITHUB_ACTIONS ?? null,
        CI: process.env.CI ?? null,
    };
    const configFile = configPath ? path.relative(cwd, configPath) : null;
    return {
        cliVersion: version_1.SDK_VERSION,
        configFile,
        cwd,
        resolved: fields,
        env,
    };
}
207
+ // ── Output formatting ──
208
/**
 * Print the resolved configuration as aligned, human-readable text.
 *
 * @param {{cliVersion: string, configFile: string | null, cwd: string,
 *   resolved: {key: string, value: *, source: string}[],
 *   env: Object<string, string | null>}} output - Payload to render.
 */
function printHuman(output) {
    console.log("\n evalai print-config\n");
    console.log(` CLI version: ${output.cliVersion}`);
    console.log(` Config file: ${output.configFile ?? "(none found)"}`);
    console.log(` Working dir: ${output.cwd}`);
    console.log("");
    console.log(" Resolved configuration:");
    console.log("");
    // Pad every key to the longest one so the value column lines up.
    const widest = Math.max(...output.resolved.map((entry) => entry.key.length));
    output.resolved.forEach((entry) => {
        const shown = entry.value === null ? "(not set)" : String(entry.value);
        const paddedKey = entry.key.padEnd(widest, " ");
        console.log(` ${paddedKey} ${shown} [${entry.source}]`);
    });
    console.log("");
    console.log(" Environment variables:");
    console.log("");
    // Set variables are printed as they are met; unset ones are gathered
    // into a single trailing line.
    const missing = [];
    for (const [name, value] of Object.entries(output.env)) {
        if (value === null) {
            missing.push(name);
        }
        else {
            console.log(` ${name} = ${value}`);
        }
    }
    if (missing.length > 0) {
        console.log(` (not set: ${missing.join(", ")})`);
    }
    console.log("");
}
239
+ // ── Main ──
240
/**
 * Entry point for `evalai print-config`.
 *
 * Resolves configuration for the current working directory and prints it as
 * JSON when `--format json` was given, otherwise as human-readable text.
 *
 * @param {string[]} argv - Arguments following the subcommand name.
 * @returns {number} Always 0; the command is informational only.
 */
function runPrintConfig(argv) {
    const flags = parseFlags(argv);
    const output = buildResolvedConfig(process.cwd(), flags);
    const wantsJson = flags.format === "json";
    if (!wantsJson) {
        printHuman(output);
        return 0;
    }
    console.log(JSON.stringify(output, null, 2));
    return 0;
}
@@ -1,8 +1,12 @@
1
1
  /**
2
2
  * evalai gate — Run the regression gate
3
3
  *
4
- * Delegates to the project's eval:regression-gate npm script.
5
- * Supports --format json to output the regression-report.json contents.
4
+ * Two modes:
5
+ * 1. Project mode: delegates to eval:regression-gate npm script (full gate)
6
+ * 2. Built-in mode: runs `npm test`, compares against evals/baseline.json
7
+ *
8
+ * Built-in mode activates when no eval:regression-gate script is defined,
9
+ * making `npx evalai gate` work for any project after `npx evalai init`.
6
10
  */
7
11
  export interface GateArgs {
8
12
  format: "human" | "json" | "github";