@pauly4010/evalai-sdk 1.6.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ /**
2
+ * evalai explain — Offline report explainer.
3
+ *
4
+ * Reads the last check/gate report artifact and prints:
5
+ * 1. Top failing test cases (up to 3)
6
+ * 2. What changed (baseline vs current)
7
+ * 3. Likely root cause class
8
+ * 4. Suggested fix actions
9
+ *
10
+ * Works offline — no network calls. Designed for CI logs.
11
+ *
12
+ * Usage:
13
+ * evalai explain # reads evals/regression-report.json or .evalai/last-report.json
14
+ * evalai explain --report path/to/report.json
15
+ * evalai explain --format json
16
+ *
17
+ * Exit codes:
18
+ * 0 — Explained successfully
19
+ * 1 — Report not found or unreadable
20
+ */
21
/** Parsed command-line flags for `evalai explain`. */
export interface ExplainFlags {
    /** Explicit report path from `--report`/`--reportPath`, or null to use the default search locations. */
    reportPath: string | null;
    /** Output format from `--format`; any value other than "json" falls back to "human". */
    format: "human" | "json";
}
/** Coarse classification of why a gate/check run regressed. */
export type RootCauseClass = "prompt_drift" | "retrieval_drift" | "formatting_drift" | "tool_use_drift" | "safety_regression" | "cost_regression" | "latency_regression" | "coverage_drop" | "baseline_stale" | "unknown";
/** One actionable remediation step suggested for a root cause. */
export interface SuggestedFix {
    /** Short imperative summary; also the de-duplication key across causes. */
    action: string;
    /** Longer explanation, often including the exact command to run. */
    detail: string;
    /** Used to order fixes in the output (high first). */
    priority: "high" | "medium" | "low";
}
/** Structured explanation of a report; serialized verbatim for `--format json`. */
export interface ExplainOutput {
    verdict: string;
    score?: number;
    baselineScore?: number;
    delta?: number;
    reasonCode?: string;
    reasonMessage?: string;
    /** Up to 3 failing cases, ranked starting at 1. */
    topFailures: Array<{
        rank: number;
        name?: string;
        input?: string;
        expected?: string;
        actual?: string;
        reason?: string;
    }>;
    /** Total failing cases in the report (may exceed topFailures.length). */
    totalFailures: number;
    /** Baseline-vs-current metric comparison rows. */
    changes: Array<{
        metric: string;
        baseline: string;
        current: string;
        direction: "better" | "worse" | "same";
    }>;
    rootCauses: RootCauseClass[];
    suggestedFixes: SuggestedFix[];
    /** Path (relative to cwd) of the report file that was explained. */
    reportPath: string;
}
/** Parse `evalai explain` argv (everything after the subcommand) into flags. */
export declare function parseExplainFlags(argv: string[]): ExplainFlags;
/** Run the explain command; resolves to the process exit code (0 explained, 1 report missing/unreadable). */
export declare function runExplain(argv: string[]): Promise<number>;
@@ -0,0 +1,429 @@
1
+ "use strict";
2
+ /**
3
+ * evalai explain — Offline report explainer.
4
+ *
5
+ * Reads the last check/gate report artifact and prints:
6
+ * 1. Top failing test cases (up to 3)
7
+ * 2. What changed (baseline vs current)
8
+ * 3. Likely root cause class
9
+ * 4. Suggested fix actions
10
+ *
11
+ * Works offline — no network calls. Designed for CI logs.
12
+ *
13
+ * Usage:
14
+ * evalai explain # reads evals/regression-report.json or .evalai/last-report.json
15
+ * evalai explain --report path/to/report.json
16
+ * evalai explain --format json
17
+ *
18
+ * Exit codes:
19
+ * 0 — Explained successfully
20
+ * 1 — Report not found or unreadable
21
+ */
22
/* Compiler-generated CommonJS interop helpers emitted by tsc:
 * __createBinding re-exports a property from one module object onto another,
 * __setModuleDefault attaches the namespace `default` member, and
 * __importStar wraps a CommonJS module so `import * as ns` works.
 * Generated code — do not edit by hand. */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
55
Object.defineProperty(exports, "__esModule", { value: true });
// Public surface of this module: the flag parser (exported for tests) and the
// command runner invoked by the CLI dispatcher.
exports.parseExplainFlags = parseExplainFlags;
exports.runExplain = runExplain;
const fs = __importStar(require("node:fs"));
const path = __importStar(require("node:path"));
// Provides CHECK_REPORT_SCHEMA_VERSION for the schema-compatibility warning.
const types_1 = require("./formatters/types");
61
// ── Arg parsing ──
/**
 * Parse `evalai explain` argv into flags.
 *
 * Accepts both `--key value` and `--key=value` forms (the latter was
 * previously swallowed as an unknown key, making `--format=json` silently
 * fall back to "human"). A bare `--flag` with no value is recorded as the
 * string "true". Unknown flags are ignored; `--report` (alias
 * `--reportPath`) and `--format` are the ones consumed. `--format` falls
 * back to "human" for any value other than "json".
 */
function parseExplainFlags(argv) {
    const raw = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith("--"))
            continue;
        const eq = arg.indexOf("=");
        if (eq !== -1) {
            // --key=value form
            raw[arg.slice(2, eq)] = arg.slice(eq + 1);
            continue;
        }
        const key = arg.slice(2);
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith("--")) {
            raw[key] = next;
            i++; // consume the value token
        }
        else {
            raw[key] = "true"; // bare boolean flag
        }
    }
    const reportPath = raw.report || raw.reportPath || null;
    const format = raw.format === "json" ? "json" : "human";
    return { reportPath, format };
}
82
// ── Report discovery ──
// Default report locations, tried in order, relative to the working directory.
const REPORT_SEARCH_PATHS = [
    "evals/regression-report.json",
    ".evalai/last-report.json",
    ".evalai/last_report.json",
];
/**
 * Locate the report file to explain.
 *
 * With an explicit path, resolve it against cwd (absolute paths pass
 * through unchanged) and return it only if it exists. Without one, return
 * the first existing default location, or null when nothing is found.
 */
function findReport(cwd, explicitPath) {
    if (explicitPath) {
        const candidate = path.isAbsolute(explicitPath)
            ? explicitPath
            : path.join(cwd, explicitPath);
        return fs.existsSync(candidate) ? candidate : null;
    }
    const found = REPORT_SEARCH_PATHS
        .map((rel) => path.join(cwd, rel))
        .find((abs) => fs.existsSync(abs));
    return found ?? null;
}
100
// ── Root cause classification ──
/**
 * Heuristically classify why a run regressed, based on the report's reason
 * code, 0–1 score breakdown, score delta, and the text of failed cases.
 * Always returns at least one cause ("unknown" when nothing matches),
 * de-duplicated, in detection order.
 */
function classifyRootCauses(report) {
    const causes = [];
    const failedCases = report.failedCases ?? [];
    const reasonCode = report.reasonCode ?? "";
    const breakdown = report.breakdown01;
    const delta = report.delta;
    // Safety regression
    if (reasonCode === "POLICY_FAILED" ||
        reasonCode === "SAFETY_RISK" ||
        (breakdown?.safety != null && breakdown.safety < 0.9)) {
        causes.push("safety_regression");
    }
    // Cost regression
    if (reasonCode === "COST_BUDGET_EXCEEDED" || reasonCode === "COST_RISK") {
        causes.push("cost_regression");
    }
    // Latency regression
    if (reasonCode === "LATENCY_BUDGET_EXCEEDED" || reasonCode === "LATENCY_RISK") {
        causes.push("latency_regression");
    }
    // Coverage drop (test count decreased)
    if (reasonCode === "LOW_SAMPLE_SIZE" || reasonCode === "INSUFFICIENT_EVIDENCE") {
        causes.push("coverage_drop");
    }
    // Analyze failed cases for drift patterns
    if (failedCases.length > 0) {
        // Lower-cased outputs of failing cases; empty strings dropped.
        const outputs = failedCases
            .map((fc) => (fc.output ?? "").toLowerCase())
            .filter(Boolean);
        const expectedOutputs = failedCases
            .map((fc) => (fc.expectedOutput ?? "").toLowerCase())
            .filter(Boolean);
        // Formatting drift: output structure changed (JSON/markdown/format mismatch)
        // NOTE(review): each actual output is compared against an aggregate
        // some() over ALL expected outputs, not its own paired expectation —
        // presumably intended as a coarse corpus-level heuristic; confirm.
        const hasFormatIssue = outputs.some((o) => o.includes("```") !== expectedOutputs.some((e) => e.includes("```")) ||
            o.includes("{") !== expectedOutputs.some((e) => e.includes("{")) ||
            o.includes("<") !== expectedOutputs.some((e) => e.includes("<")));
        if (hasFormatIssue && failedCases.length >= 2) {
            causes.push("formatting_drift");
        }
        // Tool use drift: output mentions tool calls or function calls
        const hasToolIssue = outputs.some((o) => o.includes("tool_call") ||
            o.includes("function_call") ||
            o.includes("tool_use"));
        if (hasToolIssue) {
            causes.push("tool_use_drift");
        }
        // Retrieval drift: output mentions "not found", "no results", context issues
        const hasRetrievalIssue = outputs.some((o) => o.includes("not found") ||
            o.includes("no results") ||
            o.includes("no relevant") ||
            o.includes("unable to find"));
        if (hasRetrievalIssue) {
            causes.push("retrieval_drift");
        }
        // Prompt drift: catch-all for score regression with failed cases
        // (the -2 threshold suggests delta is on a ~0-100 score scale — confirm)
        if (delta != null &&
            delta < -2 &&
            !causes.includes("formatting_drift") &&
            !causes.includes("tool_use_drift") &&
            !causes.includes("retrieval_drift")) {
            causes.push("prompt_drift");
        }
    }
    // Baseline stale
    if (reasonCode === "BASELINE_MISSING") {
        causes.push("baseline_stale");
    }
    if (causes.length === 0) {
        causes.push("unknown");
    }
    // De-duplicate while preserving first-seen order.
    return [...new Set(causes)];
}
173
// ── Suggested fixes ──
/**
 * Static remediation playbook: maps each RootCauseClass to an ordered list of
 * suggested fixes. Consumed by suggestFixes(), which de-duplicates by
 * `action` and re-sorts by priority across causes.
 */
const ROOT_CAUSE_FIXES = {
    prompt_drift: [
        { action: "Review prompt changes", detail: "Compare current prompt with the version used in baseline run. Diff system/user messages.", priority: "high" },
        { action: "Pin model version", detail: "Use a specific model snapshot (e.g. gpt-4-0613) instead of a rolling alias.", priority: "medium" },
        { action: "Update baseline", detail: "If changes are intentional, run: npx evalai baseline update", priority: "low" },
    ],
    retrieval_drift: [
        { action: "Check retrieval pipeline", detail: "Verify embeddings, index, and chunk strategy haven't changed.", priority: "high" },
        { action: "Update test case context", detail: "If knowledge base changed, update expected outputs in test cases.", priority: "medium" },
        { action: "Add retrieval-specific tests", detail: "Add test cases that verify document retrieval before generation.", priority: "low" },
    ],
    formatting_drift: [
        { action: "Update output format instructions", detail: "Check if system prompt format instructions match expected output structure.", priority: "high" },
        { action: "Add format validators", detail: "Use schema assertions to validate output structure (JSON schema, regex).", priority: "medium" },
        { action: "Refresh baseline", detail: "If new format is intentional, run: npx evalai baseline update", priority: "low" },
    ],
    tool_use_drift: [
        { action: "Verify tool definitions", detail: "Check that tool/function schemas match what the model expects.", priority: "high" },
        { action: "Review tool call patterns", detail: "Compare tool call sequences in failing vs passing cases.", priority: "medium" },
        { action: "Add tool-use assertions", detail: "Assert specific tool calls are made (or not made) per test case.", priority: "low" },
    ],
    safety_regression: [
        { action: "Review safety assertions", detail: "Check which safety test cases are failing and why.", priority: "high" },
        { action: "Strengthen guardrails", detail: "Add or update content filters, system prompt safety instructions.", priority: "high" },
        { action: "Update rubric", detail: "If safety criteria changed, update the LLM judge rubric.", priority: "medium" },
    ],
    cost_regression: [
        { action: "Check token usage", detail: "Compare input/output token counts between baseline and current run.", priority: "high" },
        { action: "Optimize prompts", detail: "Reduce prompt length or use a smaller model for non-critical paths.", priority: "medium" },
        { action: "Update cost budget", detail: "If higher cost is expected, adjust --max-cost-usd threshold.", priority: "low" },
    ],
    latency_regression: [
        { action: "Check response times", detail: "Compare per-test-case latency between baseline and current run.", priority: "high" },
        { action: "Reduce prompt complexity", detail: "Simplify prompts or use streaming to reduce perceived latency.", priority: "medium" },
        { action: "Update latency budget", detail: "If higher latency is expected, adjust --max-latency-ms threshold.", priority: "low" },
    ],
    coverage_drop: [
        { action: "Add test cases", detail: "Current test count is below minimum. Add more test cases to the evaluation.", priority: "high" },
        { action: "Check test case filtering", detail: "Verify no test cases were accidentally deleted or filtered out.", priority: "medium" },
    ],
    baseline_stale: [
        { action: "Create baseline", detail: "Run: npx evalai baseline init (or publish a run from the dashboard)", priority: "high" },
        { action: "Use --baseline previous", detail: "Compare against the previous run instead of a published baseline.", priority: "medium" },
    ],
    unknown: [
        { action: "Run evalai doctor", detail: "Run: npx evalai doctor to check your full CI/CD setup.", priority: "high" },
        { action: "Check logs", detail: "Review CI logs for errors or unexpected behavior.", priority: "medium" },
        { action: "Update baseline", detail: "If changes are intentional, run: npx evalai baseline update", priority: "low" },
    ],
};
224
+ function suggestFixes(causes) {
225
+ const seen = new Set();
226
+ const fixes = [];
227
+ for (const cause of causes) {
228
+ for (const fix of ROOT_CAUSE_FIXES[cause] ?? []) {
229
+ if (!seen.has(fix.action)) {
230
+ seen.add(fix.action);
231
+ fixes.push(fix);
232
+ }
233
+ }
234
+ }
235
+ // Sort by priority
236
+ const pOrder = { high: 0, medium: 1, low: 2 };
237
+ return fixes.sort((a, b) => (pOrder[a.priority] ?? 9) - (pOrder[b.priority] ?? 9));
238
+ }
239
+ // ── Build explain output ──
240
+ function buildExplainOutput(report, reportPath) {
241
+ // Support both CheckReport (from evalai check) and BuiltinReport (from evalai gate)
242
+ const isBuiltinReport = "category" in report && "deltas" in report;
243
+ if (isBuiltinReport) {
244
+ return buildFromBuiltinReport(report, reportPath);
245
+ }
246
+ return buildFromCheckReport(report, reportPath);
247
+ }
248
+ function buildFromCheckReport(report, reportPath) {
249
+ const failedCases = report.failedCases ?? [];
250
+ // Top failures (up to 3)
251
+ const topFailures = failedCases.slice(0, 3).map((fc, i) => ({
252
+ rank: i + 1,
253
+ name: fc.name,
254
+ input: fc.inputSnippet || fc.input,
255
+ expected: fc.expectedSnippet || fc.expectedOutput,
256
+ actual: fc.outputSnippet || fc.output,
257
+ reason: fc.reason,
258
+ }));
259
+ // Changes
260
+ const changes = [];
261
+ if (report.score != null && report.baselineScore != null) {
262
+ const d = report.score - report.baselineScore;
263
+ changes.push({
264
+ metric: "Score",
265
+ baseline: String(report.baselineScore),
266
+ current: String(report.score),
267
+ direction: d > 0 ? "better" : d < 0 ? "worse" : "same",
268
+ });
269
+ }
270
+ if (report.breakdown01?.passRate != null) {
271
+ changes.push({
272
+ metric: "Pass rate",
273
+ baseline: "—",
274
+ current: `${Math.round(report.breakdown01.passRate * 100)}%`,
275
+ direction: "same",
276
+ });
277
+ }
278
+ if (report.breakdown01?.safety != null) {
279
+ changes.push({
280
+ metric: "Safety",
281
+ baseline: "—",
282
+ current: `${Math.round(report.breakdown01.safety * 100)}%`,
283
+ direction: report.breakdown01.safety < 0.95 ? "worse" : "same",
284
+ });
285
+ }
286
+ const rootCauses = classifyRootCauses(report);
287
+ const suggestedFixes = suggestFixes(rootCauses);
288
+ return {
289
+ verdict: report.verdict ?? "unknown",
290
+ score: report.score,
291
+ baselineScore: report.baselineScore,
292
+ delta: report.delta,
293
+ reasonCode: report.reasonCode,
294
+ reasonMessage: report.reasonMessage ?? report.actionableMessage,
295
+ topFailures,
296
+ totalFailures: failedCases.length,
297
+ changes,
298
+ rootCauses,
299
+ suggestedFixes,
300
+ reportPath,
301
+ };
302
+ }
303
+ function buildFromBuiltinReport(report, reportPath) {
304
+ const passed = report.passed;
305
+ const failures = report.failures ?? [];
306
+ const deltas = report.deltas ?? [];
307
+ const changes = deltas.map((d) => ({
308
+ metric: d.metric,
309
+ baseline: String(d.baseline),
310
+ current: String(d.current),
311
+ direction: d.status === "pass" ? "same" : "worse",
312
+ }));
313
+ const topFailures = failures.slice(0, 3).map((f, i) => ({
314
+ rank: i + 1,
315
+ reason: f,
316
+ }));
317
+ // Simple root cause for builtin reports
318
+ const rootCauses = [];
319
+ if (failures.some((f) => f.includes("failing")))
320
+ rootCauses.push("prompt_drift");
321
+ if (failures.some((f) => f.includes("count dropped")))
322
+ rootCauses.push("coverage_drop");
323
+ if (rootCauses.length === 0)
324
+ rootCauses.push("unknown");
325
+ return {
326
+ verdict: passed ? "pass" : "fail",
327
+ reasonCode: report.category ?? undefined,
328
+ reasonMessage: failures[0],
329
+ topFailures,
330
+ totalFailures: failures.length,
331
+ changes,
332
+ rootCauses,
333
+ suggestedFixes: suggestFixes(rootCauses),
334
+ reportPath,
335
+ };
336
+ }
337
// ── Output formatting ──
/**
 * Pretty-print an ExplainOutput to stdout for terminals / CI logs.
 * Sections are omitted when empty; a lone "unknown" root cause is suppressed
 * (its suggested fixes still print).
 */
function printHuman(output) {
    // ✅ pass / ⚠️ warn / ❌ anything else
    const verdictIcon = output.verdict === "pass" ? "\u2705" : output.verdict === "warn" ? "\u26A0\uFE0F" : "\u274C";
    console.log(`\n evalai explain\n`);
    console.log(` ${verdictIcon} Verdict: ${output.verdict.toUpperCase()}`);
    if (output.score != null) {
        const scoreStr = output.baselineScore != null
            ? `${output.score} (baseline: ${output.baselineScore}, delta: ${output.delta ?? "n/a"})`
            : `${output.score}`;
        console.log(` Score: ${scoreStr}`);
    }
    if (output.reasonMessage) {
        console.log(` Reason: ${output.reasonMessage}`);
    }
    // Changes: ↓ worse / ↑ better / → same
    if (output.changes.length > 0) {
        console.log("\n What changed:");
        for (const c of output.changes) {
            const arrow = c.direction === "worse" ? "\u2193" : c.direction === "better" ? "\u2191" : "\u2192";
            console.log(` ${arrow} ${c.metric}: ${c.baseline} \u2192 ${c.current}`);
        }
    }
    // Top failures (already capped at 3 by the builders)
    if (output.topFailures.length > 0) {
        console.log(`\n Top failing cases (${output.topFailures.length} of ${output.totalFailures}):`);
        for (const f of output.topFailures) {
            console.log(`\n ${f.rank}. ${f.name ?? "unnamed"}`);
            if (f.input)
                console.log(` Input: ${f.input}`);
            if (f.expected)
                console.log(` Expected: ${f.expected}`);
            if (f.actual)
                console.log(` Actual: ${f.actual}`);
            if (f.reason)
                console.log(` Reason: ${f.reason}`);
        }
    }
    // Root causes (skipped when classification fell back to "unknown")
    if (output.rootCauses.length > 0 && output.rootCauses[0] !== "unknown") {
        console.log("\n Likely root causes:");
        for (const cause of output.rootCauses) {
            console.log(` \u2022 ${cause.replace(/_/g, " ")}`);
        }
    }
    // Suggested fixes: ‼️ high / ❗ medium / • low
    if (output.suggestedFixes.length > 0) {
        console.log("\n Suggested fixes:");
        for (const fix of output.suggestedFixes) {
            const pIcon = fix.priority === "high" ? "\u203C\uFE0F" : fix.priority === "medium" ? "\u2757" : "\u2022";
            console.log(` ${pIcon} ${fix.action}`);
            console.log(` ${fix.detail}`);
        }
    }
    console.log(`\n Report: ${output.reportPath}\n`);
}
392
// ── Main ──
/**
 * Entry point for `evalai explain`.
 * Finds the most recent report (explicit --report path or the default search
 * locations), parses it, warns if its schema is newer than this CLI
 * supports, and prints the explanation as human text or JSON.
 * Exit codes: 0 = explained, 1 = report missing or unreadable.
 */
async function runExplain(argv) {
    const flags = parseExplainFlags(argv);
    const cwd = process.cwd();
    const reportPath = findReport(cwd, flags.reportPath);
    if (!reportPath) {
        const searched = flags.reportPath
            ? flags.reportPath
            : REPORT_SEARCH_PATHS.join(", ");
        console.error(`\n \u274C No report found. Searched: ${searched}`);
        console.error(" Run a gate first:");
        console.error(" npx evalai gate --format json");
        console.error(" npx evalai check --format json > .evalai/last-report.json\n");
        return 1;
    }
    let reportData;
    try {
        reportData = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
    }
    catch {
        // Covers unreadable files as well as malformed JSON.
        console.error(`\n \u274C Cannot parse report: ${reportPath}\n`);
        return 1;
    }
    // Schema version compatibility check (warn only — still try to explain)
    const reportSchema = typeof reportData.schemaVersion === "number" ? reportData.schemaVersion : undefined;
    if (reportSchema != null && reportSchema > types_1.CHECK_REPORT_SCHEMA_VERSION) {
        console.error(`\n \u26A0\uFE0F Report schema version ${reportSchema} is newer than this CLI supports (v${types_1.CHECK_REPORT_SCHEMA_VERSION}).`);
        console.error(" Update your SDK: npm install @pauly4010/evalai-sdk@latest\n");
    }
    const output = buildExplainOutput(reportData, path.relative(cwd, reportPath));
    if (flags.format === "json") {
        console.log(JSON.stringify(output, null, 2));
    }
    else {
        printHuman(output);
    }
    return 0;
}
@@ -92,6 +92,11 @@ function appendStepSummary(report) {
92
92
  lines.push(`[View Dashboard](${report.dashboardUrl})`);
93
93
  lines.push("");
94
94
  }
95
+ if (!passed) {
96
+ lines.push("> **Tip:** Run `evalai explain` locally to see root causes and suggested fixes.");
97
+ lines.push("> Report saved to `.evalai/last-report.json` — upload as a build artifact for offline analysis.");
98
+ lines.push("");
99
+ }
95
100
  try {
96
101
  fs.appendFileSync(path, lines.join("\n"), "utf8");
97
102
  }
@@ -54,7 +54,10 @@ export type CiContext = {
54
54
  runUrl?: string;
55
55
  actor?: string;
56
56
  };
57
+ /** Current schema version for CheckReport (.evalai/last-report.json). Bump on breaking changes. */
58
+ export declare const CHECK_REPORT_SCHEMA_VERSION = 1;
57
59
  export type CheckReport = {
60
+ schemaVersion?: number;
58
61
  evaluationId: string;
59
62
  runId?: number;
60
63
  verdict: GateVerdict;
@@ -3,3 +3,6 @@
3
3
  * CheckReport and related types for formatters.
4
4
  */
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.CHECK_REPORT_SCHEMA_VERSION = void 0;
7
+ /** Current schema version for CheckReport (.evalai/last-report.json). Bump on breaking changes. */
8
+ exports.CHECK_REPORT_SCHEMA_VERSION = 1;
package/dist/cli/index.js CHANGED
@@ -11,9 +11,12 @@ Object.defineProperty(exports, "__esModule", { value: true });
11
11
  const baseline_1 = require("./baseline");
12
12
  const check_1 = require("./check");
13
13
  const doctor_1 = require("./doctor");
14
+ const explain_1 = require("./explain");
14
15
  const init_1 = require("./init");
16
+ const print_config_1 = require("./print-config");
15
17
  const regression_gate_1 = require("./regression-gate");
16
18
  const share_1 = require("./share");
19
+ const upgrade_1 = require("./upgrade");
17
20
  const argv = process.argv.slice(2);
18
21
  const subcommand = argv[0];
19
22
  if (subcommand === "init") {
@@ -29,6 +32,10 @@ else if (subcommand === "gate") {
29
32
  const code = (0, regression_gate_1.runGate)(argv.slice(1));
30
33
  process.exit(code);
31
34
  }
35
+ else if (subcommand === "upgrade") {
36
+ const code = (0, upgrade_1.runUpgrade)(argv.slice(1));
37
+ process.exit(code);
38
+ }
32
39
  else if (subcommand === "doctor") {
33
40
  (0, doctor_1.runDoctor)(argv.slice(1))
34
41
  .then((code) => process.exit(code))
@@ -50,6 +57,18 @@ else if (subcommand === "check") {
50
57
  process.exit(4);
51
58
  });
52
59
  }
60
+ else if (subcommand === "explain") {
61
+ (0, explain_1.runExplain)(argv.slice(1))
62
+ .then((code) => process.exit(code))
63
+ .catch((err) => {
64
+ console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
65
+ process.exit(1);
66
+ });
67
+ }
68
+ else if (subcommand === "print-config") {
69
+ const code = (0, print_config_1.runPrintConfig)(argv.slice(1));
70
+ process.exit(code);
71
+ }
53
72
  else if (subcommand === "share") {
54
73
  const parsed = (0, share_1.parseShareArgs)(argv.slice(1));
55
74
  if ("error" in parsed) {
@@ -67,12 +86,15 @@ else {
67
86
  console.log(`EvalAI CLI
68
87
 
69
88
  Usage:
70
- evalai init Create evalai.config.json
89
+ evalai init Create evalai.config.json + baseline + CI workflow
90
+ evalai gate [options] Run regression gate (local test-based, no API needed)
91
+ evalai check [options] CI/CD evaluation gate (API-based)
92
+ evalai explain [options] Explain last gate/check failure with root causes + fixes
93
+ evalai doctor [options] Comprehensive CI/CD readiness checklist
71
94
  evalai baseline init Create starter evals/baseline.json
72
95
  evalai baseline update Run tests and update baseline with real scores
73
- evalai gate [options] Run regression gate (local test-based)
74
- evalai doctor [options] Verify CI/CD setup (same endpoint as check)
75
- evalai check [options] CI/CD evaluation gate (API-based)
96
+ evalai upgrade --full Upgrade from Tier 1 to Tier 2 (full gate)
97
+ evalai print-config Show resolved config with source-of-truth annotations
76
98
  evalai share [options] Create share link for a run
77
99
 
78
100
  Options for gate:
@@ -94,8 +116,29 @@ Options for check:
94
116
  --share <mode> Share link: always | fail | never (fail = only when gate fails)
95
117
  --baseUrl <url> API base URL
96
118
 
119
+ Options for explain:
120
+ --report <path> Path to report JSON (default: evals/regression-report.json)
121
+ --format <fmt> Output format: human (default), json
122
+
123
+ Options for print-config:
124
+ --format <fmt> Output format: human (default), json
125
+
126
+ Options for doctor:
127
+ --report Output JSON diagnostic bundle
128
+ --format <fmt> Output format: human (default), json
129
+ --strict Treat warnings as failures (exit 2)
130
+ --apiKey <key> API key (or EVALAI_API_KEY env)
131
+ --baseUrl <url> API base URL
132
+ --evaluationId <id> Evaluation to verify
133
+
97
134
  Examples:
98
135
  evalai init
136
+ evalai gate
137
+ evalai gate --format json
138
+ evalai explain
139
+ evalai doctor
140
+ evalai print-config
141
+ evalai doctor --report
99
142
  evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
100
143
  evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
101
144
  evalai share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
@@ -1,7 +1,16 @@
1
1
  #!/usr/bin/env node
2
2
  /**
3
- * evalai init — Create evalai.config.json
3
+ * evalai init — Full project scaffolder
4
4
  *
5
- * Creates the smallest possible config file. Defaults belong in code.
5
+ * Zero-to-gate in under 5 minutes:
6
+ * npx evalai init
7
+ * git push
8
+ * …CI starts blocking regressions.
9
+ *
10
+ * What it does:
11
+ * 1. Detects Node repo + package manager
12
+ * 2. Creates evals/ directory + baseline.json
13
+ * 3. Installs .github/workflows/evalai-gate.yml
14
+ * 4. Prints next steps (no docs required)
6
15
  */
7
16
  export declare function runInit(cwd?: string): boolean;