@pauly4010/evalai-sdk 1.7.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,76 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.8.0] - 2026-02-26
9
+
10
+ ### ✨ Added
11
+
12
+ #### CLI — `evalai doctor` Rewrite (Comprehensive Checklist)
13
+
14
+ - **9 itemized checks** with pass/fail/warn/skip status and exact remediation commands:
15
+ 1. Project detection (package.json + lockfile + package manager)
16
+ 2. Config file validity (evalai.config.json)
17
+ 3. Baseline file (evals/baseline.json — schema, staleness)
18
+ 4. Authentication (API key presence, redacted display)
19
+ 5. Evaluation target (evaluationId configured)
20
+ 6. API connectivity (reachable, latency)
21
+ 7. Evaluation access (permissions, baseline presence)
22
+ 8. CI wiring (.github/workflows/evalai-gate.yml)
23
+ 9. Provider env vars (OpenAI/Anthropic/Azure — optional)
24
+ - **Exit codes**: `0` ready, `2` not ready, `3` infrastructure error
25
+ - **`--report`** flag outputs full JSON diagnostic bundle (versions, hashes, latency, all checks)
26
+ - **`--format json`** for machine-readable output
27
+
28
+ #### CLI — `evalai explain` (New Command)
29
+
30
+ - **Offline report explainer** — reads `.evalai/last-report.json` or `evals/regression-report.json` with zero flags
31
+ - **Top 3 failing test cases** with input/expected/actual
32
+ - **What changed** — baseline vs current with directional indicators
33
+ - **Root cause classification**: prompt drift, retrieval drift, formatting drift, tool-use drift, safety/cost/latency regression, coverage drop, baseline stale
34
+ - **Prioritized suggested fixes** with actionable commands
35
+ - Works with both `evalai check` reports (CheckReport) and `evalai gate` reports (BuiltinReport)
36
+ - **`--format json`** for CI pipeline consumption
37
+
38
+ #### Guided Failure Flow
39
+
40
+ - **`evalai check` now writes `.evalai/last-report.json`** automatically after every run
41
+ - **Failure hint**: prints `Next: evalai explain` on gate failure
42
+ - **GitHub step summary**: adds tip about `evalai explain` and report artifact location on failure
43
+
44
+ #### CI Template Improvements
45
+
46
+ - **Doctor preflight step** added to generated workflow (`continue-on-error: true`)
47
+ - **Report artifact upload** now includes both `evals/regression-report.json` and `.evalai/last-report.json`
48
+
49
+ #### `evalai init` Output Updated
50
+
51
+ - First recommendation: `npx evalai doctor` (verify setup)
52
+ - Full command reference: doctor, gate, check, explain, baseline update
53
+
54
+ #### CLI — `evalai print-config` (New Command)
55
+
56
+ - **Resolved config viewer** — prints every config field with its current value
57
+ - **Source-of-truth annotations**: `[file]`, `[env]`, `[default]`, `[profile]`, `[arg]` for each field
58
+ - **Secrets redacted** — API keys shown as `sk_t...abcd`
59
+ - **Environment summary** — shows all relevant env vars (EVALAI_API_KEY, OPENAI_API_KEY, CI, etc.)
60
+ - **`--format json`** for machine-readable output
61
+ - Accepts `--evaluationId`, `--baseUrl`, etc. to show how CLI args would merge
62
+
63
+ #### Minimal Green Example
64
+
65
+ - **`examples/minimal-green/`** — passes on first run, no account needed
66
+ - Zero dependencies, 3 `node:test` tests
67
+ - Clone → init → doctor → gate → ✅
68
+
69
+ ### 🔧 Changed
70
+
71
+ - `evalai doctor` exit codes changed: was `0`/`1`, now `0`/`2`/`3`
72
+ - SDK README: added Debugging & Diagnostics section with guided flow diagram
73
+ - SDK README: added Doctor Exit Codes table
74
+ - Doctor test count: 4 → 29 tests; added 9 explain tests (38 tests in total)
75
+
76
+ ---
77
+
8
78
  ## [1.7.0] - 2026-02-25
9
79
 
10
80
  ### ✨ Added
package/README.md CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  [![npm version](https://img.shields.io/npm/v/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
4
4
  [![npm downloads](https://img.shields.io/npm/dm/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
5
+ [![TypeScript](https://img.shields.io/badge/TypeScript-strict-blue.svg)](https://www.typescriptlang.org/)
6
+ [![SDK Tests](https://img.shields.io/badge/tests-172%20passed-brightgreen.svg)](#)
7
+ [![Contract Version](https://img.shields.io/badge/report%20schema-v1-blue.svg)](#)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
9
 
6
10
  **Stop LLM regressions in CI in minutes.**
7
11
 
@@ -54,9 +58,36 @@ That's it. Open a PR and CI blocks regressions automatically.
54
58
  | Command | Description |
55
59
  |---------|-------------|
56
60
  | `npx evalai check` | Gate on quality score from dashboard |
57
- | `npx evalai doctor` | Verify CI/CD setup |
58
61
  | `npx evalai share` | Create share link for a run |
59
62
 
63
+ ### Debugging & Diagnostics
64
+
65
+ | Command | Description |
66
+ |---------|-------------|
67
+ | `npx evalai doctor` | Comprehensive preflight checklist — verifies config, baseline, auth, API, CI wiring |
68
+ | `npx evalai explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
69
+ | `npx evalai print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
70
+
71
+ **Guided failure flow:**
72
+
73
+ ```
74
+ evalai check → fails → "Next: evalai explain"
75
+
76
+ evalai explain → root causes + fixes
77
+ ```
78
+
79
+ **GitHub Actions step summary** — gate result at a glance:
80
+
81
+ ![GitHub Actions step summary showing gate pass/fail with delta table](../../docs/images/evalai-gate-step-summary.svg)
82
+
83
+ **`evalai explain` terminal output** — root causes + fix commands:
84
+
85
+ ![Terminal output of evalai explain showing top failures and suggested fixes](../../docs/images/evalai-explain-terminal.svg)
86
+
87
+ `check` automatically writes `.evalai/last-report.json` so `explain` works with zero flags.
88
+
89
+ `doctor` uses exit codes: **0** = ready, **2** = not ready, **3** = infra error. Use `--report` for a JSON diagnostic bundle.
90
+
60
91
  ### Gate Exit Codes
61
92
 
62
93
  | Code | Meaning |
@@ -79,6 +110,14 @@ That's it. Open a PR and CI blocks regressions automatically.
79
110
  | 7 | Weak evidence |
80
111
  | 8 | Warn (soft regression) |
81
112
 
113
+ ### Doctor Exit Codes
114
+
115
+ | Code | Meaning |
116
+ |------|---------|
117
+ | 0 | Ready — all checks passed |
118
+ | 2 | Not ready — one or more checks failed |
119
+ | 3 | Infrastructure error |
120
+
82
121
  ---
83
122
 
84
123
  ## How the Gate Works
@@ -224,6 +263,8 @@ Your local `openAIChatEval` runs continue to work. No account cancellation. No d
224
263
 
225
264
  See [CHANGELOG.md](CHANGELOG.md) for the full release history.
226
265
 
266
+ **v1.8.0** — `evalai doctor` rewrite (9-check checklist), `evalai explain` command, guided failure flow, CI template with doctor preflight
267
+
227
268
  **v1.7.0** — `evalai init` scaffolder, `evalai upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
228
269
 
229
270
  **v1.6.0** — `evalai gate`, `evalai baseline`, regression gate constants & types
package/dist/cli/check.js CHANGED
@@ -77,6 +77,7 @@ exports.EXIT = void 0;
77
77
  exports.parseArgs = parseArgs;
78
78
  exports.runCheck = runCheck;
79
79
  const fs = __importStar(require("node:fs"));
80
+ const path = __importStar(require("node:path"));
80
81
  const api_1 = require("./api");
81
82
  const ci_context_1 = require("./ci-context");
82
83
  const config_1 = require("./config");
@@ -260,12 +261,26 @@ async function runCheck(args) {
260
261
  baselineRunId: quality?.baselineRunId ?? undefined,
261
262
  ciRunUrl: ci?.runUrl ?? undefined,
262
263
  });
264
+ // Persist report artifact so `evalai explain` works with zero flags
265
+ try {
266
+ const reportDir = path.join(process.cwd(), ".evalai");
267
+ if (!fs.existsSync(reportDir))
268
+ fs.mkdirSync(reportDir, { recursive: true });
269
+ fs.writeFileSync(path.join(reportDir, "last-report.json"), JSON.stringify(report, null, 2), "utf8");
270
+ }
271
+ catch {
272
+ // Non-fatal: best-effort artifact write
273
+ }
263
274
  const formatted = args.format === "json"
264
275
  ? (0, json_1.formatJson)(report)
265
276
  : args.format === "github"
266
277
  ? (0, github_1.formatGitHub)(report)
267
278
  : (0, human_1.formatHuman)(report);
268
279
  console.log(formatted);
280
+ // Guided flow hint on failure
281
+ if (!gateResult.passed) {
282
+ console.error("\nNext: evalai explain");
283
+ }
269
284
  // --pr-comment-out: write markdown to file for GitHub Action to post
270
285
  if (args.prCommentOut) {
271
286
  try {
@@ -1,11 +1,88 @@
1
1
  /**
2
- * evalai doctor — Verify CI/CD setup.
3
- * Uses the same quality endpoint as check — if doctor passes, check works.
2
+ * evalai doctor — Comprehensive CI/CD readiness checklist.
3
+ *
4
+ * Runs itemized pass/fail checks with exact remediation commands.
5
+ *
6
+ * Exit codes:
7
+ * 0 — All checks passed (ready)
8
+ * 2 — One or more checks failed (not ready)
9
+ * 3 — Infrastructure error (couldn't complete checks)
10
+ *
11
+ * Flags:
12
+ * --report Output JSON diagnostic bundle (redacted)
13
+ * --format <fmt> Output format: human (default), json
14
+ * --apiKey <key> API key (or EVALAI_API_KEY env)
15
+ * --baseUrl <url> API base URL
16
+ * --evaluationId <id> Evaluation to verify
17
+ * --baseline <mode> Baseline mode
4
18
  */
5
- export type DoctorArgs = {
19
+ import { type EvalAIConfig } from "./config";
20
+ export declare const DOCTOR_EXIT: {
21
+ readonly READY: 0;
22
+ readonly NOT_READY: 2;
23
+ readonly INFRA_ERROR: 3;
24
+ };
25
+ export type CheckStatus = "pass" | "fail" | "warn" | "skip";
26
+ export interface CheckResult {
27
+ id: string;
28
+ label: string;
29
+ status: CheckStatus;
30
+ message: string;
31
+ remediation?: string;
32
+ }
33
+ export interface DiagnosticBundle {
34
+ timestamp: string;
35
+ cliVersion: string;
36
+ specVersion: string;
37
+ platform: string;
38
+ nodeVersion: string;
39
+ checks: CheckResult[];
40
+ config: Partial<EvalAIConfig> & {
41
+ path?: string | null;
42
+ };
43
+ baseline: {
44
+ path: string;
45
+ exists: boolean;
46
+ hash?: string;
47
+ schemaVersion?: number;
48
+ stale?: boolean;
49
+ } | null;
50
+ api: {
51
+ reachable: boolean;
52
+ latencyMs?: number;
53
+ scopes?: string[];
54
+ } | null;
55
+ ci: {
56
+ workflowPath: string;
57
+ exists: boolean;
58
+ } | null;
59
+ overall: "ready" | "not_ready" | "infra_error";
60
+ }
61
+ export interface DoctorFlags {
62
+ report: boolean;
63
+ format: "human" | "json";
64
+ strict: boolean;
6
65
  baseUrl: string;
7
66
  apiKey: string;
8
67
  evaluationId: string;
9
68
  baseline: "published" | "previous" | "production";
69
+ }
70
+ export declare function checkProject(cwd: string): CheckResult;
71
+ export declare function checkConfig(cwd: string): CheckResult & {
72
+ config: EvalAIConfig | null;
73
+ configPath: string | null;
74
+ };
75
+ export declare function checkBaseline(cwd: string): CheckResult & {
76
+ baselineInfo: DiagnosticBundle["baseline"];
77
+ };
78
+ export declare function checkAuth(apiKey: string): CheckResult;
79
+ export declare function checkConnectivity(baseUrl: string, apiKey: string): Promise<CheckResult & {
80
+ latencyMs?: number;
81
+ }>;
82
+ export declare function checkEvalTarget(evaluationId: string): CheckResult;
83
+ export declare function checkEvalAccess(baseUrl: string, apiKey: string, evaluationId: string, baseline: string): Promise<CheckResult>;
84
+ export declare function checkCiWiring(cwd: string): CheckResult & {
85
+ ciInfo: DiagnosticBundle["ci"];
10
86
  };
87
+ export declare function checkProviderEnv(): CheckResult;
11
88
  export declare function runDoctor(argv: string[]): Promise<number>;