@pauly4010/evalai-sdk 1.7.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +124 -0
- package/README.md +42 -1
- package/dist/cli/check.js +15 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +680 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +408 -0
- package/dist/cli/doctor.d.ts +80 -3
- package/dist/cli/doctor.js +583 -41
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +535 -0
- package/dist/cli/formatters/github.js +5 -0
- package/dist/cli/formatters/types.d.ts +3 -0
- package/dist/cli/formatters/types.js +3 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +251 -0
- package/dist/cli/index.js +214 -4
- package/dist/cli/init.js +16 -4
- package/dist/cli/manifest.d.ts +105 -0
- package/dist/cli/manifest.js +275 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +255 -0
- package/dist/cli/report/build-check-report.d.ts +1 -1
- package/dist/cli/report/build-check-report.js +2 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +389 -0
- package/dist/cli/workspace.d.ts +28 -0
- package/dist/cli/workspace.js +58 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +30 -5
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +391 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +271 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +237 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +353 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +416 -0
- package/dist/runtime/run-report.d.ts +202 -0
- package/dist/runtime/run-report.js +220 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/testing.d.ts +65 -0
- package/dist/testing.js +42 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +4 -3
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,130 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.9.0] - 2026-02-27
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
#### CLI — One-Command CI Loop (`evalai ci`)
|
|
13
|
+
|
|
14
|
+
- **`evalai ci`** — Single command teams put in GitHub workflows and never think about again
|
|
15
|
+
- **Complete CI pipeline**: discover → manifest → impact → run → diff → PR summary → safe failure → "next step"
|
|
16
|
+
- **Automatic manifest building**: Builds manifest if missing, no manual steps required
|
|
17
|
+
- **Impact analysis integration**: `--impacted-only` flag for targeted testing
|
|
18
|
+
- **Smart exit codes**: 0=clean, 1=regressions, 2=config/infra issues
|
|
19
|
+
- **Self-documenting failures**: Always prints copy/paste next step for debugging
|
|
20
|
+
- **GitHub Step Summary integration**: Automatic PR summaries with regressions and artifacts
|
|
21
|
+
|
|
22
|
+
#### CLI — Durable Run History & Diff System
|
|
23
|
+
|
|
24
|
+
- **Run artifact retention**: Timestamped artifacts in `.evalai/runs/run-<runId>.json`
|
|
25
|
+
- **Run index file**: `.evalai/runs/index.json` tracks all runs with metadata
|
|
26
|
+
- **Schema versioning**: `RunResult` and `DiffResult` include `schemaVersion` for compatibility
|
|
27
|
+
- **Base/head shortcuts**: `--base baseline`, `--base last`, `--head last` for common cases
|
|
28
|
+
- **Floating point normalization**: Consistent score/delta calculations across runs
|
|
29
|
+
- **Comprehensive diff comparison**: Classifies regressions, improvements, added, removed specs
|
|
30
|
+
|
|
31
|
+
#### CLI — Centralized Architecture
|
|
32
|
+
|
|
33
|
+
- **Environment detection**: `isCI()`, `isGitHubActions()`, `getGitHubStepSummaryPath()` unified
|
|
34
|
+
- **Workspace resolution**: `resolveEvalWorkspace()` provides all `.evalai` paths
|
|
35
|
+
- **Git reference detection**: Comprehensive patterns for branches, tags, and ranges
|
|
36
|
+
- **No more duplication**: All commands use shared utilities for consistency
|
|
37
|
+
|
|
38
|
+
#### CLI — CI Friendliness
|
|
39
|
+
|
|
40
|
+
- **Fail-safe base resolution**: Clear error messages when base artifacts missing in CI
|
|
41
|
+
- **GitHub Step Summary**: Rich markdown summaries with metrics, regressions, and artifact links
|
|
42
|
+
- **CI-specific error handling**: Exit code 2 for config issues with helpful guidance
|
|
43
|
+
- **Artifact download instructions**: Exact commands for manual base artifact setup
|
|
44
|
+
|
|
45
|
+
### 🔧 Changed
|
|
46
|
+
|
|
47
|
+
- **Exit codes standardized**: 0=clean, 1=regressions, 2=config/infra issues across all commands
|
|
48
|
+
- **Schema compatibility**: Added `schemaVersion` validation for future-proofing
|
|
49
|
+
- **Path resolution**: All commands use centralized workspace helpers
|
|
50
|
+
- **Error messages**: More actionable and context-aware guidance
|
|
51
|
+
|
|
52
|
+
### 📊 New Features Summary
|
|
53
|
+
|
|
54
|
+
- **One-command CI**: `evalai ci` replaces multi-step workflows
|
|
55
|
+
- **Durable history**: Run artifacts preserved with smart indexing
|
|
56
|
+
- **Smart diffing**: Automated regression detection with GitHub integration
|
|
57
|
+
- **Centralized utilities**: Environment detection and workspace resolution unified
|
|
58
|
+
- **Self-documenting**: Clear next steps for any failure scenario
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## [1.8.0] - 2026-02-26
|
|
63
|
+
|
|
64
|
+
### ✨ Added
|
|
65
|
+
|
|
66
|
+
#### CLI — `evalai doctor` Rewrite (Comprehensive Checklist)
|
|
67
|
+
|
|
68
|
+
- **9 itemized checks** with pass/fail/warn/skip status and exact remediation commands:
|
|
69
|
+
1. Project detection (package.json + lockfile + package manager)
|
|
70
|
+
2. Config file validity (evalai.config.json)
|
|
71
|
+
3. Baseline file (evals/baseline.json — schema, staleness)
|
|
72
|
+
4. Authentication (API key presence, redacted display)
|
|
73
|
+
5. Evaluation target (evaluationId configured)
|
|
74
|
+
6. API connectivity (reachable, latency)
|
|
75
|
+
7. Evaluation access (permissions, baseline presence)
|
|
76
|
+
8. CI wiring (.github/workflows/evalai-gate.yml)
|
|
77
|
+
9. Provider env vars (OpenAI/Anthropic/Azure — optional)
|
|
78
|
+
- **Exit codes**: `0` ready, `2` not ready, `3` infrastructure error
|
|
79
|
+
- **`--report`** flag outputs full JSON diagnostic bundle (versions, hashes, latency, all checks)
|
|
80
|
+
- **`--format json`** for machine-readable output
|
|
81
|
+
|
|
82
|
+
#### CLI — `evalai explain` (New Command)
|
|
83
|
+
|
|
84
|
+
- **Offline report explainer** — reads `.evalai/last-report.json` or `evals/regression-report.json` with zero flags
|
|
85
|
+
- **Top 3 failing test cases** with input/expected/actual
|
|
86
|
+
- **What changed** — baseline vs current with directional indicators
|
|
87
|
+
- **Root cause classification**: prompt drift, retrieval drift, formatting drift, tool-use drift, safety/cost/latency regression, coverage drop, baseline stale
|
|
88
|
+
- **Prioritized suggested fixes** with actionable commands
|
|
89
|
+
- Works with both `evalai check` reports (CheckReport) and `evalai gate` reports (BuiltinReport)
|
|
90
|
+
- **`--format json`** for CI pipeline consumption
|
|
91
|
+
|
|
92
|
+
#### Guided Failure Flow
|
|
93
|
+
|
|
94
|
+
- **`evalai check` now writes `.evalai/last-report.json`** automatically after every run
|
|
95
|
+
- **Failure hint**: prints `Next: evalai explain` on gate failure
|
|
96
|
+
- **GitHub step summary**: adds tip about `evalai explain` and report artifact location on failure
|
|
97
|
+
|
|
98
|
+
#### CI Template Improvements
|
|
99
|
+
|
|
100
|
+
- **Doctor preflight step** added to generated workflow (`continue-on-error: true`)
|
|
101
|
+
- **Report artifact upload** now includes both `evals/regression-report.json` and `.evalai/last-report.json`
|
|
102
|
+
|
|
103
|
+
#### `evalai init` Output Updated
|
|
104
|
+
|
|
105
|
+
- First recommendation: `npx evalai doctor` (verify setup)
|
|
106
|
+
- Full command reference: doctor, gate, check, explain, baseline update
|
|
107
|
+
|
|
108
|
+
#### CLI — `evalai print-config` (New Command)
|
|
109
|
+
|
|
110
|
+
- **Resolved config viewer** — prints every config field with its current value
|
|
111
|
+
- **Source-of-truth annotations**: `[file]`, `[env]`, `[default]`, `[profile]`, `[arg]` for each field
|
|
112
|
+
- **Secrets redacted** — API keys shown as `sk_t...abcd`
|
|
113
|
+
- **Environment summary** — shows all relevant env vars (EVALAI_API_KEY, OPENAI_API_KEY, CI, etc.)
|
|
114
|
+
- **`--format json`** for machine-readable output
|
|
115
|
+
- Accepts `--evaluationId`, `--baseUrl`, etc. to show how CLI args would merge
|
|
116
|
+
|
|
117
|
+
#### Minimal Green Example
|
|
118
|
+
|
|
119
|
+
- **`examples/minimal-green/`** — passes on first run, no account needed
|
|
120
|
+
- Zero dependencies, 3 `node:test` tests
|
|
121
|
+
- Clone → init → doctor → gate → ✅
|
|
122
|
+
|
|
123
|
+
### 🔧 Changed
|
|
124
|
+
|
|
125
|
+
- `evalai doctor` exit codes changed: was `0`/`1`, now `0`/`2`/`3`
|
|
126
|
+
- SDK README: added Debugging & Diagnostics section with guided flow diagram
|
|
127
|
+
- SDK README: added Doctor Exit Codes table
|
|
128
|
+
- Doctor test count: 4 → 29 tests; added 9 explain tests (38 total new tests)
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
8
132
|
## [1.7.0] - 2026-02-25
|
|
9
133
|
|
|
10
134
|
### ✨ Added
|
package/README.md
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
4
4
|
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
5
|
+
[](https://www.typescriptlang.org/)
|
|
6
|
+
[](#)
|
|
7
|
+
[](#)
|
|
8
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
9
|
|
|
6
10
|
**Stop LLM regressions in CI in minutes.**
|
|
7
11
|
|
|
@@ -54,9 +58,36 @@ That's it. Open a PR and CI blocks regressions automatically.
|
|
|
54
58
|
| Command | Description |
|
|
55
59
|
|---------|-------------|
|
|
56
60
|
| `npx evalai check` | Gate on quality score from dashboard |
|
|
57
|
-
| `npx evalai doctor` | Verify CI/CD setup |
|
|
58
61
|
| `npx evalai share` | Create share link for a run |
|
|
59
62
|
|
|
63
|
+
### Debugging & Diagnostics
|
|
64
|
+
|
|
65
|
+
| Command | Description |
|
|
66
|
+
|---------|-------------|
|
|
67
|
+
| `npx evalai doctor` | Comprehensive preflight checklist — verifies config, baseline, auth, API, CI wiring |
|
|
68
|
+
| `npx evalai explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
|
|
69
|
+
| `npx evalai print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
|
|
70
|
+
|
|
71
|
+
**Guided failure flow:**
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
evalai check → fails → "Next: evalai explain"
|
|
75
|
+
↓
|
|
76
|
+
evalai explain → root causes + fixes
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**GitHub Actions step summary** — gate result at a glance:
|
|
80
|
+
|
|
81
|
+

|
|
82
|
+
|
|
83
|
+
**`evalai explain` terminal output** — root causes + fix commands:
|
|
84
|
+
|
|
85
|
+

|
|
86
|
+
|
|
87
|
+
`check` automatically writes `.evalai/last-report.json` so `explain` works with zero flags.
|
|
88
|
+
|
|
89
|
+
`doctor` uses exit codes: **0** = ready, **2** = not ready, **3** = infra error. Use `--report` for a JSON diagnostic bundle.
|
|
90
|
+
|
|
60
91
|
### Gate Exit Codes
|
|
61
92
|
|
|
62
93
|
| Code | Meaning |
|
|
@@ -79,6 +110,14 @@ That's it. Open a PR and CI blocks regressions automatically.
|
|
|
79
110
|
| 7 | Weak evidence |
|
|
80
111
|
| 8 | Warn (soft regression) |
|
|
81
112
|
|
|
113
|
+
### Doctor Exit Codes
|
|
114
|
+
|
|
115
|
+
| Code | Meaning |
|
|
116
|
+
|------|---------|
|
|
117
|
+
| 0 | Ready — all checks passed |
|
|
118
|
+
| 2 | Not ready — one or more checks failed |
|
|
119
|
+
| 3 | Infrastructure error |
|
|
120
|
+
|
|
82
121
|
---
|
|
83
122
|
|
|
84
123
|
## How the Gate Works
|
|
@@ -224,6 +263,8 @@ Your local `openAIChatEval` runs continue to work. No account cancellation. No d
|
|
|
224
263
|
|
|
225
264
|
See [CHANGELOG.md](CHANGELOG.md) for the full release history.
|
|
226
265
|
|
|
266
|
+
**v1.8.0** — `evalai doctor` rewrite (9-check checklist), `evalai explain` command, guided failure flow, CI template with doctor preflight
|
|
267
|
+
|
|
227
268
|
**v1.7.0** — `evalai init` scaffolder, `evalai upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
|
|
228
269
|
|
|
229
270
|
**v1.6.0** — `evalai gate`, `evalai baseline`, regression gate constants & types
|
package/dist/cli/check.js
CHANGED
|
@@ -77,6 +77,7 @@ exports.EXIT = void 0;
|
|
|
77
77
|
exports.parseArgs = parseArgs;
|
|
78
78
|
exports.runCheck = runCheck;
|
|
79
79
|
const fs = __importStar(require("node:fs"));
|
|
80
|
+
const path = __importStar(require("node:path"));
|
|
80
81
|
const api_1 = require("./api");
|
|
81
82
|
const ci_context_1 = require("./ci-context");
|
|
82
83
|
const config_1 = require("./config");
|
|
@@ -260,12 +261,26 @@ async function runCheck(args) {
|
|
|
260
261
|
baselineRunId: quality?.baselineRunId ?? undefined,
|
|
261
262
|
ciRunUrl: ci?.runUrl ?? undefined,
|
|
262
263
|
});
|
|
264
|
+
// Persist report artifact so `evalai explain` works with zero flags
|
|
265
|
+
try {
|
|
266
|
+
const reportDir = path.join(process.cwd(), ".evalai");
|
|
267
|
+
if (!fs.existsSync(reportDir))
|
|
268
|
+
fs.mkdirSync(reportDir, { recursive: true });
|
|
269
|
+
fs.writeFileSync(path.join(reportDir, "last-report.json"), JSON.stringify(report, null, 2), "utf8");
|
|
270
|
+
}
|
|
271
|
+
catch {
|
|
272
|
+
// Non-fatal: best-effort artifact write
|
|
273
|
+
}
|
|
263
274
|
const formatted = args.format === "json"
|
|
264
275
|
? (0, json_1.formatJson)(report)
|
|
265
276
|
: args.format === "github"
|
|
266
277
|
? (0, github_1.formatGitHub)(report)
|
|
267
278
|
: (0, human_1.formatHuman)(report);
|
|
268
279
|
console.log(formatted);
|
|
280
|
+
// Guided flow hint on failure
|
|
281
|
+
if (!gateResult.passed) {
|
|
282
|
+
console.error("\nNext: evalai explain");
|
|
283
|
+
}
|
|
269
284
|
// --pr-comment-out: write markdown to file for GitHub Action to post
|
|
270
285
|
if (args.prCommentOut) {
|
|
271
286
|
try {
|
package/dist/cli/ci.d.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UX-401: One-command CI loop (evalai ci)
|
|
3
|
+
*
|
|
4
|
+
* Provides a single command teams put in .github/workflows/* and never think about again.
|
|
5
|
+
*/
|
|
6
|
+
import type { DiffResult } from "./diff";
|
|
7
|
+
import type { RunResult } from "./run";
|
|
8
|
+
/**
 * Options accepted by the `evalai ci` command.
 */
export interface CIOptions {
    /** Reference the run is diffed against; `runCI` only performs the diff step when this is set. */
    base?: string;
    /** When true, impact analysis runs first and the evaluation run is limited to impacted specs. */
    impactedOnly?: boolean;
    /** Requested output format. */
    format?: "human" | "json" | "github";
    /** Whether run results are written out (`runCI` treats an omitted value as `true`). */
    writeResults?: boolean;
}
|
|
21
|
+
/**
 * Outcome of a single `evalai ci` invocation.
 */
export interface CIResult {
    /** True only when the pipeline finished with exit code 0. */
    success: boolean;
    /** Process exit code; `runCI` uses 0 for clean, 1 for regressions or failed specs, 2 when an error was caught. */
    exitCode: number;
    /** Space-joined trail of the pipeline steps that completed. */
    narrative: string;
    /** Result of the evaluation run, when the run step was reached. */
    runResult?: RunResult;
    /** Result of the diff step, when a diff was executed. */
    diffResult?: DiffResult;
    /** Message of the error that aborted the pipeline, if any. */
    error?: string;
}
|
|
38
|
+
/**
 * Execute the CI pipeline for a project.
 *
 * @param options - CI command options.
 * @param projectRoot - Project directory; the implementation falls back to `process.cwd()` when omitted.
 * @returns The pipeline outcome, including the exit code the CLI should use.
 */
export declare function runCI(options: CIOptions, projectRoot?: string): Promise<CIResult>;
|
|
42
|
+
/**
 * Command-line entry point for `evalai ci`.
 *
 * The implementation prints the result and then terminates the process via
 * `process.exit` with the pipeline's exit code.
 *
 * @param options - CI command options.
 */
export declare function runCICLI(options: CIOptions): Promise<void>;
|
package/dist/cli/ci.js
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* UX-401: One-command CI loop (evalai ci)
|
|
4
|
+
*
|
|
5
|
+
* Provides a single command teams put in .github/workflows/* and never think about again.
|
|
6
|
+
*/
|
|
7
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
8
|
+
if (k2 === undefined) k2 = k;
|
|
9
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
10
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
11
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
12
|
+
}
|
|
13
|
+
Object.defineProperty(o, k2, desc);
|
|
14
|
+
}) : (function(o, m, k, k2) {
|
|
15
|
+
if (k2 === undefined) k2 = k;
|
|
16
|
+
o[k2] = m[k];
|
|
17
|
+
}));
|
|
18
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
19
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
20
|
+
}) : function(o, v) {
|
|
21
|
+
o["default"] = v;
|
|
22
|
+
});
|
|
23
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
24
|
+
var ownKeys = function(o) {
|
|
25
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
26
|
+
var ar = [];
|
|
27
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
28
|
+
return ar;
|
|
29
|
+
};
|
|
30
|
+
return ownKeys(o);
|
|
31
|
+
};
|
|
32
|
+
return function (mod) {
|
|
33
|
+
if (mod && mod.__esModule) return mod;
|
|
34
|
+
var result = {};
|
|
35
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
36
|
+
__setModuleDefault(result, mod);
|
|
37
|
+
return result;
|
|
38
|
+
};
|
|
39
|
+
})();
|
|
40
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
41
|
+
exports.runCI = runCI;
|
|
42
|
+
exports.runCICLI = runCICLI;
|
|
43
|
+
const fs = __importStar(require("node:fs/promises"));
|
|
44
|
+
const diff_1 = require("./diff");
|
|
45
|
+
const discover_1 = require("./discover");
|
|
46
|
+
const impact_analysis_1 = require("./impact-analysis");
|
|
47
|
+
const run_1 = require("./run");
|
|
48
|
+
const workspace_1 = require("./workspace");
|
|
49
|
+
/**
 * Execute the `evalai ci` pipeline: workspace → manifest → (impact analysis) → run → (diff).
 *
 * Never throws for pipeline failures: any error raised by a step is caught,
 * a debugging hint is printed via `printNextStep`, and the error is reported
 * in the returned result with exit code 2.
 *
 * @param options CI options (`base`, `impactedOnly`, `writeResults`).
 * @param projectRoot Project directory; defaults to `process.cwd()`.
 * @returns `{ success, exitCode, narrative, runResult, diffResult }` on a completed
 *   run (exit code 0 when clean, 1 on regressions or failed specs), or
 *   `{ success, exitCode: 2, narrative, error }` when a step threw.
 */
async function runCI(options, projectRoot = process.cwd()) {
    const workspace = (0, workspace_1.resolveEvalWorkspace)(projectRoot);
    const steps = [];
    // Build the result at the moment of return so the narrative reflects every step pushed so far.
    const outcome = (exitCode, details) => ({
        success: exitCode === 0,
        exitCode,
        narrative: steps.join(" "),
        ...details,
    });
    try {
        // Step 1: make sure the .evalai workspace directory exists.
        await fs.mkdir(workspace.evalaiDir, { recursive: true });
        steps.push("✅ workspace ok");

        // Step 2: reuse the manifest when it is accessible, otherwise build it.
        const hasManifest = await fs.access(workspace.manifestPath).then(() => true, () => false);
        if (hasManifest) {
            steps.push("→ manifest ok");
        } else {
            console.log("📋 Building evaluation manifest...");
            // NOTE(review): projectRoot is not forwarded here; presumably discoverSpecs
            // resolves against the current working directory — confirm.
            await (0, discover_1.discoverSpecs)({ manifest: true });
            steps.push("→ manifest built");
        }

        // Step 3: optional impact analysis (base branch falls back to "main").
        if (!options.impactedOnly) {
            steps.push("→ running all specs");
        } else {
            const impact = await (0, impact_analysis_1.runImpactAnalysis)({ baseBranch: options.base || "main" }, projectRoot);
            const impactedSpecCount = impact.metadata.impactedCount;
            steps.push(`→ impacted specs ${impactedSpecCount}`);
        }

        // Step 4: run the evaluations; results are written unless explicitly disabled.
        const runSettings = {
            impactedOnly: options.impactedOnly,
            baseBranch: options.base,
            writeResults: options.writeResults ?? true,
        };
        const runResult = await (0, run_1.runEvaluations)(runSettings, projectRoot);
        steps.push(`→ runId ${runResult.runId}`);

        // Step 5: diff against the base only when one was supplied.
        let diffResult;
        if (!options.base) {
            steps.push("→ no diff");
        } else {
            diffResult = await (0, diff_1.runDiff)({ base: options.base, head: "last" });
            if (diffResult.summary.regressions > 0) {
                steps.push(`→ diff ${diffResult.summary.regressions} regressions`);
                return outcome(1, { runResult, diffResult });
            }
            steps.push("→ diff clean");
        }

        // Step 6: failed specs fail the pipeline even when the diff is clean or skipped.
        const exitCode = runResult.summary.failed > 0 ? 1 : 0;
        return outcome(exitCode, { runResult, diffResult });
    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        // Print the copy/paste hint before reporting a config/infra failure.
        printNextStep(errorMessage, options, workspace);
        return outcome(2, { error: errorMessage });
    }
}
|
|
148
|
+
/**
|
|
149
|
+
* Print copy/paste debug flow
|
|
150
|
+
*/
|
|
151
|
+
function printNextStep(error, options, workspace) {
|
|
152
|
+
console.log("\n🔧 Next step for debugging:");
|
|
153
|
+
if (error.includes("No evaluation manifest found")) {
|
|
154
|
+
console.log(" evalai discover --manifest");
|
|
155
|
+
}
|
|
156
|
+
else if (error.includes("Base run report not found in CI environment")) {
|
|
157
|
+
console.log(` Download base artifact and run: evalai diff --base .evalai/base-run.json --head ${workspace.lastRunPath}`);
|
|
158
|
+
}
|
|
159
|
+
else if (options.base && error.includes("Base run report not found")) {
|
|
160
|
+
console.log(` evalai explain --report ${workspace.lastRunPath}`);
|
|
161
|
+
}
|
|
162
|
+
else {
|
|
163
|
+
console.log(` evalai explain --report ${workspace.lastRunPath}`);
|
|
164
|
+
}
|
|
165
|
+
console.log(` Artifacts: ${workspace.runsDir}/`);
|
|
166
|
+
}
|
|
167
|
+
/**
 * Command-line entry point for `evalai ci`.
 *
 * Runs the pipeline, prints the narrative, prints run/diff details and the
 * error message when the run was not clean, then exits the process with the
 * pipeline's exit code.
 *
 * @param options CI options forwarded to `runCI`.
 */
async function runCICLI(options) {
    const result = await runCI(options);
    const { runResult, diffResult } = result;
    const failed = !result.success;

    console.log(`🤖 ${result.narrative}`);

    // Details are only shown for unsuccessful runs.
    if (failed && runResult) {
        console.log("\n📊 Run Results:");
        console.log(` ✅ Passed: ${runResult.summary.passed}`);
        console.log(` ❌ Failed: ${runResult.summary.failed}`);
        console.log(` 📊 Pass Rate: ${(runResult.summary.passRate * 100).toFixed(1)}%`);
    }
    if (failed && diffResult) {
        console.log("\n🔄 Diff Results:");
        console.log(` 📉 Regressions: ${diffResult.summary.regressions}`);
        console.log(` 📈 Improvements: ${diffResult.summary.improvements}`);
        console.log(` 📊 Pass Rate Delta: ${(diffResult.summary.passRateDelta * 100).toFixed(1)}%`);
    }
    if (result.error) {
        console.log(`\n❌ Error: ${result.error}`);
    }

    // Hand the pipeline's exit code to the shell.
    process.exit(result.exitCode);
}
|
package/dist/cli/diff.d.ts
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TICKET 5 — Behavioral Diff CLI (EVAL-401)
|
|
3
|
+
*
|
|
4
|
+
* Goal: "Git diff for AI behavior" from two RunReports
|
|
5
|
+
*
|
|
6
|
+
* Command:
|
|
7
|
+
* evalai diff --base main (default uses git to find baseline run)
|
|
8
|
+
* evalai diff --a <runReportPath> --b <runReportPath>
|
|
9
|
+
* evalai diff main..feature (nice-to-have alias)
|
|
10
|
+
*/
|
|
11
|
+
import type { RunResult } from "./run";
|
|
12
|
+
/**
 * Schema version for diff output.
 */
export declare const DIFF_SCHEMA_VERSION = 1;
|
|
16
|
+
/**
 * RunReport schema versions this module supports.
 */
export declare const SUPPORTED_SCHEMA_VERSIONS: readonly [1];
|
|
20
|
+
/**
 * Rounding helper used for floating point normalization.
 *
 * @param value - Number to round.
 * @param precision - Optional precision; the default is not visible in this declaration.
 */
export declare function round(value: number, precision?: number): number;
/**
 * Rounding helper used for floating point normalization (percentage variant, judging by the name).
 *
 * @param value - Number to round.
 * @param precision - Optional precision; the default is not visible in this declaration.
 */
export declare function roundPct(value: number, precision?: number): number;
|
|
25
|
+
/**
 * Validate the schema version of a RunReport.
 *
 * @param report - Run report to validate.
 */
export declare function validateSchemaVersion(report: RunResult): void;
|
|
29
|
+
/**
 * Classification assigned to a spec-level change in a diff.
 */
export type DiffClassification =
    | "new_failure"
    | "fixed_failure"
    | "score_drop"
    | "score_improve"
    | "execution_error"
    | "skipped_change"
    | "added"
    | "removed";
|
|
33
|
+
/**
 * Diff entry for one spec.
 */
export interface SpecDiff {
    /** Identifier of the spec. */
    specId: string;
    /** Name of the spec. */
    name: string;
    /** Path of the file associated with the spec. */
    filePath: string;
    /** How the change was classified. */
    classification: DiffClassification;
    /** Result from the base run, when the spec exists there. */
    base?: {
        status: "passed" | "failed" | "skipped";
        score?: number;
        duration: number;
        error?: string;
    };
    /** Result from the head run, when the spec exists there. */
    head?: {
        status: "passed" | "failed" | "skipped";
        score?: number;
        duration: number;
        error?: string;
    };
    /** Deltas calculated between base and head. */
    deltas: {
        scoreDelta?: number;
        durationDelta?: number;
        statusChange?: string;
    };
}
|
|
66
|
+
/**
 * Aggregate statistics for a diff.
 */
export interface DiffSummary {
    /** Number of specs in the base run. */
    baseTotal: number;
    /** Number of specs in the head run. */
    headTotal: number;
    /** Change in pass rate. */
    passRateDelta: number;
    /** Change in average score. */
    scoreDelta: number;
    /** Count of regressions. */
    regressions: number;
    /** Count of improvements. */
    improvements: number;
    /** Count of added specs. */
    added: number;
    /** Count of removed specs. */
    removed: number;
}
|
|
87
|
+
/**
 * Full result of comparing two run reports.
 */
export interface DiffResult {
    /** Schema version of this result. */
    schemaVersion: number;
    /** The base run report. */
    base: RunResult;
    /** The head run report. */
    head: RunResult;
    /** Summary statistics for the diff. */
    summary: DiffSummary;
    /** Per-spec diff entries. */
    changedSpecs: SpecDiff[];
    /** Metadata about the diff itself. */
    metadata: {
        generatedAt: number;
        baseSource: string;
        headSource: string;
    };
}
|
|
108
|
+
/**
 * Options for a diff run.
 */
export interface DiffOptions {
    /** Path of the base report, or a branch name. */
    base?: string;
    /** Path of the head report. */
    head?: string;
    /** Output format. */
    format?: "human" | "json";
}
|
|
119
|
+
/**
 * Run a diff comparison.
 *
 * @param options - Diff options (base, head, format).
 * @returns The resulting diff.
 */
export declare function runDiff(options: DiffOptions): Promise<DiffResult>;
|
|
123
|
+
/**
 * Compare two run reports.
 *
 * @param base - Base run report.
 * @param head - Head run report.
 * @returns The diff between the two reports.
 */
export declare function compareReports(base: RunResult, head: RunResult): DiffResult;
|
|
127
|
+
/**
 * Classify the type of change between a base and a head spec result.
 *
 * @param base - Spec result from the base run; optional.
 * @param head - Spec result from the head run; optional.
 */
declare function classifyDiff(base?: RunResult["results"][0], head?: RunResult["results"][0]): DiffClassification;
|
|
131
|
+
/**
 * Calculate the deltas between a base and a head spec result.
 *
 * @param base - Spec result from the base run; optional.
 * @param head - Spec result from the head run; optional.
 */
declare function calculateDeltas(base?: RunResult["results"][0], head?: RunResult["results"][0]): SpecDiff["deltas"];
|
|
135
|
+
/**
 * Calculate summary statistics for a diff.
 *
 * @param base - Base run report.
 * @param head - Head run report.
 * @param changedSpecs - Per-spec diff entries.
 */
export declare function calculateDiffSummary(base: RunResult, head: RunResult, changedSpecs: SpecDiff[]): DiffSummary;
|
|
139
|
+
/**
 * Print diff results in human-readable form.
 *
 * @param result - Diff result to print.
 */
export declare function printHumanResults(result: DiffResult): void;
|
|
143
|
+
/**
 * Print diff results as JSON.
 *
 * @param result - Diff result to print.
 */
export declare function printJsonResults(result: DiffResult): void;
|
|
147
|
+
/**
 * Write the GitHub Step Summary for a diff.
 *
 * @param result - Diff result to summarize.
 */
export declare function writeGitHubStepSummary(result: DiffResult): Promise<void>;
|
|
151
|
+
/**
 * Command-line entry point for the diff command.
 *
 * @param options - Diff options.
 */
export declare function runDiffCLI(options: DiffOptions): Promise<void>;
|
|
155
|
+
export { classifyDiff, calculateDeltas };
|
|
156
|
+
export declare const diffCore: {
    /** Typed as `compareReports`: compares two run reports and returns the diff result. */
    readonly diffRunReports: typeof compareReports;
    /** Typed as `classifyDiff`: classifies the type of change between two specs. */
    readonly classifyChange: typeof classifyDiff;
    /** Typed as `calculateDiffSummary`: calculates summary statistics for a diff. */
    readonly summarizeDiff: typeof calculateDiffSummary;
    /** Typed as `calculateDeltas`: calculates deltas between two spec results. */
    readonly calculateDeltas: typeof calculateDeltas;
};
|