@pauly4010/evalai-sdk 1.8.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +54 -0
  2. package/dist/cli/ci.d.ts +45 -0
  3. package/dist/cli/ci.js +192 -0
  4. package/dist/cli/diff.d.ts +173 -0
  5. package/dist/cli/diff.js +680 -0
  6. package/dist/cli/discover.d.ts +84 -0
  7. package/dist/cli/discover.js +408 -0
  8. package/dist/cli/doctor.js +19 -10
  9. package/dist/cli/env.d.ts +21 -0
  10. package/dist/cli/env.js +42 -0
  11. package/dist/cli/explain.js +143 -37
  12. package/dist/cli/impact-analysis.d.ts +63 -0
  13. package/dist/cli/impact-analysis.js +251 -0
  14. package/dist/cli/index.js +173 -0
  15. package/dist/cli/manifest.d.ts +105 -0
  16. package/dist/cli/manifest.js +275 -0
  17. package/dist/cli/migrate.d.ts +41 -0
  18. package/dist/cli/migrate.js +349 -0
  19. package/dist/cli/print-config.js +18 -14
  20. package/dist/cli/run.d.ts +101 -0
  21. package/dist/cli/run.js +389 -0
  22. package/dist/cli/workspace.d.ts +28 -0
  23. package/dist/cli/workspace.js +58 -0
  24. package/dist/index.d.ts +6 -0
  25. package/dist/index.js +30 -5
  26. package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
  27. package/dist/runtime/adapters/config-to-dsl.js +391 -0
  28. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
  29. package/dist/runtime/adapters/testsuite-to-dsl.js +271 -0
  30. package/dist/runtime/context.d.ts +26 -0
  31. package/dist/runtime/context.js +74 -0
  32. package/dist/runtime/eval.d.ts +46 -0
  33. package/dist/runtime/eval.js +237 -0
  34. package/dist/runtime/execution-mode.d.ts +80 -0
  35. package/dist/runtime/execution-mode.js +353 -0
  36. package/dist/runtime/executor.d.ts +16 -0
  37. package/dist/runtime/executor.js +152 -0
  38. package/dist/runtime/registry.d.ts +78 -0
  39. package/dist/runtime/registry.js +416 -0
  40. package/dist/runtime/run-report.d.ts +202 -0
  41. package/dist/runtime/run-report.js +220 -0
  42. package/dist/runtime/types.d.ts +356 -0
  43. package/dist/runtime/types.js +76 -0
  44. package/dist/testing.d.ts +65 -0
  45. package/dist/testing.js +42 -0
  46. package/dist/version.d.ts +1 -1
  47. package/dist/version.js +1 -1
  48. package/package.json +4 -3
package/CHANGELOG.md CHANGED
@@ -5,6 +5,60 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.9.0] - 2026-02-27
9
+
10
+ ### ✨ Added
11
+
12
+ #### CLI — One-Command CI Loop (`evalai ci`)
13
+
14
+ - **`evalai ci`** — Single command teams put in GitHub workflows and never think about again
15
+ - **Complete CI pipeline**: discover → manifest → impact → run → diff → PR summary → safe failure → "next step"
16
+ - **Automatic manifest building**: Builds manifest if missing, no manual steps required
17
+ - **Impact analysis integration**: `--impacted-only` flag for targeted testing
18
+ - **Smart exit codes**: 0=clean, 1=regressions or failed specs, 2=config/infra issues
19
+ - **Self-documenting failures**: Always prints copy/paste next step for debugging
20
+ - **GitHub Step Summary integration**: Automatic PR summaries with regressions and artifacts
21
+
22
+ #### CLI — Durable Run History & Diff System
23
+
24
+ - **Run artifact retention**: Timestamped artifacts in `.evalai/runs/run-<runId>.json`
25
+ - **Run index file**: `.evalai/runs/index.json` tracks all runs with metadata
26
+ - **Schema versioning**: `RunResult` and `DiffResult` include `schemaVersion` for compatibility
27
+ - **Base/head shortcuts**: `--base baseline`, `--base last`, `--head last` for common cases
28
+ - **Floating point normalization**: Consistent score/delta calculations across runs
29
+ - **Comprehensive diff comparison**: Classifies regressions, improvements, added, removed specs
30
+
31
+ #### CLI — Centralized Architecture
32
+
33
+ - **Environment detection**: `isCI()`, `isGitHubActions()`, `getGitHubStepSummaryPath()` unified
34
+ - **Workspace resolution**: `resolveEvalWorkspace()` provides all `.evalai` paths
35
+ - **Git reference detection**: Comprehensive patterns for branches, tags, and ranges
36
+ - **No more duplication**: All commands use shared utilities for consistency
37
+
38
+ #### CLI — CI Friendliness
39
+
40
+ - **Fail-safe base resolution**: Clear error messages when base artifacts missing in CI
41
+ - **GitHub Step Summary**: Rich markdown summaries with metrics, regressions, and artifact links
42
+ - **CI-specific error handling**: Exit code 2 for config issues with helpful guidance
43
+ - **Artifact download instructions**: Exact commands for manual base artifact setup
44
+
45
+ ### 🔧 Changed
46
+
47
+ - **Exit codes standardized**: 0=clean, 1=regressions, 2=config/infra issues across all commands
48
+ - **Schema compatibility**: Added `schemaVersion` validation for future-proofing
49
+ - **Path resolution**: All commands use centralized workspace helpers
50
+ - **Error messages**: More actionable and context-aware guidance
51
+
52
+ ### 📊 New Features Summary
53
+
54
+ - **One-command CI**: `evalai ci` replaces multi-step workflows
55
+ - **Durable history**: Run artifacts preserved with smart indexing
56
+ - **Smart diffing**: Automated regression detection with GitHub integration
57
+ - **Centralized utilities**: Environment detection and workspace resolution unified
58
+ - **Self-documenting**: Clear next steps for any failure scenario
59
+
60
+ ---
61
+
8
62
  ## [1.8.0] - 2026-02-26
9
63
 
10
64
  ### ✨ Added
@@ -0,0 +1,45 @@
1
+ /**
2
+ * UX-401: One-command CI loop (evalai ci)
3
+ *
4
+ * Provides a single command teams put in .github/workflows/* and never think about again.
5
+ */
6
+ import type { DiffResult } from "./diff";
7
+ import type { RunResult } from "./run";
8
+ /**
9
+ * CI command options accepted by runCI() and runCICLI()
10
+ */
11
+ export interface CIOptions {
12
+ /** Base reference for diff comparison; runCI only runs its diff step when this is set, and also uses it (falling back to "main") as the impact-analysis base branch */
13
+ base?: string;
14
+ /** Run only impacted specs; when true, runCI runs impact analysis before the evaluations */
15
+ impactedOnly?: boolean;
16
+ /** Output format. NOTE(review): not read by runCI or runCICLI in ci.js; presumably consumed by the CLI layer, confirm */
17
+ format?: "human" | "json" | "github";
18
+ /** Write run results; runCI treats an undefined value as true */
19
+ writeResults?: boolean;
20
+ }
21
+ /**
22
+ * CI execution result returned by runCI()
23
+ */
24
+ export interface CIResult {
25
+ /** Success status; runCI sets this to true only together with exitCode 0 */
26
+ success: boolean;
27
+ /** Exit code set by runCI: 0 = clean, 1 = diff regressions or failed specs, 2 = a step threw (config/infra issue) */
28
+ exitCode: number;
29
+ /** Execution narrative: the step markers recorded by runCI, joined with single spaces */
30
+ narrative: string;
31
+ /** Run result (if executed); absent on the exitCode 2 error path */
32
+ runResult?: RunResult;
33
+ /** Diff result (if executed); only produced when a base reference was provided */
34
+ diffResult?: DiffResult;
35
+ /** Error message (if failed); set only on the exitCode 2 error path */
36
+ error?: string;
37
+ }
38
+ /**
39
+ * Run CI command; projectRoot defaults to process.cwd() in ci.js, and errors thrown inside the pipeline are returned as a CIResult with exitCode 2
40
+ */
41
+ export declare function runCI(options: CIOptions, projectRoot?: string): Promise<CIResult>;
42
+ /**
43
+ * CLI entry point: awaits runCI, prints the narrative and any failure details, then calls process.exit with the result's exit code
44
+ */
45
+ export declare function runCICLI(options: CIOptions): Promise<void>;
package/dist/cli/ci.js ADDED
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ /**
3
+ * UX-401: One-command CI loop (evalai ci)
4
+ *
5
+ * Provides a single command teams put in .github/workflows/* and never think about again.
6
+ */
7
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
8
+ if (k2 === undefined) k2 = k;
9
+ var desc = Object.getOwnPropertyDescriptor(m, k);
10
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
11
+ desc = { enumerable: true, get: function() { return m[k]; } };
12
+ }
13
+ Object.defineProperty(o, k2, desc);
14
+ }) : (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ o[k2] = m[k];
17
+ }));
18
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
19
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
20
+ }) : function(o, v) {
21
+ o["default"] = v;
22
+ });
23
+ var __importStar = (this && this.__importStar) || (function () {
24
+ var ownKeys = function(o) {
25
+ ownKeys = Object.getOwnPropertyNames || function (o) {
26
+ var ar = [];
27
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
28
+ return ar;
29
+ };
30
+ return ownKeys(o);
31
+ };
32
+ return function (mod) {
33
+ if (mod && mod.__esModule) return mod;
34
+ var result = {};
35
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
36
+ __setModuleDefault(result, mod);
37
+ return result;
38
+ };
39
+ })();
40
+ Object.defineProperty(exports, "__esModule", { value: true });
41
+ exports.runCI = runCI;
42
+ exports.runCICLI = runCICLI;
43
+ const fs = __importStar(require("node:fs/promises"));
44
+ const diff_1 = require("./diff");
45
+ const discover_1 = require("./discover");
46
+ const impact_analysis_1 = require("./impact-analysis");
47
+ const run_1 = require("./run");
48
+ const workspace_1 = require("./workspace");
/**
 * Run the one-command CI loop for a project.
 *
 * Steps, in order: ensure the `.evalai` workspace directory exists, build the
 * manifest if it is missing, optionally run impact analysis, run the
 * evaluations, and (only when `options.base` is set) diff the run against
 * that base.
 *
 * @param options - CI options. `base`, `impactedOnly` and `writeResults` are
 *   read here; `format` is not.
 * @param projectRoot - Project root used to resolve the workspace paths;
 *   defaults to `process.cwd()`.
 * @returns A result whose `exitCode` is 0 (clean), 1 (diff regressions or
 *   failed specs) or 2 (a step inside the pipeline threw; the message is
 *   returned in `error` and a next-step hint is printed).
 */
async function runCI(options, projectRoot = process.cwd()) {
    const workspace = (0, workspace_1.resolveEvalWorkspace)(projectRoot);
    const narrative = [];
    try {
        // 1. Ensure .evalai workspace exists
        await fs.mkdir(workspace.evalaiDir, { recursive: true });
        narrative.push("✅ workspace ok");
        // 2. Ensure manifest exists (build if missing)
        let manifestExists = true;
        try {
            await fs.access(workspace.manifestPath);
        }
        catch {
            manifestExists = false;
        }
        if (!manifestExists) {
            console.log("📋 Building evaluation manifest...");
            // NOTE(review): projectRoot is not forwarded here; presumably
            // discoverSpecs works from process.cwd(). Confirm in discover.js.
            await (0, discover_1.discoverSpecs)({ manifest: true });
            narrative.push("→ manifest built");
        }
        else {
            narrative.push("→ manifest ok");
        }
        // Resolve the base branch once. With --impacted-only and no --base the
        // impact analysis falls back to "main"; the run must receive the same
        // value so the reported impacted count and the executed specs are
        // computed against the same base. Without --impacted-only the value
        // passed to the run is unchanged (options.base, possibly undefined).
        const baseBranch = options.impactedOnly ? options.base || "main" : options.base;
        // 3. Run impact analysis if --impacted-only
        let impactedSpecCount;
        if (options.impactedOnly) {
            const impactResult = await (0, impact_analysis_1.runImpactAnalysis)({
                baseBranch,
            }, projectRoot);
            impactedSpecCount = impactResult.metadata.impactedCount;
            narrative.push(`→ impacted specs ${impactedSpecCount}`);
        }
        else {
            narrative.push("→ running all specs");
        }
        // 4. Run evaluations (results are written unless explicitly disabled)
        const runResult = await (0, run_1.runEvaluations)({
            impactedOnly: options.impactedOnly,
            baseBranch,
            writeResults: options.writeResults ?? true,
        }, projectRoot);
        narrative.push(`→ runId ${runResult.runId}`);
        // 5. Run diff if --base provided
        let diffResult;
        if (options.base) {
            // NOTE(review): "last" presumably resolves to the most recently
            // written run artifact; if writeResults was explicitly false this
            // may not be the run executed above. Confirm in diff.js.
            diffResult = await (0, diff_1.runDiff)({
                base: options.base,
                head: "last",
            });
            if (diffResult.summary.regressions > 0) {
                narrative.push(`→ diff ${diffResult.summary.regressions} regressions`);
                return {
                    success: false,
                    exitCode: 1,
                    narrative: narrative.join(" "),
                    runResult,
                    diffResult,
                };
            }
            narrative.push("→ diff clean");
        }
        else {
            narrative.push("→ no diff");
        }
        // 6. Failed specs fail the build even when the diff is clean or absent
        if (runResult.summary.failed > 0) {
            return {
                success: false,
                exitCode: 1,
                narrative: narrative.join(" "),
                runResult,
                diffResult,
            };
        }
        return {
            success: true,
            exitCode: 0,
            narrative: narrative.join(" "),
            runResult,
            diffResult,
        };
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        // Print next step for debugging
        printNextStep(errorMessage, options, workspace);
        return {
            success: false,
            exitCode: 2, // Config/infra issue
            narrative: narrative.join(" "),
            error: errorMessage,
        };
    }
}
/**
 * Print a copy/paste "next step" hint for a failed CI run.
 *
 * The hint is chosen by substring match on the error message:
 *   - "No evaluation manifest found" suggests rebuilding the manifest;
 *   - "Base run report not found in CI environment" suggests downloading the
 *     base artifact and diffing it against the last run;
 *   - anything else suggests explaining the last run report.
 * The runs directory is always printed last.
 *
 * @param error - Error message text to classify.
 * @param options - CI options; kept for signature compatibility. The generic
 *   "Base run report not found" case prints the same hint as the fallback,
 *   so no option is consulted.
 * @param workspace - Resolved workspace; `lastRunPath` and `runsDir` are read.
 */
function printNextStep(error, options, workspace) {
    console.log("\n🔧 Next step for debugging:");
    let hint;
    if (error.includes("No evaluation manifest found")) {
        hint = " evalai discover --manifest";
    }
    else if (error.includes("Base run report not found in CI environment")) {
        hint = ` Download base artifact and run: evalai diff --base .evalai/base-run.json --head ${workspace.lastRunPath}`;
    }
    else {
        hint = ` evalai explain --report ${workspace.lastRunPath}`;
    }
    console.log(hint);
    console.log(` Artifacts: ${workspace.runsDir}/`);
}
/**
 * Command-line wrapper around runCI.
 *
 * Awaits runCI with the given options, prints the one-line narrative, prints
 * run and diff summaries when the result is not successful, prints the error
 * message when one is present, and finally terminates the process with the
 * result's exit code.
 *
 * @param options - CI options forwarded unchanged to runCI.
 */
async function runCICLI(options) {
    const outcome = await runCI(options);
    const notClean = !outcome.success;
    // One-line narrative of the steps that ran
    console.log(`🤖 ${outcome.narrative}`);
    // Run totals, shown only for unsuccessful results that got as far as a run
    if (notClean && outcome.runResult) {
        const run = outcome.runResult;
        console.log("\n📊 Run Results:");
        console.log(` ✅ Passed: ${run.summary.passed}`);
        console.log(` ❌ Failed: ${run.summary.failed}`);
        console.log(` 📊 Pass Rate: ${(run.summary.passRate * 100).toFixed(1)}%`);
    }
    // Diff totals, shown only for unsuccessful results that include a diff
    if (notClean && outcome.diffResult) {
        const diff = outcome.diffResult;
        console.log("\n🔄 Diff Results:");
        console.log(` 📉 Regressions: ${diff.summary.regressions}`);
        console.log(` 📈 Improvements: ${diff.summary.improvements}`);
        console.log(` 📊 Pass Rate Delta: ${(diff.summary.passRateDelta * 100).toFixed(1)}%`);
    }
    if (outcome.error) {
        console.log(`\n❌ Error: ${outcome.error}`);
    }
    // Hand the exit code to the shell
    process.exit(outcome.exitCode);
}
@@ -0,0 +1,173 @@
1
+ /**
2
+ * TICKET 5 — Behavioral Diff CLI (EVAL-401)
3
+ *
4
+ * Goal: "Git diff for AI behavior" from two RunReports
5
+ *
6
+ * Command:
7
+ * evalai diff --base main (default uses git to find baseline run)
8
+ * evalai diff --base <runReportPath> --head <runReportPath>
9
+ * evalai diff main..feature (nice-to-have alias)
10
+ */
11
+ import type { RunResult } from "./run";
12
+ /**
13
+ * Diff schema version
14
+ */
15
+ export declare const DIFF_SCHEMA_VERSION = 1;
16
+ /**
17
+ * Supported RunReport schema versions
18
+ */
19
+ export declare const SUPPORTED_SCHEMA_VERSIONS: readonly [1];
20
+ /**
21
+ * Rounding helpers for floating point normalization
22
+ */
23
+ export declare function round(value: number, precision?: number): number;
24
+ export declare function roundPct(value: number, precision?: number): number;
25
+ /**
26
+ * Validate RunReport schema version
27
+ */
28
+ export declare function validateSchemaVersion(report: RunResult): void;
29
+ /**
30
+ * Diff result classification
31
+ */
32
+ export type DiffClassification = "new_failure" | "fixed_failure" | "score_drop" | "score_improve" | "execution_error" | "skipped_change" | "added" | "removed";
33
+ /**
34
+ * Individual spec diff
35
+ */
36
+ export interface SpecDiff {
37
+ /** Spec identifier */
38
+ specId: string;
39
+ /** Spec name */
40
+ name: string;
41
+ /** File path */
42
+ filePath: string;
43
+ /** Classification of change */
44
+ classification: DiffClassification;
45
+ /** Base run result (if exists) */
46
+ base?: {
47
+ status: "passed" | "failed" | "skipped";
48
+ score?: number;
49
+ duration: number;
50
+ error?: string;
51
+ };
52
+ /** Head run result (if exists) */
53
+ head?: {
54
+ status: "passed" | "failed" | "skipped";
55
+ score?: number;
56
+ duration: number;
57
+ error?: string;
58
+ };
59
+ /** Calculated deltas */
60
+ deltas: {
61
+ scoreDelta?: number;
62
+ durationDelta?: number;
63
+ statusChange?: string;
64
+ };
65
+ }
66
+ /**
67
+ * Diff summary statistics
68
+ */
69
+ export interface DiffSummary {
70
+ /** Total specs in base */
71
+ baseTotal: number;
72
+ /** Total specs in head */
73
+ headTotal: number;
74
+ /** Pass rate delta; ci.js multiplies this by 100 to print a percentage, so it is presumably a 0-1 fraction (confirm in diff.js) */
75
+ passRateDelta: number;
76
+ /** Score delta (average) */
77
+ scoreDelta: number;
78
+ /** Number of regressions; runCI in ci.js returns exit code 1 when this is greater than 0 */
79
+ regressions: number;
80
+ /** Number of improvements */
81
+ improvements: number;
82
+ /** Number of added specs */
83
+ added: number;
84
+ /** Number of removed specs */
85
+ removed: number;
86
+ }
87
+ /**
88
+ * Complete diff result
89
+ */
90
+ export interface DiffResult {
91
+ /** Schema version */
92
+ schemaVersion: number;
93
+ /** Base run report */
94
+ base: RunResult;
95
+ /** Head run report */
96
+ head: RunResult;
97
+ /** Diff summary */
98
+ summary: DiffSummary;
99
+ /** Individual spec diffs */
100
+ changedSpecs: SpecDiff[];
101
+ /** Diff metadata */
102
+ metadata: {
103
+ generatedAt: number;
104
+ baseSource: string;
105
+ headSource: string;
106
+ };
107
+ }
108
+ /**
109
+ * Diff options passed to runDiff() and runDiffCLI()
110
+ */
111
+ export interface DiffOptions {
112
+ /** Base report path or branch; the changelog also lists the shortcuts `baseline` and `last` */
113
+ base?: string;
114
+ /** Head report path; the changelog lists the shortcut `last`, which is the value runCI passes */
115
+ head?: string;
116
+ /** Output format */
117
+ format?: "human" | "json";
118
+ }
119
+ /**
120
+ * Run diff comparison
121
+ */
122
+ export declare function runDiff(options: DiffOptions): Promise<DiffResult>;
123
+ /**
124
+ * Compare two run reports
125
+ */
126
+ export declare function compareReports(base: RunResult, head: RunResult): DiffResult;
127
+ /**
128
+ * Classify the type of change
129
+ */
130
+ declare function classifyDiff(base?: RunResult["results"][0], head?: RunResult["results"][0]): DiffClassification;
131
+ /**
132
+ * Calculate deltas between base and head
133
+ */
134
+ declare function calculateDeltas(base?: RunResult["results"][0], head?: RunResult["results"][0]): SpecDiff["deltas"];
135
+ /**
136
+ * Calculate diff summary statistics
137
+ */
138
+ export declare function calculateDiffSummary(base: RunResult, head: RunResult, changedSpecs: SpecDiff[]): DiffSummary;
139
+ /**
140
+ * Print human-readable diff results
141
+ */
142
+ export declare function printHumanResults(result: DiffResult): void;
143
+ /**
144
+ * Print JSON results
145
+ */
146
+ export declare function printJsonResults(result: DiffResult): void;
147
+ /**
148
+ * Write GitHub Step Summary
149
+ */
150
+ export declare function writeGitHubStepSummary(result: DiffResult): Promise<void>;
151
+ /**
152
+ * CLI entry point
153
+ */
154
+ export declare function runDiffCLI(options: DiffOptions): Promise<void>;
155
+ export { classifyDiff, calculateDeltas };
156
+ export declare const diffCore: {
157
+ /**
158
+ * Compare two run reports and return diff result
159
+ */
160
+ readonly diffRunReports: typeof compareReports;
161
+ /**
162
+ * Classify the type of change between two specs
163
+ */
164
+ readonly classifyChange: typeof classifyDiff;
165
+ /**
166
+ * Calculate summary statistics for a diff
167
+ */
168
+ readonly summarizeDiff: typeof calculateDiffSummary;
169
+ /**
170
+ * Calculate deltas between two spec results
171
+ */
172
+ readonly calculateDeltas: typeof calculateDeltas;
173
+ };