@pauly4010/evalai-sdk 1.8.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/CHANGELOG.md +54 -0
  2. package/README.md +136 -23
  3. package/dist/assertions.js +51 -18
  4. package/dist/batch.js +8 -2
  5. package/dist/cli/api.js +3 -1
  6. package/dist/cli/check.js +19 -6
  7. package/dist/cli/ci-context.js +3 -1
  8. package/dist/cli/ci.d.ts +45 -0
  9. package/dist/cli/ci.js +192 -0
  10. package/dist/cli/config.js +28 -8
  11. package/dist/cli/diff.d.ts +173 -0
  12. package/dist/cli/diff.js +685 -0
  13. package/dist/cli/discover.d.ts +84 -0
  14. package/dist/cli/discover.js +419 -0
  15. package/dist/cli/doctor.js +62 -19
  16. package/dist/cli/env.d.ts +21 -0
  17. package/dist/cli/env.js +42 -0
  18. package/dist/cli/explain.js +168 -36
  19. package/dist/cli/formatters/human.js +4 -1
  20. package/dist/cli/formatters/pr-comment.js +3 -1
  21. package/dist/cli/gate.js +6 -2
  22. package/dist/cli/impact-analysis.d.ts +63 -0
  23. package/dist/cli/impact-analysis.js +252 -0
  24. package/dist/cli/index.js +185 -0
  25. package/dist/cli/manifest.d.ts +103 -0
  26. package/dist/cli/manifest.js +282 -0
  27. package/dist/cli/migrate.d.ts +41 -0
  28. package/dist/cli/migrate.js +349 -0
  29. package/dist/cli/policy-packs.js +8 -2
  30. package/dist/cli/print-config.js +33 -14
  31. package/dist/cli/regression-gate.js +8 -2
  32. package/dist/cli/report/build-check-report.js +8 -2
  33. package/dist/cli/run.d.ts +101 -0
  34. package/dist/cli/run.js +395 -0
  35. package/dist/cli/share.js +3 -1
  36. package/dist/cli/upgrade.js +2 -1
  37. package/dist/cli/workspace.d.ts +28 -0
  38. package/dist/cli/workspace.js +58 -0
  39. package/dist/client.d.ts +16 -19
  40. package/dist/client.js +60 -43
  41. package/dist/client.request.test.d.ts +1 -1
  42. package/dist/client.request.test.js +222 -147
  43. package/dist/context.js +3 -1
  44. package/dist/errors.js +11 -4
  45. package/dist/export.js +3 -1
  46. package/dist/index.d.ts +8 -2
  47. package/dist/index.js +30 -5
  48. package/dist/integrations/anthropic.d.ts +20 -1
  49. package/dist/integrations/openai-eval.js +4 -2
  50. package/dist/integrations/openai.d.ts +24 -1
  51. package/dist/local.js +3 -1
  52. package/dist/logger.js +6 -2
  53. package/dist/pagination.js +6 -2
  54. package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
  55. package/dist/runtime/adapters/config-to-dsl.js +394 -0
  56. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
  57. package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
  58. package/dist/runtime/context.d.ts +26 -0
  59. package/dist/runtime/context.js +74 -0
  60. package/dist/runtime/eval.d.ts +46 -0
  61. package/dist/runtime/eval.js +244 -0
  62. package/dist/runtime/execution-mode.d.ts +80 -0
  63. package/dist/runtime/execution-mode.js +357 -0
  64. package/dist/runtime/executor.d.ts +16 -0
  65. package/dist/runtime/executor.js +152 -0
  66. package/dist/runtime/registry.d.ts +78 -0
  67. package/dist/runtime/registry.js +403 -0
  68. package/dist/runtime/run-report.d.ts +200 -0
  69. package/dist/runtime/run-report.js +222 -0
  70. package/dist/runtime/types.d.ts +356 -0
  71. package/dist/runtime/types.js +76 -0
  72. package/dist/testing.d.ts +65 -0
  73. package/dist/testing.js +49 -2
  74. package/dist/types.d.ts +100 -69
  75. package/dist/utils/input-hash.js +4 -1
  76. package/dist/version.d.ts +1 -1
  77. package/dist/version.js +1 -1
  78. package/dist/workflows.js +62 -14
  79. package/package.json +115 -110
package/dist/cli/ci.js ADDED
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ /**
3
+ * UX-401: One-command CI loop (evalai ci)
4
+ *
5
+ * Provides a single command teams put in .github/workflows/* and never think about again.
6
+ */
7
// Compiler-emitted module interop helper; an implementation already present on `this` is reused.
// Exposes property `k` of module `m` on object `o` under the name `k2` (`k2` defaults to `k`).
// When Object.create is available the property is installed with Object.defineProperty. The
// source descriptor is reused unless it is missing, is an accessor on a module that is not
// flagged `__esModule`, or is a writable/configurable data property; in those cases an
// enumerable getter that reads `m[k]` at access time is installed instead.
// Without Object.create the current value is copied by plain assignment.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
8
+ if (k2 === undefined) k2 = k;
9
+ var desc = Object.getOwnPropertyDescriptor(m, k);
10
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
11
+ desc = { enumerable: true, get: function() { return m[k]; } };
12
+ }
13
+ Object.defineProperty(o, k2, desc);
14
+ }) : (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ o[k2] = m[k];
17
+ }));
18
// Compiler-emitted module interop helper; an implementation already present on `this` is reused.
// Stores `v` as the "default" property of `o`: as an enumerable property defined through
// Object.defineProperty when Object.create is available, otherwise by plain assignment.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
19
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
20
+ }) : function(o, v) {
21
+ o["default"] = v;
22
+ });
23
// Compiler-emitted module interop helper; an implementation already present on `this` is reused.
// Returns `mod` untouched when it is flagged `__esModule`. Otherwise builds a fresh object,
// binds every own key of `mod` except "default" onto it via __createBinding, and stores `mod`
// itself as the object's "default" via __setModuleDefault.
// `ownKeys` replaces itself on first call with Object.getOwnPropertyNames, or with a
// for-in/hasOwnProperty fallback when that function is unavailable.
+ var __importStar = (this && this.__importStar) || (function () {
24
+ var ownKeys = function(o) {
25
+ ownKeys = Object.getOwnPropertyNames || function (o) {
26
+ var ar = [];
27
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
28
+ return ar;
29
+ };
30
+ return ownKeys(o);
31
+ };
32
+ return function (mod) {
33
+ if (mod && mod.__esModule) return mod;
34
+ var result = {};
35
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
36
+ __setModuleDefault(result, mod);
37
+ return result;
38
+ };
39
+ })();
40
+ Object.defineProperty(exports, "__esModule", { value: true });
41
+ exports.runCI = runCI;
42
+ exports.runCICLI = runCICLI;
43
+ const fs = __importStar(require("node:fs/promises"));
44
+ const diff_1 = require("./diff");
45
+ const discover_1 = require("./discover");
46
+ const impact_analysis_1 = require("./impact-analysis");
47
+ const run_1 = require("./run");
48
+ const workspace_1 = require("./workspace");
49
/**
 * Runs the `evalai ci` pipeline for one project and reports the outcome as a
 * result object instead of throwing.
 *
 * Steps, in order: create the workspace directory, make sure the manifest file
 * can be accessed (running spec discovery when it cannot), optionally run
 * impact analysis, run the evaluations, and, when `options.base` is set, diff
 * the "last" run against that base.
 *
 * @param options CI options; this function reads `impactedOnly`, `base` and `writeResults`.
 * @param projectRoot Directory used to resolve the workspace; defaults to `process.cwd()`.
 * @returns `{ success, exitCode, narrative, ... }`. `exitCode` is 0 when clean,
 *   1 when the diff reports regressions or the run has failed specs (both carry
 *   `runResult` and `diffResult`), and 2 when any step threw (carries `error`).
 */
async function runCI(options, projectRoot = process.cwd()) {
    const workspace = (0, workspace_1.resolveEvalWorkspace)(projectRoot);
    const steps = [];
    // Single place where result objects are assembled; the narrative is joined
    // at call time so it reflects every step recorded so far.
    const finish = (exitCode, details) => ({
        success: exitCode === 0,
        exitCode,
        narrative: steps.join(" "),
        ...details,
    });
    // Any failure to access the manifest path is treated as "manifest missing".
    const manifestPresent = async () => {
        try {
            await fs.access(workspace.manifestPath);
            return true;
        }
        catch {
            return false;
        }
    };
    try {
        // Step 1: workspace directory.
        await fs.mkdir(workspace.evalaiDir, { recursive: true });
        steps.push("✅ workspace ok");
        // Step 2: manifest, built on demand.
        if (await manifestPresent()) {
            steps.push("→ manifest ok");
        }
        else {
            console.log("📋 Building evaluation manifest...");
            // NOTE(review): discovery is invoked without projectRoot — confirm it
            // resolves the same workspace when projectRoot is not the cwd.
            await (0, discover_1.discoverSpecs)({ manifest: true });
            steps.push("→ manifest built");
        }
        // Step 3: impact analysis; its count is only used for the narrative here.
        if (options.impactedOnly) {
            const impact = await (0, impact_analysis_1.runImpactAnalysis)({
                baseBranch: options.base || "main",
            }, projectRoot);
            steps.push(`→ impacted specs ${impact.metadata.impactedCount}`);
        }
        else {
            steps.push("→ running all specs");
        }
        // Step 4: evaluations. Results are written unless the caller explicitly
        // passed a non-nullish `writeResults`.
        const runResult = await (0, run_1.runEvaluations)({
            impactedOnly: options.impactedOnly,
            baseBranch: options.base,
            writeResults: options.writeResults ?? true,
        }, projectRoot);
        steps.push(`→ runId ${runResult.runId}`);
        // Step 5: diff against the base, only when one was requested.
        let diffResult;
        if (!options.base) {
            steps.push("→ no diff");
        }
        else {
            diffResult = await (0, diff_1.runDiff)({
                base: options.base,
                head: "last",
            });
            const regressionCount = diffResult.summary.regressions;
            if (regressionCount > 0) {
                // Regressions short-circuit: reported before run failures are looked at.
                steps.push(`→ diff ${regressionCount} regressions`);
                return finish(1, { runResult, diffResult });
            }
            steps.push("→ diff clean");
        }
        // Step 6: failed specs turn an otherwise clean run into exit code 1.
        const exitCode = runResult.summary.failed > 0 ? 1 : 0;
        return finish(exitCode, { runResult, diffResult });
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        // Emit the debugging hint right away; the caller gets the message back too.
        printNextStep(errorMessage, options, workspace);
        // Exit code 2 marks a thrown step (config/infra issue) as opposed to failing evals.
        return finish(2, { error: errorMessage });
    }
}
148
+ /**
149
+ * Print copy/paste debug flow
150
+ */
151
+ function printNextStep(error, options, workspace) {
152
+ console.log("\n🔧 Next step for debugging:");
153
+ if (error.includes("No evaluation manifest found")) {
154
+ console.log(" evalai discover --manifest");
155
+ }
156
+ else if (error.includes("Base run report not found in CI environment")) {
157
+ console.log(` Download base artifact and run: evalai diff --base .evalai/base-run.json --head ${workspace.lastRunPath}`);
158
+ }
159
+ else if (options.base && error.includes("Base run report not found")) {
160
+ console.log(` evalai explain --report ${workspace.lastRunPath}`);
161
+ }
162
+ else {
163
+ console.log(` evalai explain --report ${workspace.lastRunPath}`);
164
+ }
165
+ console.log(` Artifacts: ${workspace.runsDir}/`);
166
+ }
167
/**
 * Command-line wrapper around runCI: prints the narrative, prints run and diff
 * figures when the run was not successful, prints the error message if there
 * is one, then terminates the process with the result's exit code.
 *
 * @param options CI options, forwarded unchanged to runCI.
 */
async function runCICLI(options) {
    const outcome = await runCI(options);
    // Rates are multiplied by 100 and shown with one decimal place.
    const asPercent = (rate) => (rate * 100).toFixed(1);
    console.log(`🤖 ${outcome.narrative}`);
    if (!outcome.success) {
        // Detail sections appear only for unsuccessful runs, and only for the
        // parts of the pipeline that actually produced a result.
        if (outcome.runResult) {
            console.log("\n📊 Run Results:");
            const runSummary = outcome.runResult.summary;
            console.log(` ✅ Passed: ${runSummary.passed}`);
            console.log(` ❌ Failed: ${runSummary.failed}`);
            console.log(` 📊 Pass Rate: ${asPercent(runSummary.passRate)}%`);
        }
        if (outcome.diffResult) {
            console.log("\n🔄 Diff Results:");
            const diffSummary = outcome.diffResult.summary;
            console.log(` 📉 Regressions: ${diffSummary.regressions}`);
            console.log(` 📈 Improvements: ${diffSummary.improvements}`);
            console.log(` 📊 Pass Rate Delta: ${asPercent(diffSummary.passRateDelta)}%`);
        }
    }
    if (outcome.error) {
        console.log(`\n❌ Error: ${outcome.error}`);
    }
    // Hand the pipeline's verdict to the calling shell / CI runner.
    process.exit(outcome.exitCode);
}
@@ -43,7 +43,11 @@ exports.mergeConfigWithArgs = mergeConfigWithArgs;
43
43
  const fs = __importStar(require("node:fs"));
44
44
  const path = __importStar(require("node:path"));
45
45
  const profiles_1 = require("./profiles");
46
- const CONFIG_FILES = ["evalai.config.json", "evalai.config.js", "evalai.config.cjs"];
46
+ const CONFIG_FILES = [
47
+ "evalai.config.json",
48
+ "evalai.config.js",
49
+ "evalai.config.cjs",
50
+ ];
47
51
  /**
48
52
  * Find config file path in directory, walking up to root
49
53
  */
@@ -113,7 +117,11 @@ function loadConfig(cwd = process.cwd()) {
113
117
  }
114
118
  for (const key of Object.keys(config.packages)) {
115
119
  if (relNorm === key || relNorm.startsWith(`${key}/`)) {
116
- return { ...config, ...config.packages[key], packages: config.packages };
120
+ return {
121
+ ...config,
122
+ ...config.packages[key],
123
+ packages: config.packages,
124
+ };
117
125
  }
118
126
  }
119
127
  }
@@ -156,11 +164,14 @@ function mergeConfigWithArgs(config, args) {
156
164
  merged.minScore = profile.minScore;
157
165
  if (merged.maxDrop === undefined && args.maxDrop === undefined)
158
166
  merged.maxDrop = profile.maxDrop;
159
- if (merged.warnDrop === undefined && args.warnDrop === undefined && "warnDrop" in profile)
167
+ if (merged.warnDrop === undefined &&
168
+ args.warnDrop === undefined &&
169
+ "warnDrop" in profile)
160
170
  merged.warnDrop = profile.warnDrop;
161
171
  if (merged.minN === undefined && args.minN === undefined)
162
172
  merged.minN = profile.minN;
163
- if (merged.allowWeakEvidence === undefined && args.allowWeakEvidence === undefined)
173
+ if (merged.allowWeakEvidence === undefined &&
174
+ args.allowWeakEvidence === undefined)
164
175
  merged.allowWeakEvidence = profile.allowWeakEvidence;
165
176
  }
166
177
  // Args override
@@ -172,18 +183,27 @@ function mergeConfigWithArgs(config, args) {
172
183
  }
173
184
  if (args.minScore !== undefined) {
174
185
  merged.minScore =
175
- typeof args.minScore === "number" ? args.minScore : parseInt(String(args.minScore), 10);
186
+ typeof args.minScore === "number"
187
+ ? args.minScore
188
+ : parseInt(String(args.minScore), 10);
176
189
  }
177
190
  if (args.maxDrop !== undefined) {
178
191
  merged.maxDrop =
179
- typeof args.maxDrop === "number" ? args.maxDrop : parseInt(String(args.maxDrop), 10);
192
+ typeof args.maxDrop === "number"
193
+ ? args.maxDrop
194
+ : parseInt(String(args.maxDrop), 10);
180
195
  }
181
196
  if (args.warnDrop !== undefined) {
182
197
  merged.warnDrop =
183
- typeof args.warnDrop === "number" ? args.warnDrop : parseInt(String(args.warnDrop), 10);
198
+ typeof args.warnDrop === "number"
199
+ ? args.warnDrop
200
+ : parseInt(String(args.warnDrop), 10);
184
201
  }
185
202
  if (args.minN !== undefined) {
186
- merged.minN = typeof args.minN === "number" ? args.minN : parseInt(String(args.minN), 10);
203
+ merged.minN =
204
+ typeof args.minN === "number"
205
+ ? args.minN
206
+ : parseInt(String(args.minN), 10);
187
207
  }
188
208
  if (args.allowWeakEvidence !== undefined) {
189
209
  merged.allowWeakEvidence =
@@ -0,0 +1,173 @@
1
+ /**
2
+ * TICKET 5 — Behavioral Diff CLI (EVAL-401)
3
+ *
4
+ * Goal: "Git diff for AI behavior" from two RunReports
5
+ *
6
+ * Command:
7
+ * evalai diff --base main (default uses git to find baseline run)
8
+ * evalai diff --a <runReportPath> --b <runReportPath>
9
+ * evalai diff main..feature (nice-to-have alias)
10
+ */
11
+ import type { RunResult } from "./run";
12
+ /**
13
+ * Diff schema version
14
+ */
15
+ export declare const DIFF_SCHEMA_VERSION = 1;
16
+ /**
17
+ * Supported RunReport schema versions
18
+ */
19
+ export declare const SUPPORTED_SCHEMA_VERSIONS: readonly [1];
20
+ /**
21
+ * Rounding helpers for floating point normalization
22
+ */
23
+ export declare function round(value: number, precision?: number): number;
24
+ export declare function roundPct(value: number, precision?: number): number;
25
+ /**
26
+ * Validate RunReport schema version
27
+ */
28
+ export declare function validateSchemaVersion(report: RunResult): void;
29
+ /**
30
+ * Diff result classification: the label stored in `SpecDiff.classification`
+ * describing how one spec changed between the base and head runs.
+ * NOTE(review): which of these labels are counted under `DiffSummary.regressions`
+ * versus `DiffSummary.improvements` is not visible in this declaration file - confirm in diff.js.
31
+ */
32
+ export type DiffClassification = "new_failure" | "fixed_failure" | "score_drop" | "score_improve" | "execution_error" | "skipped_change" | "added" | "removed";
33
+ /**
34
+ * Individual spec diff: one entry of `DiffResult.changedSpecs`, describing a single
+ * spec's outcome in the base run, in the head run, and the deltas between them.
+ * `base` and `head` are both optional and share the same shape.
+ * NOTE(review): presumably one side is absent for "added" / "removed" specs - confirm.
35
+ */
36
+ export interface SpecDiff {
37
+ /** Spec identifier */
38
+ specId: string;
39
+ /** Spec name */
40
+ name: string;
41
+ /** File path (presumably of the spec's source file - confirm) */
42
+ filePath: string;
43
+ /** Classification of change (one of the DiffClassification labels) */
44
+ classification: DiffClassification;
45
+ /** Base run result (if exists); `score` and `error` are optional within it */
46
+ base?: {
47
+ status: "passed" | "failed" | "skipped";
48
+ score?: number;
49
+ duration: number;
50
+ error?: string;
51
+ };
52
+ /** Head run result (if exists); same shape as `base` */
53
+ head?: {
54
+ status: "passed" | "failed" | "skipped";
55
+ score?: number;
56
+ duration: number;
57
+ error?: string;
58
+ };
59
+ /** Calculated deltas; every field is optional. TODO confirm direction (head minus base?) and the format of `statusChange` */
60
+ deltas: {
61
+ scoreDelta?: number;
62
+ durationDelta?: number;
63
+ statusChange?: string;
64
+ };
65
+ }
66
+ /**
67
+ * Diff summary statistics: aggregate counts and deltas for a whole diff,
+ * exposed as `DiffResult.summary`.
68
+ */
69
+ export interface DiffSummary {
70
+ /** Total specs in base */
71
+ baseTotal: number;
72
+ /** Total specs in head */
73
+ headTotal: number;
74
+ /** Pass rate delta. The `evalai ci` output multiplies this by 100 and appends "%", so presumably a 0-1 fraction - confirm */
75
+ passRateDelta: number;
76
+ /** Score delta (average) */
77
+ scoreDelta: number;
78
+ /** Number of regressions. `evalai ci` returns exit code 1 when this is greater than 0 */
79
+ regressions: number;
80
+ /** Number of improvements */
81
+ improvements: number;
82
+ /** Number of added specs */
83
+ added: number;
84
+ /** Number of removed specs */
85
+ removed: number;
86
+ }
87
+ /**
88
+ * Complete diff result: the value resolved by `runDiff` and returned by `compareReports`.
+ * Carries both full run reports alongside the summary and the per-spec entries.
89
+ */
90
+ export interface DiffResult {
91
+ /** Schema version (presumably DIFF_SCHEMA_VERSION - confirm) */
92
+ schemaVersion: number;
93
+ /** Base run report */
94
+ base: RunResult;
95
+ /** Head run report */
96
+ head: RunResult;
97
+ /** Diff summary */
98
+ summary: DiffSummary;
99
+ /** Individual spec diffs */
100
+ changedSpecs: SpecDiff[];
101
+ /** Diff metadata. TODO confirm: `generatedAt` looks like a numeric timestamp; the `*Source` strings presumably record where each report was loaded from */
102
+ metadata: {
103
+ generatedAt: number;
104
+ baseSource: string;
105
+ headSource: string;
106
+ };
107
+ }
108
+ /**
109
+ * Diff options accepted by `runDiff` and `runDiffCLI`. All fields are optional.
110
+ */
111
+ export interface DiffOptions {
112
+ /** Base report path or branch */
113
+ base?: string;
114
+ /** Head report path. The `evalai ci` command passes the literal "last" here, so a keyword is accepted as well as a path */
115
+ head?: string;
116
+ /** Output format: "human" or "json" */
117
+ format?: "human" | "json";
118
+ }
119
+ /**
120
+ * Run diff comparison
121
+ */
122
+ export declare function runDiff(options: DiffOptions): Promise<DiffResult>;
123
+ /**
124
+ * Compare two run reports
125
+ */
126
+ export declare function compareReports(base: RunResult, head: RunResult): DiffResult;
127
+ /**
128
+ * Classify the type of change
129
+ */
130
+ declare function classifyDiff(base?: RunResult["results"][0], head?: RunResult["results"][0]): DiffClassification;
131
+ /**
132
+ * Calculate deltas between base and head
133
+ */
134
+ declare function calculateDeltas(base?: RunResult["results"][0], head?: RunResult["results"][0]): SpecDiff["deltas"];
135
+ /**
136
+ * Calculate diff summary statistics
137
+ */
138
+ export declare function calculateDiffSummary(base: RunResult, head: RunResult, changedSpecs: SpecDiff[]): DiffSummary;
139
+ /**
140
+ * Print human-readable diff results
141
+ */
142
+ export declare function printHumanResults(result: DiffResult): void;
143
+ /**
144
+ * Print JSON results
145
+ */
146
+ export declare function printJsonResults(result: DiffResult): void;
147
+ /**
148
+ * Write GitHub Step Summary
149
+ */
150
+ export declare function writeGitHubStepSummary(result: DiffResult): Promise<void>;
151
+ /**
152
+ * CLI entry point
153
+ */
154
+ export declare function runDiffCLI(options: DiffOptions): Promise<void>;
155
+ export { classifyDiff, calculateDeltas };
156
/**
 * Read-only bundle exposing four of this module's functions under alternative names.
 * Each member is typed as `typeof` the corresponding declared function.
 */
+ export declare const diffCore: {
157
+ /**
158
+ * Compare two run reports and return diff result
+ * (typed as `typeof compareReports`)
159
+ */
160
+ readonly diffRunReports: typeof compareReports;
161
+ /**
162
+ * Classify the type of change between two specs
+ * (typed as `typeof classifyDiff`)
163
+ */
164
+ readonly classifyChange: typeof classifyDiff;
165
+ /**
166
+ * Calculate summary statistics for a diff
+ * (typed as `typeof calculateDiffSummary`)
167
+ */
168
+ readonly summarizeDiff: typeof calculateDiffSummary;
169
+ /**
170
+ * Calculate deltas between two spec results
+ * (typed as the module-level `calculateDeltas`)
171
+ */
172
+ readonly calculateDeltas: typeof calculateDeltas;
173
+ };