@pauly4010/evalai-sdk 1.5.7 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -1
- package/README.md +12 -3
- package/dist/assertions.d.ts +11 -11
- package/dist/assertions.js +1 -1
- package/dist/batch.d.ts +3 -3
- package/dist/batch.js +1 -1
- package/dist/cache.d.ts +3 -3
- package/dist/cache.js +1 -1
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/formatters/github.js +1 -1
- package/dist/cli/formatters/human.js +1 -1
- package/dist/cli/formatters/pr-comment.js +1 -1
- package/dist/cli/index.js +20 -4
- package/dist/cli/regression-gate.d.ts +11 -0
- package/dist/cli/regression-gate.js +150 -0
- package/dist/client.d.ts +3 -3
- package/dist/client.js +3 -2
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +157 -0
- package/dist/context.d.ts +4 -4
- package/dist/context.js +1 -1
- package/dist/errors.d.ts +5 -5
- package/dist/errors.js +21 -24
- package/dist/export.d.ts +1 -1
- package/dist/export.js +4 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +7 -1
- package/dist/integrations/openai-eval.js +1 -1
- package/dist/logger.d.ts +10 -10
- package/dist/pagination.d.ts +2 -2
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/snapshot.d.ts +3 -3
- package/dist/streaming.d.ts +4 -4
- package/dist/testing.d.ts +1 -1
- package/dist/types.d.ts +33 -33
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.d.ts +29 -18
- package/package.json +7 -3
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,51 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.6.0] - 2026-02-24
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
#### CLI — Regression Gate & Baseline Management
|
|
13
|
+
|
|
14
|
+
- **`evalai baseline init`** — Create a starter `evals/baseline.json` with sample values and provenance metadata
|
|
15
|
+
- **`evalai baseline update`** — Run confidence tests, golden eval, and latency benchmark, then update baseline with real scores
|
|
16
|
+
- **`evalai gate`** — Run the local regression gate with proper exit code taxonomy (0=pass, 1=regression, 2=infra_error, 3=confidence_failed, 4=confidence_missing)
|
|
17
|
+
- **`evalai gate --format json`** — Output `evals/regression-report.json` as machine-readable JSON to stdout
|
|
18
|
+
- **`evalai gate --format github`** — Output GitHub Step Summary markdown with delta table
|
|
19
|
+
|
|
20
|
+
#### SDK Exports — Regression Gate Constants & Types
|
|
21
|
+
|
|
22
|
+
- **`GATE_EXIT`** — Exit code constants (`PASS`, `REGRESSION`, `INFRA_ERROR`, `CONFIDENCE_FAILED`, `CONFIDENCE_MISSING`)
|
|
23
|
+
- **`GATE_CATEGORY`** — Report category constants (`pass`, `regression`, `infra_error`)
|
|
24
|
+
- **`REPORT_SCHEMA_VERSION`** — Current schema version for `regression-report.json`
|
|
25
|
+
- **`ARTIFACTS`** — Well-known artifact paths (`BASELINE`, `REGRESSION_REPORT`, `CONFIDENCE_SUMMARY`, `LATENCY_BENCHMARK`)
|
|
26
|
+
- **Types**: `RegressionReport`, `RegressionDelta`, `Baseline`, `BaselineTolerance`, `GateExitCode`, `GateCategory`
|
|
27
|
+
- **Subpath export**: `@pauly4010/evalai-sdk/regression` for tree-shakeable imports
|
|
28
|
+
|
|
29
|
+
### 🔧 Changed
|
|
30
|
+
|
|
31
|
+
- CLI help text updated to include `baseline` and `gate` commands
|
|
32
|
+
- SDK becomes the public contract for regression gate — scripts are implementation detail
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## [1.5.8] - 2026-02-22
|
|
37
|
+
|
|
38
|
+
### 🐛 Fixed
|
|
39
|
+
|
|
40
|
+
- **secureRoute TypeScript overload compatibility** — Fixed implementation signature to use `ctx: any` for proper overload compatibility
|
|
41
|
+
- **Test infrastructure fixes** — Replaced invalid `expect.unknown()` with `expect.any()` across test files
|
|
42
|
+
- **NextRequest constructor** — Fixed test mocks using incorrect `(NextRequest as any)()` syntax
|
|
43
|
+
- **304 response handling** — Fixed exports API returning invalid 304 response with body
|
|
44
|
+
- **Error catalog tests** — Updated test expectations to match actual EvalAIError behavior
|
|
45
|
+
- **Redis cache timeout** — Added explicit timeout to prevent test hangs
|
|
46
|
+
|
|
47
|
+
### 🔧 Changed
|
|
48
|
+
|
|
49
|
+
- **Biome formatting** — Applied consistent line endings across 199 files
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
8
53
|
## [1.5.7] - 2026-02-20
|
|
9
54
|
|
|
10
55
|
### 📚 Documentation
|
|
@@ -32,7 +77,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
32
77
|
- **`--warnDrop <n>`** — Introduce a WARN band when score drops > `warnDrop` but < `maxDrop`
|
|
33
78
|
- **Gate verdicts:** PASS, WARN, FAIL
|
|
34
79
|
- **Profiles:** `strict` (warnDrop: 0), `balanced` (warnDrop: 1), `fast` (warnDrop: 2)
|
|
35
|
-
- **`--fail-on-flake`** — Fail the gate if
|
|
80
|
+
- **`--fail-on-flake`** — Fail the gate if any case is flagged as flaky (partial pass rate across determinism runs)
|
|
36
81
|
|
|
37
82
|
#### Determinism & flake intelligence
|
|
38
83
|
|
package/README.md
CHANGED
|
@@ -99,7 +99,7 @@ Key flags
|
|
|
99
99
|
|
|
100
100
|
--maxDrop → hard regression fail
|
|
101
101
|
|
|
102
|
-
--fail-on-flake → fail if
|
|
102
|
+
--fail-on-flake → fail if any test is unstable
|
|
103
103
|
|
|
104
104
|
This lets teams tune signal vs noise in CI.
|
|
105
105
|
|
|
@@ -190,7 +190,7 @@ Option Description
|
|
|
190
190
|
--allowWeakEvidence Permit weak evidence
|
|
191
191
|
--policy <name> HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511
|
|
192
192
|
--baseline <mode> published, previous, production
|
|
193
|
-
--fail-on-flake Fail if
|
|
193
|
+
--fail-on-flake Fail if any case is flaky
|
|
194
194
|
--baseUrl <url> Override API base URL
|
|
195
195
|
|
|
196
196
|
Exit codes
|
|
@@ -257,7 +257,16 @@ await openai.chat.completions.create({
|
|
|
257
257
|
|
|
258
258
|
|
|
259
259
|
🧭 Changelog
|
|
260
|
-
v1.5.
|
|
260
|
+
v1.5.8 (Latest)
|
|
261
|
+
Fixed secureRoute TypeScript overload compatibility
|
|
262
|
+
|
|
263
|
+
Fixed test infrastructure (expect.any, NextRequest constructor)
|
|
264
|
+
|
|
265
|
+
Fixed 304 response handling in exports API
|
|
266
|
+
|
|
267
|
+
Improved error catalog test coverage
|
|
268
|
+
|
|
269
|
+
v1.5.7
|
|
261
270
|
Documentation updates for CJS compatibility
|
|
262
271
|
|
|
263
272
|
Version alignment across README and changelog
|
package/dist/assertions.d.ts
CHANGED
|
@@ -17,26 +17,26 @@
|
|
|
17
17
|
export interface AssertionResult {
|
|
18
18
|
name: string;
|
|
19
19
|
passed: boolean;
|
|
20
|
-
expected:
|
|
21
|
-
actual:
|
|
20
|
+
expected: unknown;
|
|
21
|
+
actual: unknown;
|
|
22
22
|
message?: string;
|
|
23
23
|
}
|
|
24
24
|
export declare class AssertionError extends Error {
|
|
25
|
-
expected:
|
|
26
|
-
actual:
|
|
27
|
-
constructor(message: string, expected:
|
|
25
|
+
expected: unknown;
|
|
26
|
+
actual: unknown;
|
|
27
|
+
constructor(message: string, expected: unknown, actual: unknown);
|
|
28
28
|
}
|
|
29
29
|
/**
|
|
30
30
|
* Fluent assertion builder
|
|
31
31
|
*/
|
|
32
32
|
export declare class Expectation {
|
|
33
33
|
private value;
|
|
34
|
-
constructor(value:
|
|
34
|
+
constructor(value: unknown);
|
|
35
35
|
/**
|
|
36
36
|
* Assert value equals expected
|
|
37
37
|
* @example expect(output).toEqual("Hello")
|
|
38
38
|
*/
|
|
39
|
-
toEqual(expected:
|
|
39
|
+
toEqual(expected: unknown, message?: string): AssertionResult;
|
|
40
40
|
/**
|
|
41
41
|
* Assert value contains substring
|
|
42
42
|
* @example expect(output).toContain("help")
|
|
@@ -71,7 +71,7 @@ export declare class Expectation {
|
|
|
71
71
|
* Assert JSON matches schema
|
|
72
72
|
* @example expect(output).toMatchJSON({ status: 'success' })
|
|
73
73
|
*/
|
|
74
|
-
toMatchJSON(schema: Record<string,
|
|
74
|
+
toMatchJSON(schema: Record<string, unknown>, message?: string): AssertionResult;
|
|
75
75
|
/**
|
|
76
76
|
* Assert value has expected sentiment
|
|
77
77
|
* @example expect(output).toHaveSentiment('positive')
|
|
@@ -148,7 +148,7 @@ export declare class Expectation {
|
|
|
148
148
|
* expect(output).toHaveLength({ min: 10, max: 100 });
|
|
149
149
|
* ```
|
|
150
150
|
*/
|
|
151
|
-
export declare function expect(value:
|
|
151
|
+
export declare function expect(value: unknown): Expectation;
|
|
152
152
|
/**
|
|
153
153
|
* Run multiple assertions and collect results
|
|
154
154
|
*
|
|
@@ -178,12 +178,12 @@ export declare function withinRange(value: number, min: number, max: number): bo
|
|
|
178
178
|
export declare function isValidEmail(email: string): boolean;
|
|
179
179
|
export declare function isValidURL(url: string): boolean;
|
|
180
180
|
export declare function hasNoHallucinations(text: string, groundTruth: string[]): boolean;
|
|
181
|
-
export declare function matchesSchema(value:
|
|
181
|
+
export declare function matchesSchema(value: unknown, schema: Record<string, unknown>): boolean;
|
|
182
182
|
export declare function hasReadabilityScore(text: string, minScore: number): boolean;
|
|
183
183
|
export declare function containsLanguage(text: string, language: string): boolean;
|
|
184
184
|
export declare function hasFactualAccuracy(text: string, facts: string[]): boolean;
|
|
185
185
|
export declare function respondedWithinTime(startTime: number, maxMs: number): boolean;
|
|
186
186
|
export declare function hasNoToxicity(text: string): boolean;
|
|
187
187
|
export declare function followsInstructions(text: string, instructions: string[]): boolean;
|
|
188
|
-
export declare function containsAllRequiredFields(obj:
|
|
188
|
+
export declare function containsAllRequiredFields(obj: unknown, requiredFields: string[]): boolean;
|
|
189
189
|
export declare function hasValidCodeSyntax(code: string, language: string): boolean;
|
package/dist/assertions.js
CHANGED
|
@@ -612,7 +612,7 @@ function followsInstructions(text, instructions) {
|
|
|
612
612
|
});
|
|
613
613
|
}
|
|
614
614
|
function containsAllRequiredFields(obj, requiredFields) {
|
|
615
|
-
return requiredFields.every((field) => field in obj);
|
|
615
|
+
return requiredFields.every((field) => obj && typeof obj === "object" && field in obj);
|
|
616
616
|
}
|
|
617
617
|
function hasValidCodeSyntax(code, language) {
|
|
618
618
|
// This is a simplified implementation
|
package/dist/batch.d.ts
CHANGED
|
@@ -6,13 +6,13 @@ export interface BatchRequest {
|
|
|
6
6
|
id: string;
|
|
7
7
|
method: string;
|
|
8
8
|
endpoint: string;
|
|
9
|
-
body?:
|
|
9
|
+
body?: unknown;
|
|
10
10
|
headers?: Record<string, string>;
|
|
11
11
|
}
|
|
12
12
|
export interface BatchResponse {
|
|
13
13
|
id: string;
|
|
14
14
|
status: number;
|
|
15
|
-
data?:
|
|
15
|
+
data?: unknown;
|
|
16
16
|
error?: string;
|
|
17
17
|
}
|
|
18
18
|
/**
|
|
@@ -32,7 +32,7 @@ export declare class RequestBatcher {
|
|
|
32
32
|
/**
|
|
33
33
|
* Add request to batch queue
|
|
34
34
|
*/
|
|
35
|
-
enqueue(method: string, endpoint: string, body?:
|
|
35
|
+
enqueue(method: string, endpoint: string, body?: unknown, headers?: Record<string, string>): Promise<unknown>;
|
|
36
36
|
/**
|
|
37
37
|
* Schedule batch processing after delay
|
|
38
38
|
*/
|
package/dist/batch.js
CHANGED
|
@@ -85,7 +85,7 @@ class RequestBatcher {
|
|
|
85
85
|
}
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
|
-
// Handle
|
|
88
|
+
// Handle any requests that didn't get a response
|
|
89
89
|
for (const item of batch) {
|
|
90
90
|
if (!responses.find((r) => r.id === item.id)) {
|
|
91
91
|
item.reject(new Error("No response received for request"));
|
package/dist/cache.d.ts
CHANGED
|
@@ -17,15 +17,15 @@ export declare class RequestCache {
|
|
|
17
17
|
/**
|
|
18
18
|
* Get cached response if valid
|
|
19
19
|
*/
|
|
20
|
-
get<T>(method: string, url: string, params?:
|
|
20
|
+
get<T>(method: string, url: string, params?: unknown): T | null;
|
|
21
21
|
/**
|
|
22
22
|
* Store response in cache
|
|
23
23
|
*/
|
|
24
|
-
set<T>(method: string, url: string, data: T, ttl: number, params?:
|
|
24
|
+
set<T>(method: string, url: string, data: T, ttl: number, params?: unknown): void;
|
|
25
25
|
/**
|
|
26
26
|
* Invalidate specific cache entry
|
|
27
27
|
*/
|
|
28
|
-
invalidate(method: string, url: string, params?:
|
|
28
|
+
invalidate(method: string, url: string, params?: unknown): void;
|
|
29
29
|
/**
|
|
30
30
|
* Invalidate all cache entries matching a pattern
|
|
31
31
|
*/
|
package/dist/cache.js
CHANGED
|
@@ -69,7 +69,7 @@ class RequestCache {
|
|
|
69
69
|
* Invalidate all cache entries matching a pattern
|
|
70
70
|
*/
|
|
71
71
|
invalidatePattern(pattern) {
|
|
72
|
-
for (const key of this.cache.keys()) {
|
|
72
|
+
for (const key of Array.from(this.cache.keys())) {
|
|
73
73
|
if (key.includes(pattern)) {
|
|
74
74
|
this.cache.delete(key);
|
|
75
75
|
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalai baseline — Baseline management commands
|
|
3
|
+
*
|
|
4
|
+
* Subcommands:
|
|
5
|
+
* evalai baseline init — Create a starter evals/baseline.json
|
|
6
|
+
* evalai baseline update — Run tests + update baseline with real scores
|
|
7
|
+
*/
|
|
8
|
+
export declare function runBaselineInit(cwd: string): number;
|
|
9
|
+
export declare function runBaselineUpdate(cwd: string): number;
|
|
10
|
+
export declare function runBaseline(argv: string[]): number;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalai baseline — Baseline management commands
|
|
4
|
+
*
|
|
5
|
+
* Subcommands:
|
|
6
|
+
* evalai baseline init — Create a starter evals/baseline.json
|
|
7
|
+
* evalai baseline update — Run tests + update baseline with real scores
|
|
8
|
+
*/
|
|
9
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
12
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
13
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
14
|
+
}
|
|
15
|
+
Object.defineProperty(o, k2, desc);
|
|
16
|
+
}) : (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
o[k2] = m[k];
|
|
19
|
+
}));
|
|
20
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
21
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
22
|
+
}) : function(o, v) {
|
|
23
|
+
o["default"] = v;
|
|
24
|
+
});
|
|
25
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
26
|
+
var ownKeys = function(o) {
|
|
27
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
28
|
+
var ar = [];
|
|
29
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
30
|
+
return ar;
|
|
31
|
+
};
|
|
32
|
+
return ownKeys(o);
|
|
33
|
+
};
|
|
34
|
+
return function (mod) {
|
|
35
|
+
if (mod && mod.__esModule) return mod;
|
|
36
|
+
var result = {};
|
|
37
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
38
|
+
__setModuleDefault(result, mod);
|
|
39
|
+
return result;
|
|
40
|
+
};
|
|
41
|
+
})();
|
|
42
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
43
|
+
exports.runBaselineInit = runBaselineInit;
|
|
44
|
+
exports.runBaselineUpdate = runBaselineUpdate;
|
|
45
|
+
exports.runBaseline = runBaseline;
|
|
46
|
+
const node_child_process_1 = require("node:child_process");
|
|
47
|
+
const fs = __importStar(require("node:fs"));
|
|
48
|
+
const path = __importStar(require("node:path"));
|
|
49
|
+
const BASELINE_REL = "evals/baseline.json";
|
|
50
|
+
/**
 * Choose the package manager for a project from the lockfile present in `cwd`.
 *
 * Lockfiles are probed in priority order: `pnpm-lock.yaml` selects pnpm,
 * then `yarn.lock` selects yarn. When neither file exists, npm is assumed.
 *
 * @param {string} cwd - Directory to look for lockfiles in.
 * @returns {"pnpm" | "yarn" | "npm"} Name of the package manager to invoke.
 */
function detectPackageManager(cwd) {
    // Ordered [lockfile, manager] pairs; the first lockfile found wins.
    const lockfileOwners = [
        ["pnpm-lock.yaml", "pnpm"],
        ["yarn.lock", "yarn"],
    ];
    for (const [lockfile, manager] of lockfileOwners) {
        const candidate = path.join(cwd, lockfile);
        if (fs.existsSync(candidate)) {
            return manager;
        }
    }
    return "npm";
}
|
|
58
|
+
/**
 * Run a package.json script through the project's detected package manager.
 *
 * The child process inherits this process's stdio, so its output is written
 * directly to the caller's terminal. Failures to start the child (for example
 * the package manager binary is not on PATH) and signal terminations are
 * reported on stderr instead of being collapsed into a silent exit code.
 *
 * @param {string} cwd - Project root; used for lockfile detection and as the
 *   child's working directory.
 * @param {string} scriptName - Name of the package.json script to run.
 * @returns {number} The child's exit status, or 1 when no status is available
 *   (the process could not be spawned or was ended by a signal).
 */
function runScript(cwd, scriptName) {
    const pm = detectPackageManager(cwd);
    // A shell is only used on Windows — presumably because package managers
    // are installed there as .cmd shims; confirm before changing.
    const isWin = process.platform === "win32";
    const result = (0, node_child_process_1.spawnSync)(pm, ["run", scriptName], {
        cwd,
        stdio: "inherit",
        shell: isWin,
    });
    if (result.error) {
        // spawnSync reports spawn failures via `error` rather than throwing.
        console.error(`❌ Failed to run '${pm} run ${scriptName}': ${result.error.message}`);
    }
    else if (result.status === null && result.signal) {
        console.error(`❌ '${pm} run ${scriptName}' was terminated by signal ${result.signal}`);
    }
    return result.status ?? 1;
}
|
|
69
|
+
/**
 * Create a starter baseline file at `<cwd>/evals/baseline.json`.
 *
 * The file is filled with sample values (not measured ones) plus provenance
 * fields taken from the environment (`USER`/`USERNAME`) and the current time.
 * An existing baseline is never overwritten: the file is written with the
 * exclusive-create flag, so a baseline that appears between the existence
 * check and the write is reported the same way as one that already existed.
 *
 * @param {string} cwd - Project root directory.
 * @returns {number} 0 when the file was created, 1 when a baseline already exists.
 * @throws Filesystem errors other than "already exists" propagate to the caller.
 */
function runBaselineInit(cwd) {
    const baselinePath = path.join(cwd, BASELINE_REL);
    const reportExisting = () => {
        console.log(`⚠ ${BASELINE_REL} already exists. Delete it first or use 'evalai baseline update'.`);
        return 1;
    };
    if (fs.existsSync(baselinePath)) {
        return reportExisting();
    }
    // `recursive: true` makes this a no-op when evals/ is already there.
    fs.mkdirSync(path.join(cwd, "evals"), { recursive: true });
    const user = process.env.USER || process.env.USERNAME || "unknown";
    const now = new Date().toISOString();
    const baseline = {
        schemaVersion: 1,
        description: "Regression gate baseline — created by evalai baseline init",
        generatedAt: now,
        generatedBy: user,
        // Looks like a placeholder SHA for a freshly initialised baseline — TODO confirm.
        commitSha: "0000000",
        updatedAt: now,
        updatedBy: user,
        tolerance: {
            scoreDrop: 5,
            passRateDrop: 5,
            maxLatencyIncreaseMs: 200,
            maxCostIncreaseUsd: 0.05,
        },
        goldenEval: {
            score: 100,
            passRate: 100,
            totalCases: 3,
            passedCases: 3,
        },
        qualityScore: {
            overall: 90,
            grade: "A",
            accuracy: 85,
            safety: 100,
            latency: 90,
            cost: 90,
            consistency: 90,
        },
        confidenceTests: {
            unitPassed: true,
            unitTotal: 0,
            dbPassed: true,
            dbTotal: 0,
        },
        productMetrics: {},
    };
    try {
        // "wx" fails with EEXIST instead of clobbering a file created after the check above.
        fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`, { flag: "wx" });
    }
    catch (err) {
        if (err?.code === "EEXIST") {
            return reportExisting();
        }
        throw err;
    }
    console.log(`✅ Created ${BASELINE_REL} with sample values\n`);
    console.log("Next steps:");
    console.log(` 1. Commit ${BASELINE_REL} to your repo`);
    console.log(" 2. Run 'evalai baseline update' to populate with real scores");
    console.log(" 3. Run 'evalai gate' to verify the regression gate\n");
    return 0;
}
|
|
127
|
+
// ── baseline update ──
|
|
128
|
+
/**
 * Update the baseline by running the project's `eval:baseline-update` script.
 *
 * Validates that `<cwd>/package.json` exists, can be read and parsed, and
 * declares an `eval:baseline-update` script before delegating to `runScript`.
 * A package.json whose top-level value is not an object (e.g. `null`) is
 * treated as having no scripts rather than crashing.
 *
 * @param {string} cwd - Project root directory.
 * @returns {number} 1 when any precondition fails; otherwise the exit code
 *   returned by `runScript`.
 */
function runBaselineUpdate(cwd) {
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch (err) {
        console.error("❌ Failed to parse package.json");
        if (err instanceof Error) {
            // Surface the underlying read/parse reason so the user can fix it.
            console.error(`   ${err.message}`);
        }
        return 1;
    }
    // `pkg?.` guards against valid JSON that is not an object, such as `null`.
    if (!pkg?.scripts?.["eval:baseline-update"]) {
        console.error("❌ Missing 'eval:baseline-update' script in package.json.");
        console.error(" Add it: \"eval:baseline-update\": \"npx tsx scripts/regression-gate.ts --update-baseline\"");
        return 1;
    }
    console.log("📊 Running baseline update...\n");
    return runScript(cwd, "eval:baseline-update");
}
|
|
151
|
+
// ── baseline router ──
|
|
152
|
+
/**
 * Entry point for `evalai baseline <subcommand>`.
 *
 * Dispatches `init` and `update` to their handlers using the current working
 * directory. Any other first argument prints the usage text to stdout.
 *
 * @param {string[]} argv - Arguments after the `baseline` keyword.
 * @returns {number} The handler's exit code; for the usage path, 0 when help
 *   was requested with `--help`/`-h` and 1 otherwise.
 */
function runBaseline(argv) {
    const sub = argv[0];
    const cwd = process.cwd();
    switch (sub) {
        case "init":
            return runBaselineInit(cwd);
        case "update":
            return runBaselineUpdate(cwd);
        default:
            break;
    }
    const usage = `evalai baseline — Manage regression gate baselines

Usage:
evalai baseline init Create starter ${BASELINE_REL}
evalai baseline update Run tests and update baseline with real scores

Examples:
evalai baseline init
evalai baseline update
`;
    console.log(usage);
    const helpRequested = sub === "--help" || sub === "-h";
    return helpRequested ? 0 : 1;
}
|
|
@@ -81,7 +81,7 @@ function appendStepSummary(report) {
|
|
|
81
81
|
const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 80);
|
|
82
82
|
const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 80);
|
|
83
83
|
const reason = out ? `got "${out}"` : "no output";
|
|
84
|
-
lines.push(`- **${(0, snippet_1.truncateSnippet)(label, 60)}** — expected: ${exp || "(
|
|
84
|
+
lines.push(`- **${(0, snippet_1.truncateSnippet)(label, 60)}** — expected: ${exp || "(unknown)"}, ${reason}`);
|
|
85
85
|
}
|
|
86
86
|
if (failedCases.length > 10) {
|
|
87
87
|
lines.push(`- _+ ${failedCases.length - 10} more_`);
|
|
@@ -30,7 +30,7 @@ function formatHuman(report) {
|
|
|
30
30
|
const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 50);
|
|
31
31
|
const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 50);
|
|
32
32
|
const reason = out ? `got "${out}"` : "no output";
|
|
33
|
-
lines.push(` - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || "(
|
|
33
|
+
lines.push(` - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || "(unknown)"}, ${reason}`);
|
|
34
34
|
}
|
|
35
35
|
if (failedCases.length > toShow.length) {
|
|
36
36
|
lines.push(` + ${failedCases.length - toShow.length} more`);
|
package/dist/cli/index.js
CHANGED
|
@@ -8,9 +8,11 @@
|
|
|
8
8
|
* evalai check — CI/CD evaluation gate (see evalai check --help)
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
const baseline_1 = require("./baseline");
|
|
11
12
|
const check_1 = require("./check");
|
|
12
13
|
const doctor_1 = require("./doctor");
|
|
13
14
|
const init_1 = require("./init");
|
|
15
|
+
const regression_gate_1 = require("./regression-gate");
|
|
14
16
|
const share_1 = require("./share");
|
|
15
17
|
const argv = process.argv.slice(2);
|
|
16
18
|
const subcommand = argv[0];
|
|
@@ -19,6 +21,14 @@ if (subcommand === "init") {
|
|
|
19
21
|
const ok = (0, init_1.runInit)(cwd);
|
|
20
22
|
process.exit(ok ? 0 : 1);
|
|
21
23
|
}
|
|
24
|
+
else if (subcommand === "baseline") {
|
|
25
|
+
const code = (0, baseline_1.runBaseline)(argv.slice(1));
|
|
26
|
+
process.exit(code);
|
|
27
|
+
}
|
|
28
|
+
else if (subcommand === "gate") {
|
|
29
|
+
const code = (0, regression_gate_1.runGate)(argv.slice(1));
|
|
30
|
+
process.exit(code);
|
|
31
|
+
}
|
|
22
32
|
else if (subcommand === "doctor") {
|
|
23
33
|
(0, doctor_1.runDoctor)(argv.slice(1))
|
|
24
34
|
.then((code) => process.exit(code))
|
|
@@ -57,10 +67,16 @@ else {
|
|
|
57
67
|
console.log(`EvalAI CLI
|
|
58
68
|
|
|
59
69
|
Usage:
|
|
60
|
-
evalai init
|
|
61
|
-
evalai
|
|
62
|
-
evalai
|
|
63
|
-
evalai
|
|
70
|
+
evalai init Create evalai.config.json
|
|
71
|
+
evalai baseline init Create starter evals/baseline.json
|
|
72
|
+
evalai baseline update Run tests and update baseline with real scores
|
|
73
|
+
evalai gate [options] Run regression gate (local test-based)
|
|
74
|
+
evalai doctor [options] Verify CI/CD setup (same endpoint as check)
|
|
75
|
+
evalai check [options] CI/CD evaluation gate (API-based)
|
|
76
|
+
evalai share [options] Create share link for a run
|
|
77
|
+
|
|
78
|
+
Options for gate:
|
|
79
|
+
--format <fmt> Output format: human (default), json, github
|
|
64
80
|
|
|
65
81
|
Options for check:
|
|
66
82
|
--evaluationId <id> Evaluation to gate on (or from config)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalai gate — Run the regression gate
|
|
3
|
+
*
|
|
4
|
+
* Delegates to the project's eval:regression-gate npm script.
|
|
5
|
+
* Supports --format json to output the regression-report.json contents.
|
|
6
|
+
*/
|
|
7
|
+
export interface GateArgs {
|
|
8
|
+
format: "human" | "json" | "github";
|
|
9
|
+
}
|
|
10
|
+
export declare function parseGateArgs(argv: string[]): GateArgs;
|
|
11
|
+
export declare function runGate(argv: string[]): number;
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalai gate — Run the regression gate
|
|
4
|
+
*
|
|
5
|
+
* Delegates to the project's eval:regression-gate npm script.
|
|
6
|
+
* Supports --format json to output the regression-report.json contents.
|
|
7
|
+
*/
|
|
8
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
+
if (k2 === undefined) k2 = k;
|
|
10
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
+
}
|
|
14
|
+
Object.defineProperty(o, k2, desc);
|
|
15
|
+
}) : (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
o[k2] = m[k];
|
|
18
|
+
}));
|
|
19
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
+
}) : function(o, v) {
|
|
22
|
+
o["default"] = v;
|
|
23
|
+
});
|
|
24
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
+
var ownKeys = function(o) {
|
|
26
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
+
var ar = [];
|
|
28
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
+
return ar;
|
|
30
|
+
};
|
|
31
|
+
return ownKeys(o);
|
|
32
|
+
};
|
|
33
|
+
return function (mod) {
|
|
34
|
+
if (mod && mod.__esModule) return mod;
|
|
35
|
+
var result = {};
|
|
36
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
+
__setModuleDefault(result, mod);
|
|
38
|
+
return result;
|
|
39
|
+
};
|
|
40
|
+
})();
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.parseGateArgs = parseGateArgs;
|
|
43
|
+
exports.runGate = runGate;
|
|
44
|
+
const node_child_process_1 = require("node:child_process");
|
|
45
|
+
const fs = __importStar(require("node:fs"));
|
|
46
|
+
const path = __importStar(require("node:path"));
|
|
47
|
+
const REPORT_REL = "evals/regression-report.json";
|
|
48
|
+
/**
 * Infer which package manager a project uses by looking for its lockfile.
 *
 * `pnpm-lock.yaml` takes precedence over `yarn.lock`; with neither present
 * the answer is npm.
 *
 * @param {string} cwd - Directory expected to contain the lockfile.
 * @returns {"pnpm" | "yarn" | "npm"} Package manager executable name.
 */
function detectPackageManager(cwd) {
    const hasLockfile = (fileName) => fs.existsSync(path.join(cwd, fileName));
    if (hasLockfile("pnpm-lock.yaml")) {
        return "pnpm";
    }
    return hasLockfile("yarn.lock") ? "yarn" : "npm";
}
|
|
56
|
+
/**
 * Parse the command-line arguments of `evalai gate`.
 *
 * Recognises the output format in either spelling: `--format <fmt>` or
 * `--format=<fmt>`. Accepted formats are `human`, `json` and `github`; an
 * unrecognised value is ignored and leaves the current format in place. When
 * the flag is given more than once, the last valid value wins. All other
 * arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the `gate` keyword.
 * @returns {{ format: "human" | "json" | "github" }} Parsed options;
 *   `format` defaults to `"human"`.
 */
function parseGateArgs(argv) {
    const FORMAT_FLAG = "--format";
    const accepted = ["human", "json", "github"];
    const args = { format: "human" };
    for (let i = 0; i < argv.length; i++) {
        const token = argv[i];
        let candidate;
        if (token === FORMAT_FLAG && argv[i + 1]) {
            candidate = argv[i + 1];
            // The value is consumed even when it is not a known format.
            i++;
        }
        else if (typeof token === "string" && token.startsWith(`${FORMAT_FLAG}=`)) {
            candidate = token.slice(FORMAT_FLAG.length + 1);
        }
        if (candidate !== undefined && accepted.includes(candidate)) {
            args.format = candidate;
        }
    }
    return args;
}
|
|
69
|
+
/**
 * Render a parsed regression report as GitHub-flavoured markdown.
 *
 * Produces a heading with the report category, a delta table with one row per
 * entry in `report.deltas`, an optional "Failures" list, and the schema version.
 *
 * @param {object} report - Parsed contents of the regression report file.
 * @returns {string} Markdown text without a trailing newline.
 */
function buildGithubSummary(report) {
    const icon = report.passed ? "✅" : "❌";
    const lines = [
        `## ${icon} Regression Gate: ${report.category}`,
        "",
        "| Metric | Baseline | Current | Delta | Status |",
        "|--------|----------|---------|-------|--------|",
    ];
    for (const d of report.deltas ?? []) {
        const statusIcon = d.status === "pass" ? "✅" : "❌";
        lines.push(`| ${d.metric} | ${d.baseline} | ${d.current} | ${d.delta} | ${statusIcon} |`);
    }
    if (report.failures?.length > 0) {
        lines.push("", "### Failures", "");
        for (const f of report.failures) {
            lines.push(`- ${f}`);
        }
    }
    lines.push("", `Schema version: ${report.schemaVersion ?? "unknown"}`);
    return lines.join("\n");
}
/**
 * Run the regression gate by delegating to the project's
 * `eval:regression-gate` package.json script, then emit the report in the
 * requested format.
 *
 * - `human` (default): the script's own output is shown as-is.
 * - `json`: the script's output is captured and the contents of
 *   `evals/regression-report.json` are written to stdout. If the report is
 *   missing, a JSON error object goes to stderr, preceded by whatever the
 *   script wrote to its own stderr so the failure can be diagnosed.
 * - `github`: the script's output is shown, then a markdown summary built from
 *   the report is printed and appended to `$GITHUB_STEP_SUMMARY` when set.
 *
 * NOTE(review): precondition failures here return 1, while the changelog's
 * exit-code taxonomy lists 1 as "regression" and 2 as "infra_error" — confirm
 * whether these paths should return 2.
 *
 * @param {string[]} argv - Arguments after the `gate` keyword.
 * @returns {number} 1 when package.json is missing, unreadable, or lacks the
 *   script; otherwise the script's exit status (1 if none is available).
 */
function runGate(argv) {
    const cwd = process.cwd();
    const args = parseGateArgs(argv);
    // The gate itself lives in the project: require the script before spawning.
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch {
        console.error("❌ Failed to parse package.json");
        return 1;
    }
    // `pkg?.` guards against valid JSON that is not an object, such as `null`.
    if (!pkg?.scripts?.["eval:regression-gate"]) {
        console.error("❌ Missing 'eval:regression-gate' script in package.json.");
        console.error(' Add it: "eval:regression-gate": "npx tsx scripts/regression-gate.ts"');
        return 1;
    }
    const pm = detectPackageManager(cwd);
    const isWin = process.platform === "win32";
    // For json format, capture the script's output so stdout carries only the report JSON.
    const stdio = args.format === "json" ? "pipe" : "inherit";
    const result = (0, node_child_process_1.spawnSync)(pm, ["run", "eval:regression-gate"], {
        cwd,
        stdio: stdio,
        shell: isWin,
    });
    if (result.error) {
        // spawnSync reports spawn failures via `error` rather than throwing.
        console.error(`❌ Failed to run '${pm} run eval:regression-gate': ${result.error.message}`);
    }
    const exitCode = result.status ?? 1;
    const reportPath = path.join(cwd, REPORT_REL);
    if (args.format === "json") {
        if (fs.existsSync(reportPath)) {
            const report = fs.readFileSync(reportPath, "utf-8");
            process.stdout.write(report);
        }
        else {
            // No report to show: forward the captured diagnostics instead of dropping them.
            if (result.stderr?.length) {
                process.stderr.write(result.stderr);
            }
            console.error(JSON.stringify({ error: "regression-report.json not found", exitCode }));
        }
    }
    else if (args.format === "github") {
        if (fs.existsSync(reportPath)) {
            try {
                const report = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
                const md = buildGithubSummary(report);
                // Write to $GITHUB_STEP_SUMMARY if available
                const summaryPath = process.env.GITHUB_STEP_SUMMARY;
                if (summaryPath) {
                    fs.appendFileSync(summaryPath, `${md}\n`);
                }
                console.log(md);
            }
            catch (err) {
                // Best effort: the script's human output was already printed, so
                // a broken report only costs the summary — but say so.
                const reason = err instanceof Error ? err.message : String(err);
                console.error(`⚠ Could not produce GitHub summary from ${REPORT_REL}: ${reason}`);
            }
        }
    }
    return exitCode;
}
|