@pauly4010/evalai-sdk 1.5.8 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/index.js +20 -4
- package/dist/cli/regression-gate.d.ts +11 -0
- package/dist/cli/regression-gate.js +150 -0
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +157 -157
- package/dist/index.d.ts +1 -0
- package/dist/index.js +7 -1
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +6 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,34 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.6.0] - 2026-02-24
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
#### CLI — Regression Gate & Baseline Management
|
|
13
|
+
|
|
14
|
+
- **`evalai baseline init`** — Create a starter `evals/baseline.json` with sample values and provenance metadata
|
|
15
|
+
- **`evalai baseline update`** — Run confidence tests, golden eval, and latency benchmark, then update baseline with real scores
|
|
16
|
+
- **`evalai gate`** — Run the local regression gate with proper exit code taxonomy (0=pass, 1=regression, 2=infra_error, 3=confidence_failed, 4=confidence_missing)
|
|
17
|
+
- **`evalai gate --format json`** — Output `evals/regression-report.json` as machine-readable JSON to stdout
|
|
18
|
+
- **`evalai gate --format github`** — Output GitHub Step Summary markdown with delta table
|
|
19
|
+
|
|
20
|
+
#### SDK Exports — Regression Gate Constants & Types
|
|
21
|
+
|
|
22
|
+
- **`GATE_EXIT`** — Exit code constants (`PASS`, `REGRESSION`, `INFRA_ERROR`, `CONFIDENCE_FAILED`, `CONFIDENCE_MISSING`)
|
|
23
|
+
- **`GATE_CATEGORY`** — Report category constants (`pass`, `regression`, `infra_error`)
|
|
24
|
+
- **`REPORT_SCHEMA_VERSION`** — Current schema version for `regression-report.json`
|
|
25
|
+
- **`ARTIFACTS`** — Well-known artifact paths (`BASELINE`, `REGRESSION_REPORT`, `CONFIDENCE_SUMMARY`, `LATENCY_BENCHMARK`)
|
|
26
|
+
- **Types**: `RegressionReport`, `RegressionDelta`, `Baseline`, `BaselineTolerance`, `GateExitCode`, `GateCategory`
|
|
27
|
+
- **Subpath export**: `@pauly4010/evalai-sdk/regression` for tree-shakeable imports
|
|
28
|
+
|
|
29
|
+
### 🔧 Changed
|
|
30
|
+
|
|
31
|
+
- CLI help text updated to include `baseline` and `gate` commands
|
|
32
|
+
- The SDK is now the public contract for the regression gate — project scripts are an implementation detail
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
8
36
|
## [1.5.8] - 2026-02-22
|
|
9
37
|
|
|
10
38
|
### 🐛 Fixed
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalai baseline — Baseline management commands
|
|
3
|
+
*
|
|
4
|
+
* Subcommands:
|
|
5
|
+
* evalai baseline init — Create a starter evals/baseline.json
|
|
6
|
+
* evalai baseline update — Run tests + update baseline with real scores
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Writes a starter `evals/baseline.json` (sample values plus provenance
 * fields) under `cwd`.
 *
 * @param cwd - Project root in which `evals/baseline.json` is created.
 * @returns `0` when the file was created, `1` otherwise (for example when the
 *   baseline file already exists).
 */
export declare function runBaselineInit(cwd: string): number;
|
|
9
|
+
/**
 * Runs the project's `eval:baseline-update` package.json script through the
 * detected package manager.
 *
 * @param cwd - Project root containing `package.json`.
 * @returns The script's exit status, or `1` when `package.json` is missing,
 *   cannot be parsed, or does not define the script.
 */
export declare function runBaselineUpdate(cwd: string): number;
|
|
10
|
+
/**
 * Entry point for `evalai baseline <subcommand>`.
 *
 * @param argv - Arguments after `baseline`; `argv[0]` selects `init` or
 *   `update`. Anything else prints the usage text.
 * @returns The subcommand's exit code; for the usage text, `0` when
 *   `--help`/`-h` was given and `1` otherwise.
 */
export declare function runBaseline(argv: string[]): number;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalai baseline — Baseline management commands
|
|
4
|
+
*
|
|
5
|
+
* Subcommands:
|
|
6
|
+
* evalai baseline init — Create a starter evals/baseline.json
|
|
7
|
+
* evalai baseline update — Run tests + update baseline with real scores
|
|
8
|
+
*/
|
|
9
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
12
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
13
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
14
|
+
}
|
|
15
|
+
Object.defineProperty(o, k2, desc);
|
|
16
|
+
}) : (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
o[k2] = m[k];
|
|
19
|
+
}));
|
|
20
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
21
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
22
|
+
}) : function(o, v) {
|
|
23
|
+
o["default"] = v;
|
|
24
|
+
});
|
|
25
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
26
|
+
var ownKeys = function(o) {
|
|
27
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
28
|
+
var ar = [];
|
|
29
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
30
|
+
return ar;
|
|
31
|
+
};
|
|
32
|
+
return ownKeys(o);
|
|
33
|
+
};
|
|
34
|
+
return function (mod) {
|
|
35
|
+
if (mod && mod.__esModule) return mod;
|
|
36
|
+
var result = {};
|
|
37
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
38
|
+
__setModuleDefault(result, mod);
|
|
39
|
+
return result;
|
|
40
|
+
};
|
|
41
|
+
})();
|
|
42
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
43
|
+
exports.runBaselineInit = runBaselineInit;
|
|
44
|
+
exports.runBaselineUpdate = runBaselineUpdate;
|
|
45
|
+
exports.runBaseline = runBaseline;
|
|
46
|
+
const node_child_process_1 = require("node:child_process");
|
|
47
|
+
const fs = __importStar(require("node:fs"));
|
|
48
|
+
const path = __importStar(require("node:path"));
|
|
49
|
+
const BASELINE_REL = "evals/baseline.json";
|
|
50
|
+
/**
 * Picks the package manager for a project by looking for its lockfile.
 *
 * `pnpm-lock.yaml` is checked first, then `yarn.lock`; when neither file
 * exists directly inside `cwd` the answer is "npm".
 */
function detectPackageManager(cwd) {
    // Ordered by precedence: the first lockfile found wins.
    const lockfiles = [
        ["pnpm-lock.yaml", "pnpm"],
        ["yarn.lock", "yarn"],
    ];
    for (const [lockfile, manager] of lockfiles) {
        if (fs.existsSync(path.join(cwd, lockfile))) {
            return manager;
        }
    }
    return "npm";
}
|
|
58
|
+
/**
 * Runs a package.json script through the project's package manager
 * (`<pm> run <scriptName>`), with the child sharing this process's stdio.
 *
 * @param cwd - Project root; used both to detect the package manager and as
 *   the child's working directory.
 * @param scriptName - Name of the script to run.
 * @returns The child's exit status, or `1` when there is none (the process
 *   could not be started or was terminated by a signal).
 */
function runScript(cwd, scriptName) {
    const pm = detectPackageManager(cwd);
    const result = (0, node_child_process_1.spawnSync)(pm, ["run", scriptName], {
        cwd,
        stdio: "inherit",
        // A shell is only used on Windows — presumably because the package
        // managers are installed as .cmd shims there (TODO confirm).
        shell: process.platform === "win32",
    });
    if (result.error) {
        // spawnSync reports launch failures (e.g. the package manager is not
        // on PATH) via `error` rather than throwing; surface it instead of
        // failing with a bare exit code.
        console.error(`❌ Failed to run '${pm} run ${scriptName}': ${result.error.message}`);
    }
    return result.status ?? 1;
}
|
|
69
|
+
/**
 * Creates a starter baseline file (`BASELINE_REL`) under `cwd`, filled with
 * sample values and provenance metadata.
 *
 * An existing baseline is never overwritten: the function warns and returns 1.
 *
 * @param cwd - Project root in which the baseline file is created.
 * @returns `0` when the file was written, `1` when it already exists or could
 *   not be created.
 */
function runBaselineInit(cwd) {
    const baselinePath = path.join(cwd, BASELINE_REL);
    if (fs.existsSync(baselinePath)) {
        console.log(`⚠ ${BASELINE_REL} already exists. Delete it first or use 'evalai baseline update'.`);
        return 1;
    }
    const user = process.env.USER || process.env.USERNAME || "unknown";
    const now = new Date().toISOString();
    const baseline = {
        schemaVersion: 1,
        description: "Regression gate baseline — created by evalai baseline init",
        generatedAt: now,
        generatedBy: user,
        // Placeholder value; nothing here looks up the real commit.
        commitSha: "0000000",
        updatedAt: now,
        updatedBy: user,
        tolerance: {
            scoreDrop: 5,
            passRateDrop: 5,
            maxLatencyIncreaseMs: 200,
            maxCostIncreaseUsd: 0.05,
        },
        goldenEval: {
            score: 100,
            passRate: 100,
            totalCases: 3,
            passedCases: 3,
        },
        qualityScore: {
            overall: 90,
            grade: "A",
            accuracy: 85,
            safety: 100,
            latency: 90,
            cost: 90,
            consistency: 90,
        },
        confidenceTests: {
            unitPassed: true,
            unitTotal: 0,
            dbPassed: true,
            dbTotal: 0,
        },
        productMetrics: {},
    };
    try {
        // Derive the directory from the baseline path so the two cannot drift
        // apart; a recursive mkdir is a no-op when the directory exists.
        fs.mkdirSync(path.dirname(baselinePath), { recursive: true });
        // "wx" makes the write fail instead of overwriting if the file was
        // created between the existence check above and this call.
        fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`, { flag: "wx" });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`❌ Failed to create ${BASELINE_REL}: ${reason}`);
        return 1;
    }
    console.log(`✅ Created ${BASELINE_REL} with sample values\n`);
    console.log("Next steps:");
    console.log(` 1. Commit ${BASELINE_REL} to your repo`);
    console.log(" 2. Run 'evalai baseline update' to populate with real scores");
    console.log(" 3. Run 'evalai gate' to verify the regression gate\n");
    return 0;
}
|
|
127
|
+
// ── baseline update ──
/**
 * Runs the project's `eval:baseline-update` package.json script.
 *
 * The script must be declared in `<cwd>/package.json`; when it is not, a hint
 * showing how to add it is printed instead of running anything.
 *
 * @param cwd - Project root containing `package.json`.
 * @returns The script's exit status, or `1` when `package.json` is missing,
 *   unparsable, or does not define the script.
 */
function runBaselineUpdate(cwd) {
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch {
        console.error("❌ Failed to parse package.json");
        return 1;
    }
    // JSON.parse accepts `null` and primitives too; only read `scripts` from an
    // actual object so a degenerate package.json reports "missing script"
    // rather than throwing a TypeError.
    const scripts = pkg !== null && typeof pkg === "object" ? pkg.scripts : undefined;
    if (!scripts?.["eval:baseline-update"]) {
        console.error("❌ Missing 'eval:baseline-update' script in package.json.");
        console.error(" Add it: \"eval:baseline-update\": \"npx tsx scripts/regression-gate.ts --update-baseline\"");
        return 1;
    }
    console.log("📊 Running baseline update...\n");
    return runScript(cwd, "eval:baseline-update");
}
|
|
151
|
+
// ── baseline router ──
|
|
152
|
+
/**
 * Router for `evalai baseline <subcommand>`.
 *
 * `argv[0]` selects the subcommand: "init" calls runBaselineInit and "update"
 * calls runBaselineUpdate, both with the current working directory. Any other
 * value (including none) prints the usage text to stdout.
 *
 * Returns the subcommand's exit code; for the usage text, 0 when the argument
 * was --help or -h and 1 otherwise.
 */
function runBaseline(argv) {
|
|
153
|
+
const sub = argv[0];
|
|
154
|
+
const cwd = process.cwd();
|
|
155
|
+
if (sub === "init") {
|
|
156
|
+
return runBaselineInit(cwd);
|
|
157
|
+
}
|
|
158
|
+
if (sub === "update") {
|
|
159
|
+
return runBaselineUpdate(cwd);
|
|
160
|
+
}
|
|
161
|
+
console.log(`evalai baseline — Manage regression gate baselines
|
|
162
|
+
|
|
163
|
+
Usage:
|
|
164
|
+
evalai baseline init Create starter ${BASELINE_REL}
|
|
165
|
+
evalai baseline update Run tests and update baseline with real scores
|
|
166
|
+
|
|
167
|
+
Examples:
|
|
168
|
+
evalai baseline init
|
|
169
|
+
evalai baseline update
|
|
170
|
+
`);
|
|
171
|
+
// Explicitly asking for help is a success; a missing or unknown subcommand is an error.
return sub === "--help" || sub === "-h" ? 0 : 1;
|
|
172
|
+
}
|
package/dist/cli/index.js
CHANGED
|
@@ -8,9 +8,11 @@
|
|
|
8
8
|
* evalai check — CI/CD evaluation gate (see evalai check --help)
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
const baseline_1 = require("./baseline");
|
|
11
12
|
const check_1 = require("./check");
|
|
12
13
|
const doctor_1 = require("./doctor");
|
|
13
14
|
const init_1 = require("./init");
|
|
15
|
+
const regression_gate_1 = require("./regression-gate");
|
|
14
16
|
const share_1 = require("./share");
|
|
15
17
|
const argv = process.argv.slice(2);
|
|
16
18
|
const subcommand = argv[0];
|
|
@@ -19,6 +21,14 @@ if (subcommand === "init") {
|
|
|
19
21
|
const ok = (0, init_1.runInit)(cwd);
|
|
20
22
|
process.exit(ok ? 0 : 1);
|
|
21
23
|
}
|
|
24
|
+
else if (subcommand === "baseline") {
|
|
25
|
+
const code = (0, baseline_1.runBaseline)(argv.slice(1));
|
|
26
|
+
process.exit(code);
|
|
27
|
+
}
|
|
28
|
+
else if (subcommand === "gate") {
|
|
29
|
+
const code = (0, regression_gate_1.runGate)(argv.slice(1));
|
|
30
|
+
process.exit(code);
|
|
31
|
+
}
|
|
22
32
|
else if (subcommand === "doctor") {
|
|
23
33
|
(0, doctor_1.runDoctor)(argv.slice(1))
|
|
24
34
|
.then((code) => process.exit(code))
|
|
@@ -57,10 +67,16 @@ else {
|
|
|
57
67
|
console.log(`EvalAI CLI
|
|
58
68
|
|
|
59
69
|
Usage:
|
|
60
|
-
evalai init
|
|
61
|
-
evalai
|
|
62
|
-
evalai
|
|
63
|
-
evalai
|
|
70
|
+
evalai init Create evalai.config.json
|
|
71
|
+
evalai baseline init Create starter evals/baseline.json
|
|
72
|
+
evalai baseline update Run tests and update baseline with real scores
|
|
73
|
+
evalai gate [options] Run regression gate (local test-based)
|
|
74
|
+
evalai doctor [options] Verify CI/CD setup (same endpoint as check)
|
|
75
|
+
evalai check [options] CI/CD evaluation gate (API-based)
|
|
76
|
+
evalai share [options] Create share link for a run
|
|
77
|
+
|
|
78
|
+
Options for gate:
|
|
79
|
+
--format <fmt> Output format: human (default), json, github
|
|
64
80
|
|
|
65
81
|
Options for check:
|
|
66
82
|
--evaluationId <id> Evaluation to gate on (or from config)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalai gate — Run the regression gate
|
|
3
|
+
*
|
|
4
|
+
* Delegates to the project's eval:regression-gate npm script.
|
|
5
|
+
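* Supports --format json (prints the regression-report.json contents to stdout) and --format github (prints a Markdown summary and appends it to $GITHUB_STEP_SUMMARY when that variable is set).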
|
|
6
|
+
*/
|
|
7
|
+
/** Options accepted by `evalai gate`. */
export interface GateArgs {
|
|
8
|
+
/** Output format; "human" is the default when `--format` is absent or not one of these values. */
format: "human" | "json" | "github";
|
|
9
|
+
}
|
|
10
|
+
/**
 * Reads the `--format` option from the arguments that follow `gate`.
 *
 * @param argv - Arguments after the `gate` subcommand.
 * @returns The parsed options; an unrecognised format value leaves the
 *   default in place.
 */
export declare function parseGateArgs(argv: string[]): GateArgs;
|
|
11
|
+
/**
 * Runs the project's `eval:regression-gate` package.json script from the
 * current working directory and, depending on the format, prints
 * `evals/regression-report.json` as JSON or as a Markdown summary.
 *
 * @param argv - Arguments after the `gate` subcommand.
 * @returns The script's exit status, or `1` when `package.json` is missing,
 *   unparsable, lacks the script, or the script produced no exit status.
 */
export declare function runGate(argv: string[]): number;
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalai gate — Run the regression gate
|
|
4
|
+
*
|
|
5
|
+
* Delegates to the project's eval:regression-gate npm script.
|
|
6
|
+
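* Supports --format json (prints the regression-report.json contents to stdout) and --format github (prints a Markdown summary and appends it to $GITHUB_STEP_SUMMARY when that variable is set).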
|
|
7
|
+
*/
|
|
8
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
+
if (k2 === undefined) k2 = k;
|
|
10
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
+
}
|
|
14
|
+
Object.defineProperty(o, k2, desc);
|
|
15
|
+
}) : (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
o[k2] = m[k];
|
|
18
|
+
}));
|
|
19
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
+
}) : function(o, v) {
|
|
22
|
+
o["default"] = v;
|
|
23
|
+
});
|
|
24
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
+
var ownKeys = function(o) {
|
|
26
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
+
var ar = [];
|
|
28
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
+
return ar;
|
|
30
|
+
};
|
|
31
|
+
return ownKeys(o);
|
|
32
|
+
};
|
|
33
|
+
return function (mod) {
|
|
34
|
+
if (mod && mod.__esModule) return mod;
|
|
35
|
+
var result = {};
|
|
36
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
+
__setModuleDefault(result, mod);
|
|
38
|
+
return result;
|
|
39
|
+
};
|
|
40
|
+
})();
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.parseGateArgs = parseGateArgs;
|
|
43
|
+
exports.runGate = runGate;
|
|
44
|
+
const node_child_process_1 = require("node:child_process");
|
|
45
|
+
const fs = __importStar(require("node:fs"));
|
|
46
|
+
const path = __importStar(require("node:path"));
|
|
47
|
+
const REPORT_REL = "evals/regression-report.json";
|
|
48
|
+
/**
 * Infers which package manager a project uses from the lockfile in `cwd`:
 * pnpm if `pnpm-lock.yaml` is present, otherwise yarn if `yarn.lock` is
 * present, otherwise npm.
 */
function detectPackageManager(cwd) {
    const hasFile = (fileName) => fs.existsSync(path.join(cwd, fileName));
    if (hasFile("pnpm-lock.yaml")) {
        return "pnpm";
    }
    return hasFile("yarn.lock") ? "yarn" : "npm";
}
|
|
56
|
+
/**
 * Parses the arguments of `evalai gate`.
 *
 * Recognises the output format in either spelling, `--format <fmt>` or
 * `--format=<fmt>`, where `<fmt>` is "human", "json" or "github". Any other
 * value is ignored and the format keeps its previous value ("human" by
 * default). When the option is repeated, the last valid value wins.
 *
 * @param argv - Arguments after the `gate` subcommand.
 * @returns An object of the form `{ format }`.
 */
function parseGateArgs(argv) {
    const args = { format: "human" };
    const inlinePrefix = "--format=";
    const isKnownFormat = (value) => value === "json" || value === "github" || value === "human";
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        let candidate;
        if (arg === "--format" && argv[i + 1]) {
            candidate = argv[i + 1];
            // The value token belongs to --format; skip it even when it is not
            // a known format so it is not re-read as a flag.
            i++;
        }
        else if (typeof arg === "string" && arg.startsWith(inlinePrefix)) {
            candidate = arg.slice(inlinePrefix.length);
        }
        else {
            continue;
        }
        if (isKnownFormat(candidate)) {
            args.format = candidate;
        }
    }
    return args;
}
|
|
69
|
+
/**
 * Runs the project's `eval:regression-gate` package.json script from the
 * current working directory and post-processes its report.
 *
 * Output depends on the parsed `--format`:
 *  - "human":  the script's own output is shown as-is.
 *  - "json":   the script's output is discarded and the contents of
 *              `REPORT_REL` are written to stdout (an error object goes to
 *              stderr when the report is absent).
 *  - "github": the script's output is shown, then a Markdown summary of the
 *              report is printed and appended to `$GITHUB_STEP_SUMMARY` when
 *              that variable is set.
 *
 * @param argv - Arguments after the `gate` subcommand.
 * @returns The script's exit status, or `1` when `package.json` is missing,
 *   unparsable, lacks the script, or the script produced no exit status.
 */
function runGate(argv) {
    const cwd = process.cwd();
    const args = parseGateArgs(argv);
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch {
        console.error("❌ Failed to parse package.json");
        return 1;
    }
    // JSON.parse accepts `null` and primitives; only read `scripts` from an
    // actual object so a degenerate package.json cannot throw here.
    const scripts = pkg !== null && typeof pkg === "object" ? pkg.scripts : undefined;
    if (!scripts?.["eval:regression-gate"]) {
        console.error("❌ Missing 'eval:regression-gate' script in package.json.");
        console.error(' Add it: "eval:regression-gate": "npx tsx scripts/regression-gate.ts"');
        return 1;
    }
    const pm = detectPackageManager(cwd);
    const result = (0, node_child_process_1.spawnSync)(pm, ["run", "eval:regression-gate"], {
        cwd,
        // In json mode the script's output is never read, so discard it with
        // "ignore" rather than "pipe": piped output is buffered and the child
        // is terminated once it exceeds spawnSync's maxBuffer, which would
        // surface as a bogus exit code 1.
        stdio: args.format === "json" ? "ignore" : "inherit",
        shell: process.platform === "win32",
    });
    if (result.error) {
        // Launch failures (e.g. package manager not on PATH) arrive here, not
        // as an exception. Report on stderr so json-mode stdout stays clean.
        console.error(`❌ Failed to run '${pm} run eval:regression-gate': ${result.error.message}`);
    }
    const exitCode = result.status ?? 1;
    const reportPath = path.join(cwd, REPORT_REL);
    // NOTE(review): nothing here checks that the report was written by this
    // run; a report left over from an earlier run would be emitted if the
    // script failed before rewriting it — confirm whether that matters.
    if (args.format === "json") {
        if (fs.existsSync(reportPath)) {
            process.stdout.write(fs.readFileSync(reportPath, "utf-8"));
        }
        else {
            console.error(JSON.stringify({ error: "regression-report.json not found", exitCode }));
        }
    }
    else if (args.format === "github" && fs.existsSync(reportPath)) {
        try {
            const report = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
            const md = renderGateSummaryMarkdown(report);
            const summaryPath = process.env.GITHUB_STEP_SUMMARY;
            if (summaryPath) {
                fs.appendFileSync(summaryPath, `${md}\n`);
            }
            console.log(md);
        }
        catch (err) {
            // Best effort: the script's own output has already been shown, so
            // a broken report must not change the exit code — but say why the
            // summary is missing instead of failing silently.
            const reason = err instanceof Error ? err.message : String(err);
            console.error(`⚠ Could not render GitHub summary from ${REPORT_REL}: ${reason}`);
        }
    }
    return exitCode;
}
/**
 * Renders a parsed regression report as Markdown: a heading with the overall
 * result, a table with one row per entry in `report.deltas`, an optional
 * "Failures" list, and the report's schema version.
 */
function renderGateSummaryMarkdown(report) {
    // Keep each value inside its table cell: escape pipes and flatten newlines.
    const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
    const icon = report.passed ? "✅" : "❌";
    const lines = [
        `## ${icon} Regression Gate: ${report.category}`,
        "",
        "| Metric | Baseline | Current | Delta | Status |",
        "|--------|----------|---------|-------|--------|",
    ];
    for (const d of report.deltas ?? []) {
        const statusIcon = d.status === "pass" ? "✅" : "❌";
        lines.push(`| ${cell(d.metric)} | ${cell(d.baseline)} | ${cell(d.current)} | ${cell(d.delta)} | ${statusIcon} |`);
    }
    if (report.failures?.length > 0) {
        lines.push("", "### Failures", "");
        for (const f of report.failures) {
            lines.push(`- ${f}`);
        }
    }
    lines.push("", `Schema version: ${report.schemaVersion ?? "unknown"}`);
    return lines.join("\n");
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export {};
|
|
1
|
+
export {};
|
|
@@ -1,157 +1,157 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
const vitest_1 = require("vitest");
|
|
37
|
-
const client_1 = require("./client");
|
|
38
|
-
const errorsModule = __importStar(require("./errors"));
|
|
39
|
-
vitest_1.vi.mock("./cache", () => {
|
|
40
|
-
const cacheTracker = { invalidatedPatterns: [] };
|
|
41
|
-
const shouldCache = vitest_1.vi.fn().mockReturnValue(true);
|
|
42
|
-
const getTTL = vitest_1.vi.fn().mockReturnValue(1000);
|
|
43
|
-
const makeKey = (method, url, params) => `${method}:${url}:${JSON.stringify(params ?? null)}`;
|
|
44
|
-
return {
|
|
45
|
-
__esModule: true,
|
|
46
|
-
shouldCache,
|
|
47
|
-
getTTL,
|
|
48
|
-
cacheTracker,
|
|
49
|
-
RequestCache: class RequestCache {
|
|
50
|
-
constructor() {
|
|
51
|
-
this.store = new Map();
|
|
52
|
-
}
|
|
53
|
-
get(method, url, params) {
|
|
54
|
-
const key = makeKey(method, url, params);
|
|
55
|
-
return this.store.get(key) ?? null;
|
|
56
|
-
}
|
|
57
|
-
set(method, url, data, _ttl, params) {
|
|
58
|
-
const key = makeKey(method, url, params);
|
|
59
|
-
this.store.set(key, data);
|
|
60
|
-
}
|
|
61
|
-
invalidatePattern(pattern) {
|
|
62
|
-
cacheTracker.invalidatedPatterns.push(pattern);
|
|
63
|
-
}
|
|
64
|
-
invalidate(_method, _url, _params) {
|
|
65
|
-
// no-op for tests
|
|
66
|
-
}
|
|
67
|
-
clear() {
|
|
68
|
-
this.store.clear();
|
|
69
|
-
}
|
|
70
|
-
},
|
|
71
|
-
};
|
|
72
|
-
});
|
|
73
|
-
const cache_1 = require("./cache");
|
|
74
|
-
(0, vitest_1.describe)("AIEvalClient.request", () => {
|
|
75
|
-
(0, vitest_1.beforeEach)(() => {
|
|
76
|
-
process.env.EVALAI_API_KEY = "test";
|
|
77
|
-
cache_1.shouldCache.mockReset().mockReturnValue(true);
|
|
78
|
-
cache_1.getTTL.mockReset().mockReturnValue(1000);
|
|
79
|
-
cache_1.cacheTracker.invalidatedPatterns.length = 0;
|
|
80
|
-
});
|
|
81
|
-
(0, vitest_1.it)("caches GET responses and reuses data without re-fetching", async () => {
|
|
82
|
-
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
83
|
-
const payload = { items: [1, 2, 3] };
|
|
84
|
-
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
85
|
-
ok: true,
|
|
86
|
-
status: 200,
|
|
87
|
-
json: async () => payload,
|
|
88
|
-
});
|
|
89
|
-
globalThis.fetch = fetchMock;
|
|
90
|
-
const first = await client.request("/api/traces", { method: "GET" });
|
|
91
|
-
const second = await client.request("/api/traces", { method: "GET" });
|
|
92
|
-
(0, vitest_1.expect)(first).toEqual(payload);
|
|
93
|
-
(0, vitest_1.expect)(second).toEqual(payload);
|
|
94
|
-
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(1);
|
|
95
|
-
});
|
|
96
|
-
(0, vitest_1.it)("propagates non-ok responses as SDK errors", async () => {
|
|
97
|
-
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost" });
|
|
98
|
-
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
99
|
-
ok: false,
|
|
100
|
-
status: 429,
|
|
101
|
-
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
102
|
-
});
|
|
103
|
-
globalThis.fetch = fetchMock;
|
|
104
|
-
const createErrorSpy = vitest_1.vi
|
|
105
|
-
.spyOn(errorsModule, "createErrorFromResponse")
|
|
106
|
-
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
107
|
-
await (0, vitest_1.expect)(client.request("/api/fail", { method: "GET" })).rejects.toHaveProperty("code", "RATE_LIMIT_EXCEEDED");
|
|
108
|
-
createErrorSpy.mockRestore();
|
|
109
|
-
});
|
|
110
|
-
(0, vitest_1.it)("retries on retryable SDK errors and eventually succeeds", async () => {
|
|
111
|
-
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
112
|
-
vitest_1.vi.spyOn(client, "calculateBackoff").mockReturnValue(0);
|
|
113
|
-
const failureResponse = {
|
|
114
|
-
ok: false,
|
|
115
|
-
status: 429,
|
|
116
|
-
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
117
|
-
};
|
|
118
|
-
const successResponse = {
|
|
119
|
-
ok: true,
|
|
120
|
-
status: 200,
|
|
121
|
-
json: async () => ({ ok: true }),
|
|
122
|
-
};
|
|
123
|
-
const createErrorSpy = vitest_1.vi
|
|
124
|
-
.spyOn(errorsModule, "createErrorFromResponse")
|
|
125
|
-
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
126
|
-
const fetchMock = vitest_1.vi
|
|
127
|
-
.fn()
|
|
128
|
-
.mockResolvedValueOnce(failureResponse)
|
|
129
|
-
.mockResolvedValueOnce(successResponse);
|
|
130
|
-
globalThis.fetch = fetchMock;
|
|
131
|
-
const result = await client.request("/api/retry", { method: "GET" });
|
|
132
|
-
(0, vitest_1.expect)(result).toEqual({ ok: true });
|
|
133
|
-
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(2);
|
|
134
|
-
createErrorSpy.mockRestore();
|
|
135
|
-
});
|
|
136
|
-
(0, vitest_1.it)("throws a TIMEOUT SDK error when fetch aborts", async () => {
|
|
137
|
-
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
138
|
-
const abortError = Object.assign(new Error("aborted"), { name: "AbortError" });
|
|
139
|
-
const fetchMock = vitest_1.vi.fn().mockRejectedValue(abortError);
|
|
140
|
-
globalThis.fetch = fetchMock;
|
|
141
|
-
await (0, vitest_1.expect)(client.request("/api/timeout", { method: "GET" })).rejects.toMatchObject({
|
|
142
|
-
code: "TIMEOUT",
|
|
143
|
-
});
|
|
144
|
-
});
|
|
145
|
-
(0, vitest_1.it)("invalidates related cache entries for mutation requests", async () => {
|
|
146
|
-
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
147
|
-
cache_1.shouldCache.mockReturnValue(false);
|
|
148
|
-
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
149
|
-
ok: true,
|
|
150
|
-
status: 201,
|
|
151
|
-
json: async () => ({ result: "ok" }),
|
|
152
|
-
});
|
|
153
|
-
globalThis.fetch = fetchMock;
|
|
154
|
-
await client.request("/api/evaluations", { method: "POST", body: JSON.stringify({}) });
|
|
155
|
-
(0, vitest_1.expect)(cache_1.cacheTracker.invalidatedPatterns).toContain("evaluations");
|
|
156
|
-
});
|
|
157
|
-
});
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
const vitest_1 = require("vitest");
|
|
37
|
+
const client_1 = require("./client");
|
|
38
|
+
const errorsModule = __importStar(require("./errors"));
|
|
39
|
+
vitest_1.vi.mock("./cache", () => {
|
|
40
|
+
const cacheTracker = { invalidatedPatterns: [] };
|
|
41
|
+
const shouldCache = vitest_1.vi.fn().mockReturnValue(true);
|
|
42
|
+
const getTTL = vitest_1.vi.fn().mockReturnValue(1000);
|
|
43
|
+
const makeKey = (method, url, params) => `${method}:${url}:${JSON.stringify(params ?? null)}`;
|
|
44
|
+
return {
|
|
45
|
+
__esModule: true,
|
|
46
|
+
shouldCache,
|
|
47
|
+
getTTL,
|
|
48
|
+
cacheTracker,
|
|
49
|
+
RequestCache: class RequestCache {
|
|
50
|
+
constructor() {
|
|
51
|
+
this.store = new Map();
|
|
52
|
+
}
|
|
53
|
+
get(method, url, params) {
|
|
54
|
+
const key = makeKey(method, url, params);
|
|
55
|
+
return this.store.get(key) ?? null;
|
|
56
|
+
}
|
|
57
|
+
set(method, url, data, _ttl, params) {
|
|
58
|
+
const key = makeKey(method, url, params);
|
|
59
|
+
this.store.set(key, data);
|
|
60
|
+
}
|
|
61
|
+
invalidatePattern(pattern) {
|
|
62
|
+
cacheTracker.invalidatedPatterns.push(pattern);
|
|
63
|
+
}
|
|
64
|
+
invalidate(_method, _url, _params) {
|
|
65
|
+
// no-op for tests
|
|
66
|
+
}
|
|
67
|
+
clear() {
|
|
68
|
+
this.store.clear();
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
};
|
|
72
|
+
});
|
|
73
|
+
const cache_1 = require("./cache");
|
|
74
|
+
(0, vitest_1.describe)("AIEvalClient.request", () => {
|
|
75
|
+
(0, vitest_1.beforeEach)(() => {
|
|
76
|
+
process.env.EVALAI_API_KEY = "test";
|
|
77
|
+
cache_1.shouldCache.mockReset().mockReturnValue(true);
|
|
78
|
+
cache_1.getTTL.mockReset().mockReturnValue(1000);
|
|
79
|
+
cache_1.cacheTracker.invalidatedPatterns.length = 0;
|
|
80
|
+
});
|
|
81
|
+
(0, vitest_1.it)("caches GET responses and reuses data without re-fetching", async () => {
|
|
82
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
83
|
+
const payload = { items: [1, 2, 3] };
|
|
84
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
85
|
+
ok: true,
|
|
86
|
+
status: 200,
|
|
87
|
+
json: async () => payload,
|
|
88
|
+
});
|
|
89
|
+
globalThis.fetch = fetchMock;
|
|
90
|
+
const first = await client.request("/api/traces", { method: "GET" });
|
|
91
|
+
const second = await client.request("/api/traces", { method: "GET" });
|
|
92
|
+
(0, vitest_1.expect)(first).toEqual(payload);
|
|
93
|
+
(0, vitest_1.expect)(second).toEqual(payload);
|
|
94
|
+
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(1);
|
|
95
|
+
});
|
|
96
|
+
(0, vitest_1.it)("propagates non-ok responses as SDK errors", async () => {
|
|
97
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost" });
|
|
98
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
99
|
+
ok: false,
|
|
100
|
+
status: 429,
|
|
101
|
+
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
102
|
+
});
|
|
103
|
+
globalThis.fetch = fetchMock;
|
|
104
|
+
const createErrorSpy = vitest_1.vi
|
|
105
|
+
.spyOn(errorsModule, "createErrorFromResponse")
|
|
106
|
+
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
107
|
+
await (0, vitest_1.expect)(client.request("/api/fail", { method: "GET" })).rejects.toHaveProperty("code", "RATE_LIMIT_EXCEEDED");
|
|
108
|
+
createErrorSpy.mockRestore();
|
|
109
|
+
});
|
|
110
|
+
(0, vitest_1.it)("retries on retryable SDK errors and eventually succeeds", async () => {
|
|
111
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
112
|
+
vitest_1.vi.spyOn(client, "calculateBackoff").mockReturnValue(0);
|
|
113
|
+
const failureResponse = {
|
|
114
|
+
ok: false,
|
|
115
|
+
status: 429,
|
|
116
|
+
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
117
|
+
};
|
|
118
|
+
const successResponse = {
|
|
119
|
+
ok: true,
|
|
120
|
+
status: 200,
|
|
121
|
+
json: async () => ({ ok: true }),
|
|
122
|
+
};
|
|
123
|
+
const createErrorSpy = vitest_1.vi
|
|
124
|
+
.spyOn(errorsModule, "createErrorFromResponse")
|
|
125
|
+
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
126
|
+
const fetchMock = vitest_1.vi
|
|
127
|
+
.fn()
|
|
128
|
+
.mockResolvedValueOnce(failureResponse)
|
|
129
|
+
.mockResolvedValueOnce(successResponse);
|
|
130
|
+
globalThis.fetch = fetchMock;
|
|
131
|
+
const result = await client.request("/api/retry", { method: "GET" });
|
|
132
|
+
(0, vitest_1.expect)(result).toEqual({ ok: true });
|
|
133
|
+
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(2);
|
|
134
|
+
createErrorSpy.mockRestore();
|
|
135
|
+
});
|
|
136
|
+
(0, vitest_1.it)("throws a TIMEOUT SDK error when fetch aborts", async () => {
|
|
137
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
138
|
+
const abortError = Object.assign(new Error("aborted"), { name: "AbortError" });
|
|
139
|
+
const fetchMock = vitest_1.vi.fn().mockRejectedValue(abortError);
|
|
140
|
+
globalThis.fetch = fetchMock;
|
|
141
|
+
await (0, vitest_1.expect)(client.request("/api/timeout", { method: "GET" })).rejects.toMatchObject({
|
|
142
|
+
code: "TIMEOUT",
|
|
143
|
+
});
|
|
144
|
+
});
|
|
145
|
+
(0, vitest_1.it)("invalidates related cache entries for mutation requests", async () => {
|
|
146
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
147
|
+
cache_1.shouldCache.mockReturnValue(false);
|
|
148
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
149
|
+
ok: true,
|
|
150
|
+
status: 201,
|
|
151
|
+
json: async () => ({ result: "ok" }),
|
|
152
|
+
});
|
|
153
|
+
globalThis.fetch = fetchMock;
|
|
154
|
+
await client.request("/api/evaluations", { method: "POST", body: JSON.stringify({}) });
|
|
155
|
+
(0, vitest_1.expect)(cache_1.cacheTracker.invalidatedPatterns).toContain("evaluations");
|
|
156
|
+
});
|
|
157
|
+
});
|
package/dist/index.d.ts
CHANGED
|
@@ -33,5 +33,6 @@ export { batchProcess, batchRead, RateLimiter, streamEvaluation } from "./stream
|
|
|
33
33
|
export type { Annotation, AnnotationItem, AnnotationTask, APIKey, APIKeyUsage, APIKeyWithSecret, BatchOptions, ClientConfig as AIEvalConfig, CreateAnnotationItemParams, CreateAnnotationParams, CreateAnnotationTaskParams, CreateAPIKeyParams, CreateLLMJudgeConfigParams, CreateWebhookParams, Evaluation as EvaluationData, ExportOptions, GenericMetadata as AnnotationData, GetLLMJudgeAlignmentParams, GetUsageParams, ImportOptions, ListAnnotationItemsParams, ListAnnotationsParams, ListAnnotationTasksParams, ListAPIKeysParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, ListWebhookDeliveriesParams, ListWebhooksParams, LLMJudgeAlignment, LLMJudgeConfig, LLMJudgeResult as LLMJudgeData, Organization, RetryConfig, SnapshotData, Span as SpanData, StreamOptions, TestCase, TestResult, Trace as TraceData, TracedResponse, UpdateAPIKeyParams, UpdateWebhookParams, UsageStats, UsageSummary, Webhook, WebhookDelivery, } from "./types";
|
|
34
34
|
export { EvaluationTemplates, type EvaluationTemplateType, type FeatureUsage, type OrganizationLimits, } from "./types";
|
|
35
35
|
export { type AgentHandoff, type AgentSpanContext, type CostCategory, type CostRecord, createWorkflowTracer, type DecisionAlternative, type DecisionType, type HandoffType, type LLMProvider, type RecordCostParams, type RecordDecisionParams, traceAutoGen, traceCrewAI, traceLangChainAgent, traceWorkflowStep, type WorkflowContext, type WorkflowDefinition, type WorkflowEdge, type WorkflowNode, type WorkflowStatus, WorkflowTracer, type WorkflowTracerOptions, } from "./workflows";
|
|
36
|
+
export { ARTIFACTS, type Baseline, type BaselineTolerance, GATE_CATEGORY, GATE_EXIT, type GateCategory, type GateExitCode, type RegressionDelta, type RegressionReport, REPORT_SCHEMA_VERSION, } from "./regression";
|
|
36
37
|
import { AIEvalClient } from "./client";
|
|
37
38
|
export default AIEvalClient;
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
11
|
exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalAIError = exports.AIEvalClient = void 0;
|
|
12
|
-
exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = void 0;
|
|
12
|
+
exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = void 0;
|
|
13
13
|
// Main SDK exports
|
|
14
14
|
var client_1 = require("./client");
|
|
15
15
|
Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
|
|
@@ -117,6 +117,12 @@ Object.defineProperty(exports, "traceCrewAI", { enumerable: true, get: function
|
|
|
117
117
|
Object.defineProperty(exports, "traceLangChainAgent", { enumerable: true, get: function () { return workflows_1.traceLangChainAgent; } });
|
|
118
118
|
Object.defineProperty(exports, "traceWorkflowStep", { enumerable: true, get: function () { return workflows_1.traceWorkflowStep; } });
|
|
119
119
|
Object.defineProperty(exports, "WorkflowTracer", { enumerable: true, get: function () { return workflows_1.WorkflowTracer; } });
|
|
120
|
+
// Regression gate constants & types (v1.6.0)
|
|
121
|
+
var regression_1 = require("./regression");
|
|
122
|
+
Object.defineProperty(exports, "ARTIFACTS", { enumerable: true, get: function () { return regression_1.ARTIFACTS; } });
|
|
123
|
+
Object.defineProperty(exports, "GATE_CATEGORY", { enumerable: true, get: function () { return regression_1.GATE_CATEGORY; } });
|
|
124
|
+
Object.defineProperty(exports, "GATE_EXIT", { enumerable: true, get: function () { return regression_1.GATE_EXIT; } });
|
|
125
|
+
Object.defineProperty(exports, "REPORT_SCHEMA_VERSION", { enumerable: true, get: function () { return regression_1.REPORT_SCHEMA_VERSION; } });
|
|
120
126
|
// Default export for convenience
|
|
121
127
|
const client_2 = require("./client");
|
|
122
128
|
exports.default = client_2.AIEvalClient;
|
package/dist/regression.d.ts
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Regression gate constants and types.
|
|
3
|
+
*
|
|
4
|
+
* These mirror the contracts defined in scripts/regression-gate.ts
|
|
5
|
+
* and evals/schemas/regression-report.schema.json so that SDK consumers
|
|
6
|
+
* can programmatically inspect gate results without parsing strings.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
/** Exit codes emitted by `evalai gate` / `scripts/regression-gate.ts`. */
|
|
11
|
+
export declare const GATE_EXIT: {
|
|
12
|
+
/** Gate passed — no regressions detected */
|
|
13
|
+
readonly PASS: 0;
|
|
14
|
+
/** One or more regression thresholds exceeded */
|
|
15
|
+
readonly REGRESSION: 1;
|
|
16
|
+
/** Infrastructure error (baseline missing, summary missing, etc.) */
|
|
17
|
+
readonly INFRA_ERROR: 2;
|
|
18
|
+
/** Confidence tests failed (test suite red) */
|
|
19
|
+
readonly CONFIDENCE_FAILED: 3;
|
|
20
|
+
/** Confidence summary file missing (test infra crashed) */
|
|
21
|
+
readonly CONFIDENCE_MISSING: 4;
|
|
22
|
+
};
|
|
23
|
+
export type GateExitCode = (typeof GATE_EXIT)[keyof typeof GATE_EXIT];
|
|
24
|
+
/** Categories written to regression-report.json `category` field. */
|
|
25
|
+
export declare const GATE_CATEGORY: {
|
|
26
|
+
readonly PASS: "pass";
|
|
27
|
+
readonly REGRESSION: "regression";
|
|
28
|
+
readonly INFRA_ERROR: "infra_error";
|
|
29
|
+
};
|
|
30
|
+
export type GateCategory = (typeof GATE_CATEGORY)[keyof typeof GATE_CATEGORY];
|
|
31
|
+
/** Current schema version for regression-report.json. */
|
|
32
|
+
export declare const REPORT_SCHEMA_VERSION = 1;
|
|
33
|
+
export interface RegressionDelta {
|
|
34
|
+
metric: string;
|
|
35
|
+
baseline: number | string;
|
|
36
|
+
current: number | string;
|
|
37
|
+
delta: string;
|
|
38
|
+
status: "pass" | "fail";
|
|
39
|
+
}
|
|
40
|
+
export interface RegressionReport {
|
|
41
|
+
schemaVersion: number;
|
|
42
|
+
timestamp: string;
|
|
43
|
+
exitCode: GateExitCode;
|
|
44
|
+
category: GateCategory;
|
|
45
|
+
passed: boolean;
|
|
46
|
+
failures: string[];
|
|
47
|
+
deltas: RegressionDelta[];
|
|
48
|
+
}
|
|
49
|
+
export interface BaselineTolerance {
|
|
50
|
+
scoreDrop: number;
|
|
51
|
+
passRateDrop: number;
|
|
52
|
+
maxLatencyIncreaseMs: number;
|
|
53
|
+
maxCostIncreaseUsd: number;
|
|
54
|
+
}
|
|
55
|
+
export interface Baseline {
|
|
56
|
+
schemaVersion: number;
|
|
57
|
+
description: string;
|
|
58
|
+
generatedAt: string;
|
|
59
|
+
generatedBy: string;
|
|
60
|
+
commitSha: string;
|
|
61
|
+
updatedAt: string;
|
|
62
|
+
updatedBy: string;
|
|
63
|
+
tolerance: BaselineTolerance;
|
|
64
|
+
goldenEval: {
|
|
65
|
+
score: number;
|
|
66
|
+
passRate: number;
|
|
67
|
+
totalCases: number;
|
|
68
|
+
passedCases: number;
|
|
69
|
+
};
|
|
70
|
+
qualityScore: {
|
|
71
|
+
overall: number;
|
|
72
|
+
grade: string;
|
|
73
|
+
accuracy: number;
|
|
74
|
+
safety: number;
|
|
75
|
+
latency: number;
|
|
76
|
+
cost: number;
|
|
77
|
+
consistency: number;
|
|
78
|
+
};
|
|
79
|
+
confidenceTests: {
|
|
80
|
+
unitPassed: boolean;
|
|
81
|
+
unitTotal: number;
|
|
82
|
+
dbPassed: boolean;
|
|
83
|
+
dbTotal: number;
|
|
84
|
+
};
|
|
85
|
+
productMetrics: {
|
|
86
|
+
p95ApiLatencyMs?: number;
|
|
87
|
+
goldenCostUsd?: number;
|
|
88
|
+
};
|
|
89
|
+
qualityMetrics?: {
|
|
90
|
+
unitLaneDurationMs?: number;
|
|
91
|
+
dbLaneDurationMs?: number;
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
/** Well-known artifact paths relative to project root. */
|
|
95
|
+
export declare const ARTIFACTS: {
|
|
96
|
+
readonly BASELINE: "evals/baseline.json";
|
|
97
|
+
readonly REGRESSION_REPORT: "evals/regression-report.json";
|
|
98
|
+
readonly CONFIDENCE_SUMMARY: "evals/confidence-summary.json";
|
|
99
|
+
readonly LATENCY_BENCHMARK: "evals/latency-benchmark.json";
|
|
100
|
+
};
|
package/dist/regression.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Regression gate constants and types.
|
|
4
|
+
*
|
|
5
|
+
* These mirror the contracts defined in scripts/regression-gate.ts
|
|
6
|
+
* and evals/schemas/regression-report.schema.json so that SDK consumers
|
|
7
|
+
* can programmatically inspect gate results without parsing strings.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.ARTIFACTS = exports.REPORT_SCHEMA_VERSION = exports.GATE_CATEGORY = exports.GATE_EXIT = void 0;
|
|
13
|
+
// ── Exit codes ──
|
|
14
|
+
/** Exit codes emitted by `evalai gate` / `scripts/regression-gate.ts`. */
|
|
15
|
+
exports.GATE_EXIT = {
|
|
16
|
+
/** Gate passed — no regressions detected */
|
|
17
|
+
PASS: 0,
|
|
18
|
+
/** One or more regression thresholds exceeded */
|
|
19
|
+
REGRESSION: 1,
|
|
20
|
+
/** Infrastructure error (baseline missing, summary missing, etc.) */
|
|
21
|
+
INFRA_ERROR: 2,
|
|
22
|
+
/** Confidence tests failed (test suite red) */
|
|
23
|
+
CONFIDENCE_FAILED: 3,
|
|
24
|
+
/** Confidence summary file missing (test infra crashed) */
|
|
25
|
+
CONFIDENCE_MISSING: 4,
|
|
26
|
+
};
|
|
27
|
+
// ── Report categories ──
|
|
28
|
+
/** Categories written to regression-report.json `category` field. */
|
|
29
|
+
exports.GATE_CATEGORY = {
|
|
30
|
+
PASS: "pass",
|
|
31
|
+
REGRESSION: "regression",
|
|
32
|
+
INFRA_ERROR: "infra_error",
|
|
33
|
+
};
|
|
34
|
+
// ── Schema version ──
|
|
35
|
+
/** Current schema version for regression-report.json. */
|
|
36
|
+
exports.REPORT_SCHEMA_VERSION = 1;
|
|
37
|
+
// ── Artifact paths ──
|
|
38
|
+
/** Well-known artifact paths relative to project root. */
|
|
39
|
+
exports.ARTIFACTS = {
|
|
40
|
+
BASELINE: "evals/baseline.json",
|
|
41
|
+
REGRESSION_REPORT: "evals/regression-report.json",
|
|
42
|
+
CONFIDENCE_SUMMARY: "evals/confidence-summary.json",
|
|
43
|
+
LATENCY_BENCHMARK: "evals/latency-benchmark.json",
|
|
44
|
+
};
|
package/dist/version.d.ts
CHANGED
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalAI-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalAI-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "1.5.8";
|
|
9
|
+
exports.SDK_VERSION = "1.6.0";
|
|
10
10
|
exports.SPEC_VERSION = "1.0.0";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pauly4010/evalai-sdk",
|
|
3
|
-
"version": "1.5.8",
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "AI Evaluation Platform SDK - Complete API Coverage with Performance Optimizations",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.js",
|
|
@@ -102,6 +102,11 @@
|
|
|
102
102
|
"import": "./dist/matchers/index.js",
|
|
103
103
|
"require": "./dist/matchers/index.js",
|
|
104
104
|
"types": "./dist/matchers/index.d.ts"
|
|
105
|
+
},
|
|
106
|
+
"./regression": {
|
|
107
|
+
"import": "./dist/regression.js",
|
|
108
|
+
"require": "./dist/regression.js",
|
|
109
|
+
"types": "./dist/regression.d.ts"
|
|
105
110
|
}
|
|
106
111
|
}
|
|
107
112
|
}
|