@evalgate/sdk 2.1.2 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +27 -0
- package/dist/cli/discover.js +42 -23
- package/dist/cli/doctor.js +1 -1
- package/dist/cli/explain.js +1 -0
- package/dist/cli/regression-gate.js +23 -10
- package/dist/cli/run.js +87 -57
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,18 @@ All notable changes to the @evalgate/sdk package will be documented in this file
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.1.3] - 2026-03-02
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- **Critical:** Multi-`defineEval` calls per file — only the first was discovered (silent data loss)
|
|
13
|
+
- **High:** First-run gate false regression on fresh init when no test script exists
|
|
14
|
+
- **High:** Doctor defaulted baseUrl to localhost:3000 instead of the production API
|
|
15
|
+
- **Critical:** Simulated executeSpec replaced with real spec execution
|
|
16
|
+
- **High:** Run scores now include scoring model context for clarity
|
|
17
|
+
- **Low:** Explain no longer shows "unnamed" for builtin gate failures
|
|
18
|
+
- **Docs:** Added missing `discover --manifest` step to local quickstart
|
|
19
|
+
|
|
8
20
|
## [2.1.2] - 2026-03-02
|
|
9
21
|
|
|
10
22
|
### Fixed
|
package/README.md
CHANGED
|
@@ -254,6 +254,33 @@ All commands automatically write artifacts so `explain` works with zero flags.
|
|
|
254
254
|
npm install @evalgate/sdk openai
|
|
255
255
|
```
|
|
256
256
|
|
|
257
|
+
Create `eval/your-spec.spec.ts`:
|
|
258
|
+
|
|
259
|
+
```typescript
|
|
260
|
+
import { defineEval } from "@evalgate/sdk";
|
|
261
|
+
|
|
262
|
+
defineEval({
|
|
263
|
+
name: "Basic Math Operations",
|
|
264
|
+
description: "Test fundamental arithmetic",
|
|
265
|
+
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
266
|
+
expected: "All tests should pass",
|
|
267
|
+
tags: ["basic", "math"],
|
|
268
|
+
category: "unit-test"
|
|
269
|
+
});
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
# Discover specs, then generate the manifest
|
|
274
|
+
npx @evalgate/sdk discover
|
|
275
|
+
npx @evalgate/sdk discover --manifest
|
|
276
|
+
|
|
277
|
+
# Run evaluations
|
|
278
|
+
npx @evalgate/sdk run --write-results
|
|
279
|
+
|
|
280
|
+
# Run local regression gate
|
|
281
|
+
npx @evalgate/sdk gate
|
|
282
|
+
```
|
|
283
|
+
|
|
257
284
|
```typescript
|
|
258
285
|
import { openAIChatEval } from "@evalgate/sdk";
|
|
259
286
|
|
package/dist/cli/discover.js
CHANGED
|
@@ -145,8 +145,8 @@ async function analyzeSpecifications(specFiles) {
|
|
|
145
145
|
for (const filePath of specFiles) {
|
|
146
146
|
try {
|
|
147
147
|
const content = await fs.readFile(filePath, "utf-8");
|
|
148
|
-
const
|
|
149
|
-
specs.push(
|
|
148
|
+
const fileSpecs = analyzeSpecFile(filePath, content);
|
|
149
|
+
specs.push(...fileSpecs);
|
|
150
150
|
}
|
|
151
151
|
catch (error) {
|
|
152
152
|
console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
@@ -155,20 +155,40 @@ async function analyzeSpecifications(specFiles) {
|
|
|
155
155
|
return specs;
|
|
156
156
|
}
|
|
157
157
|
/**
|
|
158
|
-
*
|
|
158
|
+
* Extract all spec names from file content (handles both call forms)
|
|
159
|
+
*/
|
|
160
|
+
function extractSpecNames(content) {
|
|
161
|
+
const names = [];
|
|
162
|
+
// Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
|
|
163
|
+
const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
|
|
164
|
+
let m = stringArgPattern.exec(content);
|
|
165
|
+
while (m !== null) {
|
|
166
|
+
names.push(m[1]);
|
|
167
|
+
m = stringArgPattern.exec(content);
|
|
168
|
+
}
|
|
169
|
+
if (names.length > 0)
|
|
170
|
+
return names;
|
|
171
|
+
// Form 2: defineEval({ name: "..." }) — object-first form
|
|
172
|
+
const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
|
|
173
|
+
m = objNamePattern.exec(content);
|
|
174
|
+
while (m !== null) {
|
|
175
|
+
names.push(m[1]);
|
|
176
|
+
m = objNamePattern.exec(content);
|
|
177
|
+
}
|
|
178
|
+
return names;
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* Analyze a single specification file — returns one SpecAnalysis per defineEval call
|
|
159
182
|
*/
|
|
160
183
|
function analyzeSpecFile(filePath, content) {
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
// Extract tags
|
|
184
|
+
const specNames = extractSpecNames(content);
|
|
185
|
+
// Fallback: file matched as a spec file but we couldn't parse names
|
|
186
|
+
if (specNames.length === 0) {
|
|
187
|
+
specNames.push(path.basename(filePath, path.extname(filePath)));
|
|
188
|
+
}
|
|
189
|
+
// Shared analysis for the file
|
|
168
190
|
const tags = extractTags(content);
|
|
169
|
-
// Analyze complexity
|
|
170
191
|
const complexity = analyzeComplexity(content);
|
|
171
|
-
// Check for models and tools
|
|
172
192
|
const usesModels = content.includes("model:") ||
|
|
173
193
|
content.includes("model=") ||
|
|
174
194
|
content.includes("openai") ||
|
|
@@ -176,22 +196,20 @@ function analyzeSpecFile(filePath, content) {
|
|
|
176
196
|
const usesTools = content.includes("tool:") ||
|
|
177
197
|
content.includes("function.") ||
|
|
178
198
|
content.includes("call(");
|
|
179
|
-
// Check for assertions
|
|
180
199
|
const hasAssertions = content.includes("assert") ||
|
|
181
200
|
content.includes("expect") ||
|
|
182
201
|
content.includes("should");
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
file: path.relative(process.cwd(), filePath),
|
|
202
|
+
const relFile = path.relative(process.cwd(), filePath);
|
|
203
|
+
return specNames.map((name, idx) => ({
|
|
204
|
+
id: generateSpecId(filePath, name, idx),
|
|
205
|
+
name,
|
|
206
|
+
file: relFile,
|
|
189
207
|
tags,
|
|
190
208
|
hasAssertions,
|
|
191
209
|
usesModels,
|
|
192
210
|
usesTools,
|
|
193
211
|
complexity,
|
|
194
|
-
};
|
|
212
|
+
}));
|
|
195
213
|
}
|
|
196
214
|
/**
|
|
197
215
|
* Extract tags from specification content
|
|
@@ -263,11 +281,12 @@ function analyzeComplexity(content) {
|
|
|
263
281
|
return "complex";
|
|
264
282
|
}
|
|
265
283
|
/**
|
|
266
|
-
* Generate specification ID from file path
|
|
284
|
+
* Generate specification ID from file path + name + index (unique per defineEval call)
|
|
267
285
|
*/
|
|
268
|
-
function generateSpecId(filePath) {
|
|
286
|
+
function generateSpecId(filePath, name, index) {
|
|
269
287
|
const relativePath = path.relative(process.cwd(), filePath);
|
|
270
|
-
const
|
|
288
|
+
const key = `${relativePath}:${name}:${index}`;
|
|
289
|
+
const hash = Buffer.from(key)
|
|
271
290
|
.toString("base64")
|
|
272
291
|
.replace(/[+/=]/g, "")
|
|
273
292
|
.slice(0, 8);
|
package/dist/cli/doctor.js
CHANGED
|
@@ -96,7 +96,7 @@ function parseFlags(argv) {
|
|
|
96
96
|
const baseUrl = raw.baseUrl ||
|
|
97
97
|
process.env.EVALGATE_BASE_URL ||
|
|
98
98
|
process.env.EVALAI_BASE_URL ||
|
|
99
|
-
"
|
|
99
|
+
"https://api.evalgate.com";
|
|
100
100
|
const apiKey = raw.apiKey ||
|
|
101
101
|
process.env.EVALGATE_API_KEY ||
|
|
102
102
|
process.env.EVALAI_API_KEY ||
|
package/dist/cli/explain.js
CHANGED
|
@@ -430,6 +430,7 @@ function buildFromBuiltinReport(report, reportPath) {
|
|
|
430
430
|
}));
|
|
431
431
|
const topFailures = failures.slice(0, 3).map((f, i) => ({
|
|
432
432
|
rank: i + 1,
|
|
433
|
+
name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
|
|
433
434
|
reason: f,
|
|
434
435
|
}));
|
|
435
436
|
// Simple root cause for builtin reports
|
|
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
|
|
|
94
94
|
}
|
|
95
95
|
return "unknown";
|
|
96
96
|
}
|
|
97
|
+
function hasTestScript(cwd) {
|
|
98
|
+
try {
|
|
99
|
+
const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
|
|
100
|
+
const script = pkg.scripts?.test ?? "";
|
|
101
|
+
return !!script && script !== 'echo "Error: no test specified" && exit 1';
|
|
102
|
+
}
|
|
103
|
+
catch {
|
|
104
|
+
return false;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
97
107
|
function runBuiltinGate(cwd) {
|
|
98
108
|
const t0 = Date.now();
|
|
99
109
|
const baselinePath = path.join(cwd, BASELINE_REL);
|
|
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
|
|
|
101
111
|
const pm = detectPackageManager(cwd);
|
|
102
112
|
const command = `${pm} test`;
|
|
103
113
|
const runner = detectRunner(cwd);
|
|
114
|
+
const projectHasTestScript = hasTestScript(cwd);
|
|
104
115
|
// Load baseline
|
|
105
116
|
if (!fs.existsSync(baselinePath)) {
|
|
106
117
|
return {
|
|
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
|
|
|
165
176
|
const baselineTotal = baselineData.confidenceTests?.total ?? 0;
|
|
166
177
|
const failures = [];
|
|
167
178
|
const deltas = [];
|
|
168
|
-
// Delta: tests passing
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
179
|
+
// Delta: tests passing — only meaningful when a test script exists
|
|
180
|
+
if (projectHasTestScript) {
|
|
181
|
+
deltas.push({
|
|
182
|
+
metric: "tests_passing",
|
|
183
|
+
baseline: baselinePassed,
|
|
184
|
+
current: testsPassed,
|
|
185
|
+
delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
|
|
186
|
+
status: testsPassed ? "pass" : "fail",
|
|
187
|
+
});
|
|
188
|
+
if (!testsPassed && baselinePassed) {
|
|
189
|
+
failures.push("Tests were passing in baseline but are now failing");
|
|
190
|
+
}
|
|
178
191
|
}
|
|
179
192
|
// Delta: test count (only if we captured counts)
|
|
180
193
|
if (testCount > 0 || baselineTotal > 0) {
|
package/dist/cli/run.js
CHANGED
|
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
|
|
|
52
52
|
const node_child_process_1 = require("node:child_process");
|
|
53
53
|
const fs = __importStar(require("node:fs/promises"));
|
|
54
54
|
const path = __importStar(require("node:path"));
|
|
55
|
+
const registry_1 = require("../runtime/registry");
|
|
55
56
|
const impact_analysis_1 = require("./impact-analysis");
|
|
56
57
|
/**
|
|
57
58
|
* Generate deterministic run ID
|
|
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
|
|
|
138
139
|
}
|
|
139
140
|
}
|
|
140
141
|
/**
|
|
141
|
-
* Execute specifications
|
|
142
|
+
* Execute specifications — grouped by file to avoid redundant loads
|
|
142
143
|
*/
|
|
143
144
|
async function executeSpecs(specs) {
|
|
144
|
-
|
|
145
|
+
// Group specs by their absolute file path
|
|
146
|
+
const specsByFile = new Map();
|
|
145
147
|
for (const spec of specs) {
|
|
146
|
-
const
|
|
147
|
-
|
|
148
|
+
const abs = path.isAbsolute(spec.filePath)
|
|
149
|
+
? spec.filePath
|
|
150
|
+
: path.join(process.cwd(), spec.filePath);
|
|
151
|
+
const group = specsByFile.get(abs) ?? [];
|
|
152
|
+
group.push(spec);
|
|
153
|
+
specsByFile.set(abs, group);
|
|
148
154
|
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
try {
|
|
157
|
-
// For now, simulate execution
|
|
158
|
-
// In a real implementation, this would:
|
|
159
|
-
// 1. Load the spec file
|
|
160
|
-
// 2. Execute the defineEval function
|
|
161
|
-
// 3. Capture the result
|
|
162
|
-
// Simulate some work
|
|
163
|
-
await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
|
|
164
|
-
// Simulate success/failure (90% success rate for demo)
|
|
165
|
-
const success = Math.random() > 0.1;
|
|
166
|
-
const duration = Date.now() - startTime;
|
|
167
|
-
if (success) {
|
|
168
|
-
return {
|
|
169
|
-
specId: spec.id,
|
|
170
|
-
name: spec.name,
|
|
171
|
-
filePath: spec.filePath,
|
|
172
|
-
result: {
|
|
173
|
-
status: "passed",
|
|
174
|
-
score: Math.random() * 0.3 + 0.7, // 0.7-1.0
|
|
175
|
-
duration,
|
|
176
|
-
},
|
|
177
|
-
};
|
|
155
|
+
const results = [];
|
|
156
|
+
for (const [absPath, fileSpecs] of specsByFile) {
|
|
157
|
+
// Fresh runtime per file to avoid cross-file contamination
|
|
158
|
+
(0, registry_1.disposeActiveRuntime)();
|
|
159
|
+
try {
|
|
160
|
+
// Bust require cache so the file re-executes its defineEval calls
|
|
161
|
+
delete require.cache[require.resolve(absPath)];
|
|
178
162
|
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
163
|
+
catch {
|
|
164
|
+
// Not in cache yet — fine
|
|
165
|
+
}
|
|
166
|
+
try {
|
|
167
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
168
|
+
require(absPath);
|
|
169
|
+
}
|
|
170
|
+
catch (loadError) {
|
|
171
|
+
const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
|
|
172
|
+
const msg = isTs &&
|
|
173
|
+
loadError instanceof Error &&
|
|
174
|
+
(loadError.message.includes("Unknown file extension") ||
|
|
175
|
+
loadError.message.includes("SyntaxError"))
|
|
176
|
+
? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
|
|
177
|
+
: loadError instanceof Error
|
|
178
|
+
? loadError.message
|
|
179
|
+
: String(loadError);
|
|
180
|
+
for (const spec of fileSpecs) {
|
|
181
|
+
results.push(makeErrorResult(spec, msg, 0));
|
|
182
|
+
}
|
|
183
|
+
continue;
|
|
184
|
+
}
|
|
185
|
+
const runtime = (0, registry_1.getActiveRuntime)();
|
|
186
|
+
const registered = runtime.list();
|
|
187
|
+
for (const spec of fileSpecs) {
|
|
188
|
+
const registeredSpec = registered.find((r) => r.name === spec.name);
|
|
189
|
+
if (!registeredSpec) {
|
|
190
|
+
results.push({
|
|
191
|
+
specId: spec.id,
|
|
192
|
+
name: spec.name,
|
|
193
|
+
filePath: spec.filePath,
|
|
194
|
+
result: {
|
|
195
|
+
status: "skipped",
|
|
196
|
+
error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
|
|
197
|
+
duration: 0,
|
|
198
|
+
},
|
|
199
|
+
});
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
const startTime = Date.now();
|
|
203
|
+
try {
|
|
204
|
+
const evalResult = await registeredSpec.executor({ input: "" });
|
|
205
|
+
results.push({
|
|
206
|
+
specId: spec.id,
|
|
207
|
+
name: spec.name,
|
|
208
|
+
filePath: spec.filePath,
|
|
209
|
+
result: {
|
|
210
|
+
status: evalResult.pass ? "passed" : "failed",
|
|
211
|
+
score: typeof evalResult.score === "number"
|
|
212
|
+
? evalResult.score / 100
|
|
213
|
+
: undefined,
|
|
214
|
+
error: evalResult.error,
|
|
215
|
+
duration: Date.now() - startTime,
|
|
216
|
+
},
|
|
217
|
+
});
|
|
218
|
+
}
|
|
219
|
+
catch (execError) {
|
|
220
|
+
results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
|
|
221
|
+
}
|
|
190
222
|
}
|
|
191
223
|
}
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
},
|
|
202
|
-
};
|
|
203
|
-
}
|
|
224
|
+
return results;
|
|
225
|
+
}
|
|
226
|
+
function makeErrorResult(spec, error, duration) {
|
|
227
|
+
return {
|
|
228
|
+
specId: spec.id,
|
|
229
|
+
name: spec.name,
|
|
230
|
+
filePath: spec.filePath,
|
|
231
|
+
result: { status: "failed", error, duration },
|
|
232
|
+
};
|
|
204
233
|
}
|
|
205
234
|
/**
|
|
206
235
|
* Calculate summary statistics
|
|
@@ -348,7 +377,8 @@ function printHumanResults(result) {
|
|
|
348
377
|
console.log(` ❌ Failed: ${result.summary.failed}`);
|
|
349
378
|
console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
|
|
350
379
|
console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
|
|
351
|
-
|
|
380
|
+
const hasScores = result.results.some((r) => r.result.score !== undefined);
|
|
381
|
+
console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
|
|
352
382
|
for (const spec of result.results) {
|
|
353
383
|
const status = spec.result.status === "passed"
|
|
354
384
|
? "✅"
|
package/dist/version.d.ts
CHANGED
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
* X-EvalGate-SDK-Version: SDK package version
|
|
4
4
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
5
5
|
*/
|
|
6
|
-
export declare const SDK_VERSION = "2.1.
|
|
7
|
-
export declare const SPEC_VERSION = "2.1.
|
|
6
|
+
export declare const SDK_VERSION = "2.1.3";
|
|
7
|
+
export declare const SPEC_VERSION = "2.1.3";
|
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalGate-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "2.1.
|
|
10
|
-
exports.SPEC_VERSION = "2.1.
|
|
9
|
+
exports.SDK_VERSION = "2.1.3";
|
|
10
|
+
exports.SPEC_VERSION = "2.1.3";
|