@evalgate/sdk 2.1.0 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/README.md +47 -20
- package/dist/cli/discover.js +42 -23
- package/dist/cli/doctor.js +1 -1
- package/dist/cli/explain.js +1 -0
- package/dist/cli/regression-gate.js +23 -10
- package/dist/cli/run.js +87 -57
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,31 @@ All notable changes to the @evalgate/sdk package will be documented in this file
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.1.3] - 2026-03-02
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
|
|
13
|
+
- **High:** First-run gate false regression on fresh init when no test script exists
|
|
14
|
+
- **High:** Doctor defaults baseUrl to localhost:3000 instead of production API
|
|
15
|
+
- **Critical:** Simulated executeSpec replaced with real spec execution
|
|
16
|
+
- **High:** Run scores now include scoring model context for clarity
|
|
17
|
+
- **Low:** Explain no longer shows "unnamed" for builtin gate failures
|
|
18
|
+
- **Docs:** Added missing `discover --manifest` step to local quickstart
|
|
19
|
+
|
|
20
|
+
## [2.1.2] - 2026-03-02
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
|
|
24
|
+
- **Type safety** — aligned with platform 2.1.2; zero TypeScript errors across all integration points
|
|
25
|
+
- **CI gate** — all SDK tests, lint, and build checks passing
|
|
26
|
+
|
|
27
|
+
## [2.1.1] - 2026-03-02
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
|
|
31
|
+
- Version alignment with platform 2.1.1
|
|
32
|
+
|
|
8
33
|
## [2.0.0] - 2026-03-01
|
|
9
34
|
|
|
10
35
|
### Breaking — EvalGate Rebrand
|
package/README.md
CHANGED
|
@@ -15,13 +15,13 @@ Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
|
|
|
15
15
|
|
|
16
16
|
## Quick Start (60 seconds)
|
|
17
17
|
|
|
18
|
-
Add this to your `.github/workflows/
|
|
18
|
+
Add this to your `.github/workflows/evalgate.yml`:
|
|
19
19
|
|
|
20
20
|
```yaml
|
|
21
21
|
name: EvalGate CI
|
|
22
22
|
on: [push, pull_request]
|
|
23
23
|
jobs:
|
|
24
|
-
|
|
24
|
+
evalgate:
|
|
25
25
|
runs-on: ubuntu-latest
|
|
26
26
|
steps:
|
|
27
27
|
- uses: actions/checkout@v4
|
|
@@ -31,8 +31,8 @@ jobs:
|
|
|
31
31
|
- uses: actions/upload-artifact@v4
|
|
32
32
|
if: always()
|
|
33
33
|
with:
|
|
34
|
-
name:
|
|
35
|
-
path: .
|
|
34
|
+
name: evalgate-results
|
|
35
|
+
path: .evalgate/
|
|
36
36
|
```
|
|
37
37
|
|
|
38
38
|
Create `eval/your-spec.spec.ts`:
|
|
@@ -51,7 +51,7 @@ defineEval({
|
|
|
51
51
|
```
|
|
52
52
|
|
|
53
53
|
```bash
|
|
54
|
-
git add .github/workflows/
|
|
54
|
+
git add .github/workflows/evalgate.yml eval/
|
|
55
55
|
git commit -m "feat: add EvalGate CI pipeline"
|
|
56
56
|
git push
|
|
57
57
|
```
|
|
@@ -67,7 +67,7 @@ That's it! Your CI now:
|
|
|
67
67
|
|
|
68
68
|
## 🚀 New in v2.0.0: One-Command CI
|
|
69
69
|
|
|
70
|
-
### `
|
|
70
|
+
### `evalgate ci` - Complete CI Pipeline
|
|
71
71
|
|
|
72
72
|
```bash
|
|
73
73
|
npx @evalgate/sdk ci --format github --write-results --base main
|
|
@@ -108,8 +108,8 @@ Every failure prints a clear next step:
|
|
|
108
108
|
|
|
109
109
|
```
|
|
110
110
|
🔧 Next step for debugging:
|
|
111
|
-
Download base artifact and run:
|
|
112
|
-
Artifacts: .
|
|
111
|
+
Download base artifact and run: evalgate diff --base .evalgate/base-run.json --head .evalgate/last-run.json
|
|
112
|
+
Artifacts: .evalgate/runs/
|
|
113
113
|
```
|
|
114
114
|
|
|
115
115
|
---
|
|
@@ -181,23 +181,23 @@ Every failure prints a clear next step:
|
|
|
181
181
|
|
|
182
182
|
| Command | Description |
|
|
183
183
|
|---------|-------------|
|
|
184
|
-
| `npx evalgate migrate config --in
|
|
184
|
+
| `npx evalgate migrate config --in evalgate.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
|
|
185
185
|
|
|
186
186
|
**Guided failure flow:**
|
|
187
187
|
|
|
188
188
|
```
|
|
189
|
-
|
|
189
|
+
evalgate ci → fails → "Next: evalgate explain --report .evalgate/last-run.json"
|
|
190
190
|
↓
|
|
191
|
-
|
|
191
|
+
evalgate explain → root causes + fixes
|
|
192
192
|
```
|
|
193
193
|
|
|
194
194
|
**GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
|
|
195
195
|
|
|
196
|
-

|
|
197
197
|
|
|
198
|
-
**`
|
|
198
|
+
**`evalgate explain` terminal output** — root causes + fix commands:
|
|
199
199
|
|
|
200
|
-

|
|
201
201
|
|
|
202
202
|
All commands automatically write artifacts so `explain` works with zero flags.
|
|
203
203
|
|
|
@@ -254,6 +254,33 @@ All commands automatically write artifacts so `explain` works with zero flags.
|
|
|
254
254
|
npm install @evalgate/sdk openai
|
|
255
255
|
```
|
|
256
256
|
|
|
257
|
+
Create `eval/your-spec.spec.ts`:
|
|
258
|
+
|
|
259
|
+
```typescript
|
|
260
|
+
import { defineEval } from "@evalgate/sdk";
|
|
261
|
+
|
|
262
|
+
defineEval({
|
|
263
|
+
name: "Basic Math Operations",
|
|
264
|
+
description: "Test fundamental arithmetic",
|
|
265
|
+
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
266
|
+
expected: "All tests should pass",
|
|
267
|
+
tags: ["basic", "math"],
|
|
268
|
+
category: "unit-test"
|
|
269
|
+
});
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
# Discover specs and generate manifest
|
|
274
|
+
npx @evalgate/sdk discover
|
|
275
|
+
npx @evalgate/sdk discover --manifest
|
|
276
|
+
|
|
277
|
+
# Run evaluations
|
|
278
|
+
npx @evalgate/sdk run --write-results
|
|
279
|
+
|
|
280
|
+
# Run local regression gate
|
|
281
|
+
npx @evalgate/sdk gate
|
|
282
|
+
```
|
|
283
|
+
|
|
257
284
|
```typescript
|
|
258
285
|
import { openAIChatEval } from "@evalgate/sdk";
|
|
259
286
|
|
|
@@ -324,7 +351,7 @@ import type {
|
|
|
324
351
|
```typescript
|
|
325
352
|
import { AIEvalClient } from "@evalgate/sdk";
|
|
326
353
|
|
|
327
|
-
const client = AIEvalClient.init(); // from
|
|
354
|
+
const client = AIEvalClient.init(); // from EVALGATE_API_KEY env
|
|
328
355
|
// or
|
|
329
356
|
const client = new AIEvalClient({ apiKey: "...", organizationId: 123 });
|
|
330
357
|
```
|
|
@@ -367,7 +394,7 @@ npm install openai
|
|
|
367
394
|
## No Lock-in
|
|
368
395
|
|
|
369
396
|
```bash
|
|
370
|
-
rm
|
|
397
|
+
rm evalgate.config.json
|
|
371
398
|
```
|
|
372
399
|
|
|
373
400
|
Your local `openAIChatEval` runs continue to work. No account cancellation. No data export required.
|
|
@@ -376,17 +403,17 @@ Your local `openAIChatEval` runs continue to work. No account cancellation. No d
|
|
|
376
403
|
|
|
377
404
|
See [CHANGELOG.md](CHANGELOG.md) for the full release history.
|
|
378
405
|
|
|
379
|
-
**v1.8.0** — `
|
|
406
|
+
**v1.8.0** — `evalgate doctor` rewrite (9-check checklist), `evalgate explain` command, guided failure flow, CI template with doctor preflight
|
|
380
407
|
|
|
381
|
-
**v1.7.0** — `
|
|
408
|
+
**v1.7.0** — `evalgate init` scaffolder, `evalgate upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
|
|
382
409
|
|
|
383
|
-
**v1.6.0** — `
|
|
410
|
+
**v1.6.0** — `evalgate gate`, `evalgate baseline`, regression gate constants & types
|
|
384
411
|
|
|
385
412
|
**v1.5.8** — secureRoute fix, test infra fixes, 304 handling fix
|
|
386
413
|
|
|
387
414
|
**v1.5.5** — PASS/WARN/FAIL semantics, flake intelligence, golden regression suite
|
|
388
415
|
|
|
389
|
-
**v1.5.0** — GitHub annotations, `--onFail import`, `
|
|
416
|
+
**v1.5.0** — GitHub annotations, `--onFail import`, `evalgate doctor`
|
|
390
417
|
|
|
391
418
|
## License
|
|
392
419
|
|
package/dist/cli/discover.js
CHANGED
|
@@ -145,8 +145,8 @@ async function analyzeSpecifications(specFiles) {
|
|
|
145
145
|
for (const filePath of specFiles) {
|
|
146
146
|
try {
|
|
147
147
|
const content = await fs.readFile(filePath, "utf-8");
|
|
148
|
-
const
|
|
149
|
-
specs.push(
|
|
148
|
+
const fileSpecs = analyzeSpecFile(filePath, content);
|
|
149
|
+
specs.push(...fileSpecs);
|
|
150
150
|
}
|
|
151
151
|
catch (error) {
|
|
152
152
|
console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
@@ -155,20 +155,40 @@ async function analyzeSpecifications(specFiles) {
|
|
|
155
155
|
return specs;
|
|
156
156
|
}
|
|
157
157
|
/**
|
|
158
|
-
*
|
|
158
|
+
* Extract all spec names from file content (handles both call forms)
|
|
159
|
+
*/
|
|
160
|
+
function extractSpecNames(content) {
    // Purpose: collect the name of every defineEval call in a spec file's text,
    // in source order, one entry per call.
    //
    // @param {string} content - raw text of the spec file
    // @returns {string[]} spec names; empty when no call with a literal name is found
    //
    // Both call forms are matched in a single pass so that a file mixing them
    // does not lose any call:
    //   - string-first: defineEval("name", ...) / defineEval('name', ...) / defineEval(`name`, ...)
    //   - object-first: defineEval({ ..., name: "name", ... })
    //
    // Details of the pattern:
    //   - The closing quote is a backreference to the opening quote, so a name
    //     such as "it's ok" is captured whole instead of being cut at the first
    //     quote character of any kind.
    //   - `\bname` keeps keys such as `filename:` from being read as the name.
    //   - The object-form scan is not allowed to cross into another
    //     `defineEval(` call, so a nameless object call cannot steal the name
    //     of a later call.
    //
    // Known limits (text matching, not parsing): escaped quotes inside a name
    // are not unescaped, and a `name:` key in a nested object inside the
    // defineEval argument is indistinguishable from the top-level one.
    const callPattern = /defineEval\s*\(\s*(?:(["'`])((?:(?!\1)[\s\S])+)\1|\{(?:(?!defineEval\s*\()[\s\S])*?\bname\s*:\s*(["'`])((?:(?!\3)[\s\S])+)\3)/g;
    // Group 2 holds the string-first name, group 4 the object-first name.
    return Array.from(content.matchAll(callPattern), (match) => match[2] ?? match[4]);
}
|
|
180
|
+
/**
|
|
181
|
+
* Analyze a single specification file — returns one SpecAnalysis per defineEval call
|
|
159
182
|
*/
|
|
160
183
|
function analyzeSpecFile(filePath, content) {
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
// Extract tags
|
|
184
|
+
const specNames = extractSpecNames(content);
|
|
185
|
+
// Fallback: file matched as a spec file but we couldn't parse names
|
|
186
|
+
if (specNames.length === 0) {
|
|
187
|
+
specNames.push(path.basename(filePath, path.extname(filePath)));
|
|
188
|
+
}
|
|
189
|
+
// Shared analysis for the file
|
|
168
190
|
const tags = extractTags(content);
|
|
169
|
-
// Analyze complexity
|
|
170
191
|
const complexity = analyzeComplexity(content);
|
|
171
|
-
// Check for models and tools
|
|
172
192
|
const usesModels = content.includes("model:") ||
|
|
173
193
|
content.includes("model=") ||
|
|
174
194
|
content.includes("openai") ||
|
|
@@ -176,22 +196,20 @@ function analyzeSpecFile(filePath, content) {
|
|
|
176
196
|
const usesTools = content.includes("tool:") ||
|
|
177
197
|
content.includes("function.") ||
|
|
178
198
|
content.includes("call(");
|
|
179
|
-
// Check for assertions
|
|
180
199
|
const hasAssertions = content.includes("assert") ||
|
|
181
200
|
content.includes("expect") ||
|
|
182
201
|
content.includes("should");
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
file: path.relative(process.cwd(), filePath),
|
|
202
|
+
const relFile = path.relative(process.cwd(), filePath);
|
|
203
|
+
return specNames.map((name, idx) => ({
|
|
204
|
+
id: generateSpecId(filePath, name, idx),
|
|
205
|
+
name,
|
|
206
|
+
file: relFile,
|
|
189
207
|
tags,
|
|
190
208
|
hasAssertions,
|
|
191
209
|
usesModels,
|
|
192
210
|
usesTools,
|
|
193
211
|
complexity,
|
|
194
|
-
};
|
|
212
|
+
}));
|
|
195
213
|
}
|
|
196
214
|
/**
|
|
197
215
|
* Extract tags from specification content
|
|
@@ -263,11 +281,12 @@ function analyzeComplexity(content) {
|
|
|
263
281
|
return "complex";
|
|
264
282
|
}
|
|
265
283
|
/**
|
|
266
|
-
* Generate specification ID from file path
|
|
284
|
+
* Generate specification ID from file path + name + index (unique per defineEval call)
|
|
267
285
|
*/
|
|
268
|
-
function generateSpecId(filePath) {
|
|
286
|
+
function generateSpecId(filePath, name, index) {
|
|
269
287
|
const relativePath = path.relative(process.cwd(), filePath);
|
|
270
|
-
const
|
|
288
|
+
const key = `${relativePath}:${name}:${index}`;
|
|
289
|
+
const hash = Buffer.from(key)
|
|
271
290
|
.toString("base64")
|
|
272
291
|
.replace(/[+/=]/g, "")
|
|
273
292
|
.slice(0, 8);
|
package/dist/cli/doctor.js
CHANGED
|
@@ -96,7 +96,7 @@ function parseFlags(argv) {
|
|
|
96
96
|
const baseUrl = raw.baseUrl ||
|
|
97
97
|
process.env.EVALGATE_BASE_URL ||
|
|
98
98
|
process.env.EVALAI_BASE_URL ||
|
|
99
|
-
"
|
|
99
|
+
"https://api.evalgate.com";
|
|
100
100
|
const apiKey = raw.apiKey ||
|
|
101
101
|
process.env.EVALGATE_API_KEY ||
|
|
102
102
|
process.env.EVALAI_API_KEY ||
|
package/dist/cli/explain.js
CHANGED
|
@@ -430,6 +430,7 @@ function buildFromBuiltinReport(report, reportPath) {
|
|
|
430
430
|
}));
|
|
431
431
|
const topFailures = failures.slice(0, 3).map((f, i) => ({
|
|
432
432
|
rank: i + 1,
|
|
433
|
+
name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
|
|
433
434
|
reason: f,
|
|
434
435
|
}));
|
|
435
436
|
// Simple root cause for builtin reports
|
|
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
|
|
|
94
94
|
}
|
|
95
95
|
return "unknown";
|
|
96
96
|
}
|
|
97
|
+
function hasTestScript(cwd) {
    // Purpose: report whether the package.json in `cwd` declares a usable
    // `scripts.test` entry.
    //
    // @param {string} cwd - directory expected to contain package.json
    // @returns {boolean} true only when scripts.test is a non-blank string that
    //   is not the placeholder compared against below; false when package.json
    //   is missing, unreadable, not valid JSON, or has no such script.
    const placeholderScript = 'echo "Error: no test specified" && exit 1';
    try {
        const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
        const script = pkg?.scripts?.test;
        // A non-string value (object, number, null) cannot be a runnable script.
        if (typeof script !== "string") {
            return false;
        }
        // Compare on the trimmed text so surrounding whitespace does not turn
        // a blank or placeholder script into a "real" one.
        const trimmed = script.trim();
        return trimmed !== "" && trimmed !== placeholderScript;
    }
    catch {
        // Deliberate best-effort: any read/parse failure means "no test script".
        return false;
    }
}
|
|
97
107
|
function runBuiltinGate(cwd) {
|
|
98
108
|
const t0 = Date.now();
|
|
99
109
|
const baselinePath = path.join(cwd, BASELINE_REL);
|
|
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
|
|
|
101
111
|
const pm = detectPackageManager(cwd);
|
|
102
112
|
const command = `${pm} test`;
|
|
103
113
|
const runner = detectRunner(cwd);
|
|
114
|
+
const projectHasTestScript = hasTestScript(cwd);
|
|
104
115
|
// Load baseline
|
|
105
116
|
if (!fs.existsSync(baselinePath)) {
|
|
106
117
|
return {
|
|
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
|
|
|
165
176
|
const baselineTotal = baselineData.confidenceTests?.total ?? 0;
|
|
166
177
|
const failures = [];
|
|
167
178
|
const deltas = [];
|
|
168
|
-
// Delta: tests passing
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
179
|
+
// Delta: tests passing — only meaningful when a test script exists
|
|
180
|
+
if (projectHasTestScript) {
|
|
181
|
+
deltas.push({
|
|
182
|
+
metric: "tests_passing",
|
|
183
|
+
baseline: baselinePassed,
|
|
184
|
+
current: testsPassed,
|
|
185
|
+
delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
|
|
186
|
+
status: testsPassed ? "pass" : "fail",
|
|
187
|
+
});
|
|
188
|
+
if (!testsPassed && baselinePassed) {
|
|
189
|
+
failures.push("Tests were passing in baseline but are now failing");
|
|
190
|
+
}
|
|
178
191
|
}
|
|
179
192
|
// Delta: test count (only if we captured counts)
|
|
180
193
|
if (testCount > 0 || baselineTotal > 0) {
|
package/dist/cli/run.js
CHANGED
|
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
|
|
|
52
52
|
const node_child_process_1 = require("node:child_process");
|
|
53
53
|
const fs = __importStar(require("node:fs/promises"));
|
|
54
54
|
const path = __importStar(require("node:path"));
|
|
55
|
+
const registry_1 = require("../runtime/registry");
|
|
55
56
|
const impact_analysis_1 = require("./impact-analysis");
|
|
56
57
|
/**
|
|
57
58
|
* Generate deterministic run ID
|
|
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
|
|
|
138
139
|
}
|
|
139
140
|
}
|
|
140
141
|
/**
|
|
141
|
-
* Execute specifications
|
|
142
|
+
* Execute specifications — grouped by file to avoid redundant loads
|
|
142
143
|
*/
|
|
143
144
|
async function executeSpecs(specs) {
    // Purpose: run every spec and return one result record per input spec.
    // Specs are grouped by absolute file path so each file is loaded once.
    //
    // @param {Array<{id, name, filePath}>} specs - specs to run (shape as read below)
    // @returns {Promise<Array>} records of the form
    //   { specId, name, filePath, result: { status, score?, error?, duration } }
    //
    // Per file: the active runtime is disposed, the file is evicted from the
    // require cache and re-required, then each requested spec is matched by
    // name against what the runtime lists and its executor is awaited in turn.
    const specsByFile = new Map();
    for (const spec of specs) {
        const abs = path.isAbsolute(spec.filePath)
            ? spec.filePath
            : path.join(process.cwd(), spec.filePath);
        const group = specsByFile.get(abs) ?? [];
        group.push(spec);
        specsByFile.set(abs, group);
    }
    const results = [];
    for (const [absPath, fileSpecs] of specsByFile) {
        // Fresh runtime per file to avoid cross-file contamination
        (0, registry_1.disposeActiveRuntime)();
        try {
            // Bust require cache so the file re-executes its defineEval calls
            delete require.cache[require.resolve(absPath)];
        }
        catch {
            // require.resolve throws when the path cannot be resolved. Ignore it
            // here: the require() below fails for the same reason and reports it.
        }
        try {
            // eslint-disable-next-line @typescript-eslint/no-require-imports
            require(absPath);
        }
        catch (loadError) {
            const msg = describeSpecLoadError(absPath, loadError);
            for (const spec of fileSpecs) {
                results.push(makeErrorResult(spec, msg, 0));
            }
            continue;
        }
        const runtime = (0, registry_1.getActiveRuntime)();
        // Index registered specs by name once per file. The first registration
        // wins, matching a left-to-right search of the list.
        const registeredByName = new Map();
        for (const registeredSpec of runtime.list()) {
            if (!registeredByName.has(registeredSpec.name)) {
                registeredByName.set(registeredSpec.name, registeredSpec);
            }
        }
        for (const spec of fileSpecs) {
            const registeredSpec = registeredByName.get(spec.name);
            if (!registeredSpec) {
                results.push({
                    specId: spec.id,
                    name: spec.name,
                    filePath: spec.filePath,
                    result: {
                        status: "skipped",
                        error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
                        duration: 0,
                    },
                });
                continue;
            }
            const startTime = Date.now();
            try {
                const evalResult = await registeredSpec.executor({ input: "" });
                results.push({
                    specId: spec.id,
                    name: spec.name,
                    filePath: spec.filePath,
                    result: {
                        status: evalResult.pass ? "passed" : "failed",
                        // The executor's numeric score is divided by 100 before
                        // being stored; a non-numeric score is recorded as undefined.
                        score: typeof evalResult.score === "number"
                            ? evalResult.score / 100
                            : undefined,
                        error: evalResult.error,
                        duration: Date.now() - startTime,
                    },
                });
            }
            catch (execError) {
                results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
            }
        }
    }
    return results;
}
/**
 * Turn a spec-file load failure into the message stored on each affected spec.
 * For .ts/.tsx files that failed with a syntax or unknown-extension error,
 * returns the ts-node hint instead of the raw error message.
 */
function describeSpecLoadError(absPath, loadError) {
    const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
    if (isTs && loadError instanceof Error) {
        // A SyntaxError's message does not contain the word "SyntaxError", so
        // check the error name (and Node's error code) as well as the message.
        const looksUntranspiled = loadError.name === "SyntaxError" ||
            loadError.code === "ERR_UNKNOWN_FILE_EXTENSION" ||
            loadError.message.includes("Unknown file extension") ||
            loadError.message.includes("SyntaxError");
        if (looksUntranspiled) {
            return `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`;
        }
    }
    return loadError instanceof Error ? loadError.message : String(loadError);
}
|
|
226
|
+
function makeErrorResult(spec, error, duration) {
    // Build the result record for a spec that did not complete: identity fields
    // are copied from `spec`, and the outcome is always status "failed" with the
    // given error text and duration.
    const { id: specId, name, filePath } = spec;
    const result = { status: "failed", error, duration };
    return { specId, name, filePath, result };
}
|
|
205
234
|
/**
|
|
206
235
|
* Calculate summary statistics
|
|
@@ -348,7 +377,8 @@ function printHumanResults(result) {
|
|
|
348
377
|
console.log(` ❌ Failed: ${result.summary.failed}`);
|
|
349
378
|
console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
|
|
350
379
|
console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
|
|
351
|
-
|
|
380
|
+
const hasScores = result.results.some((r) => r.result.score !== undefined);
|
|
381
|
+
console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
|
|
352
382
|
for (const spec of result.results) {
|
|
353
383
|
const status = spec.result.status === "passed"
|
|
354
384
|
? "✅"
|
package/dist/version.d.ts
CHANGED
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
* X-EvalGate-SDK-Version: SDK package version
|
|
4
4
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
5
5
|
*/
|
|
6
|
-
export declare const SDK_VERSION = "2.1.
|
|
7
|
-
export declare const SPEC_VERSION = "2.1.
|
|
6
|
+
export declare const SDK_VERSION = "2.1.3";
|
|
7
|
+
export declare const SPEC_VERSION = "2.1.3";
|
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalGate-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "2.1.
|
|
10
|
-
exports.SPEC_VERSION = "2.1.
|
|
9
|
+
exports.SDK_VERSION = "2.1.3";
|
|
10
|
+
exports.SPEC_VERSION = "2.1.3";
|