@evalgate/sdk 2.1.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/README.md +52 -5
- package/dist/assertions.d.ts +21 -0
- package/dist/assertions.js +43 -2
- package/dist/cli/baseline.js +31 -6
- package/dist/cli/discover.js +46 -28
- package/dist/cli/doctor.js +1 -1
- package/dist/cli/explain.js +69 -1
- package/dist/cli/impact-analysis.js +12 -1
- package/dist/cli/print-config.js +1 -1
- package/dist/cli/regression-gate.js +23 -10
- package/dist/cli/run.js +87 -57
- package/dist/index.d.ts +1 -1
- package/dist/index.js +3 -2
- package/dist/runtime/eval.d.ts +12 -3
- package/dist/runtime/eval.js +15 -6
- package/dist/snapshot.d.ts +3 -3
- package/dist/snapshot.js +3 -3
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,43 @@ All notable changes to the @evalgate/sdk package will be documented in this file
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.2.0] - 2026-03-03
|
|
9
|
+
|
|
10
|
+
### Breaking
|
|
11
|
+
|
|
12
|
+
- **`snapshot(output, name)` → `snapshot(name, output)`** — parameter order swapped to match natural call convention (`name` first, value second, same as `test('name', fn)`). Update any existing `snapshot(output, 'label')` calls to `snapshot('label', output)`.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- **`expect().not` modifier** — `expect('drop table').not.toContain('drop table')` now works; negates `passed` on any chained assertion via Proxy
|
|
17
|
+
- **`hasPII(text)`** — semantic inverse of `notContainsPII`; returns `true` when PII is detected (email, phone, SSN, IP). Exported from main package. Eliminates double-negative confusion.
|
|
18
|
+
- **`defineSuite` object form** — now accepts both `defineSuite(name, [...fns])` and `defineSuite({ name, specs: [...fns] })`. README updated with examples.
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
|
|
22
|
+
- **`specId` collision** — all specs in `eval/` directory shared the same 8-char ID (`ZXZhbC9j`). Root cause: the short base64 prefix was identical for any path starting with `eval/c`. Fixed: the ID is now the first 16 hex characters of a SHA-256 hash of the relative path, spec name, and index (in `discover.ts`).
|
|
23
|
+
- **`explain` UNKNOWN verdict** — `evalgate explain` showed `Verdict: UNKNOWN` when reading `.evalgate/last-run.json`. Added `RunResult` format detection (`results[]` + `summary`). Added `.evalgate/last-run.json` and `.evalgate/runs/latest.json` to auto-search paths. Passing runs now show clean `✅ PASS` with no spurious "Run doctor" suggestions.
|
|
24
|
+
- **`print-config` baseUrl default** — was `http://localhost:3000`; now `https://api.evalgate.com` to match `evalgate doctor`.
|
|
25
|
+
- **`baseline update` self-contained** — no longer requires a custom `eval:baseline-update` npm script. Falls back to built-in mode (runs the detected package manager's `test` script, e.g. `npm test`, then stamps the baseline) if no such script is present.
|
|
26
|
+
- **`notContainsPII` phone regex** — broadened to cover `555-123-4567`, `555.123.4567`, and `555 123 4567` formats. JSDoc clarified: `false` = PII found (unsafe), `true` = no PII (safe).
|
|
27
|
+
- **`impact-analysis` git error** — replaced raw `git diff --help` wall-of-text with clean targeted messages: `Not a git repository`, `Base branch 'X' not found. Fetch it first`, or generic exit-code message.
|
|
28
|
+
- **README quickstart** — both `defineEval` examples now include an `executor` function. Running the quickstart no longer throws `Executor must be a function`.
|
|
29
|
+
- **`snapshot` module docstring** — updated `@example` to reflect new `(name, output)` parameter order.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## [2.1.3] - 2026-03-02
|
|
34
|
+
|
|
35
|
+
### Fixed
|
|
36
|
+
|
|
37
|
+
- **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
|
|
38
|
+
- **High:** First-run gate false regression on fresh init when no test script exists
|
|
39
|
+
- **High:** Doctor defaults baseUrl to localhost:3000 instead of production API
|
|
40
|
+
- **Critical:** Simulated executeSpec replaced with real spec execution
|
|
41
|
+
- **High:** Run scores now include scoring model context for clarity
|
|
42
|
+
- **Low:** Explain no longer shows "unnamed" for builtin gate failures
|
|
43
|
+
- **Docs:** Added missing `discover --manifest` step to local quickstart
|
|
44
|
+
|
|
8
45
|
## [2.1.2] - 2026-03-02
|
|
9
46
|
|
|
10
47
|
### Fixed
|
package/README.md
CHANGED
|
@@ -40,13 +40,20 @@ Create `eval/your-spec.spec.ts`:
|
|
|
40
40
|
```typescript
|
|
41
41
|
import { defineEval } from "@evalgate/sdk";
|
|
42
42
|
|
|
43
|
+
defineEval("Basic Math Operations", async () => {
|
|
44
|
+
const result = 1 + 1;
|
|
45
|
+
return { pass: result === 2, score: result === 2 ? 100 : 0 };
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
// Object form (with metadata):
|
|
43
49
|
defineEval({
|
|
44
|
-
name: "
|
|
45
|
-
description: "Test
|
|
46
|
-
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
47
|
-
expected: "All tests should pass",
|
|
50
|
+
name: "String concatenation",
|
|
51
|
+
description: "Test string operations",
|
|
48
52
|
tags: ["basic", "math"],
|
|
49
|
-
|
|
53
|
+
executor: async () => {
|
|
54
|
+
const result = "hello" + " world";
|
|
55
|
+
return { pass: result === "hello world", score: 100 };
|
|
56
|
+
},
|
|
50
57
|
});
|
|
51
58
|
```
|
|
52
59
|
|
|
@@ -254,6 +261,46 @@ All commands automatically write artifacts so `explain` works with zero flags.
|
|
|
254
261
|
npm install @evalgate/sdk openai
|
|
255
262
|
```
|
|
256
263
|
|
|
264
|
+
Create `eval/your-spec.spec.ts`:
|
|
265
|
+
|
|
266
|
+
```typescript
|
|
267
|
+
import { defineEval } from "@evalgate/sdk";
|
|
268
|
+
|
|
269
|
+
defineEval("Basic Math Operations", async () => {
|
|
270
|
+
const result = 1 + 1;
|
|
271
|
+
return { pass: result === 2, score: result === 2 ? 100 : 0 };
|
|
272
|
+
});
|
|
273
|
+
|
|
274
|
+
// Object form (with metadata):
|
|
275
|
+
defineEval({
|
|
276
|
+
name: "String concatenation",
|
|
277
|
+
description: "Test string operations",
|
|
278
|
+
tags: ["basic", "math"],
|
|
279
|
+
executor: async () => {
|
|
280
|
+
const result = "hello" + " world";
|
|
281
|
+
return { pass: result === "hello world", score: 100 };
|
|
282
|
+
},
|
|
283
|
+
});
|
|
284
|
+
|
|
285
|
+
// Suite form — group related specs:
|
|
286
|
+
defineSuite("Math suite", [
|
|
287
|
+
() => defineEval("addition", async () => ({ pass: 1 + 1 === 2, score: 100 })),
|
|
288
|
+
() => defineEval("subtraction", async () => ({ pass: 5 - 3 === 2, score: 100 })),
|
|
289
|
+
]);
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Discover specs and generate manifest
|
|
294
|
+
npx @evalgate/sdk discover
|
|
295
|
+
npx @evalgate/sdk discover --manifest
|
|
296
|
+
|
|
297
|
+
# Run evaluations
|
|
298
|
+
npx @evalgate/sdk run --write-results
|
|
299
|
+
|
|
300
|
+
# Run local regression gate
|
|
301
|
+
npx @evalgate/sdk gate
|
|
302
|
+
```
|
|
303
|
+
|
|
257
304
|
```typescript
|
|
258
305
|
import { openAIChatEval } from "@evalgate/sdk";
|
|
259
306
|
|
package/dist/assertions.d.ts
CHANGED
|
@@ -32,6 +32,11 @@ export declare class AssertionError extends Error {
|
|
|
32
32
|
export declare class Expectation {
|
|
33
33
|
private value;
|
|
34
34
|
constructor(value: unknown);
|
|
35
|
+
/**
|
|
36
|
+
* Negate the next assertion — inverts `passed` on any chained method.
|
|
37
|
+
* @example expect('drop table').not.toContain('drop table')
|
|
38
|
+
*/
|
|
39
|
+
get not(): Expectation;
|
|
35
40
|
/**
|
|
36
41
|
* Assert value equals expected
|
|
37
42
|
* @example expect(output).toEqual("Hello")
|
|
@@ -171,7 +176,23 @@ export declare function hasLength(text: string, range: {
|
|
|
171
176
|
max?: number;
|
|
172
177
|
}): boolean;
|
|
173
178
|
export declare function containsJSON(text: string): boolean;
|
|
179
|
+
/**
|
|
180
|
+
* Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
|
|
181
|
+
*
|
|
182
|
+
* @example
|
|
183
|
+
* if (!notContainsPII(response)) throw new Error("PII leak detected");
|
|
184
|
+
* // Or use the clearer alias:
|
|
185
|
+
* if (hasPII(response)) throw new Error("PII leak detected");
|
|
186
|
+
*/
|
|
174
187
|
export declare function notContainsPII(text: string): boolean;
|
|
188
|
+
/**
|
|
189
|
+
* Returns `true` when PII is detected in the text (unsafe), `false` when safe.
|
|
190
|
+
* This is the semantic inverse of `notContainsPII` and may be easier to reason about.
|
|
191
|
+
*
|
|
192
|
+
* @example
|
|
193
|
+
* if (hasPII(response)) throw new Error("PII leak");
|
|
194
|
+
*/
|
|
195
|
+
export declare function hasPII(text: string): boolean;
|
|
175
196
|
export declare function hasSentiment(text: string, expected: "positive" | "negative" | "neutral"): boolean;
|
|
176
197
|
export declare function similarTo(text1: string, text2: string, threshold?: number): boolean;
|
|
177
198
|
export declare function withinRange(value: number, min: number, max: number): boolean;
|
package/dist/assertions.js
CHANGED
|
@@ -24,6 +24,7 @@ exports.matchesPattern = matchesPattern;
|
|
|
24
24
|
exports.hasLength = hasLength;
|
|
25
25
|
exports.containsJSON = containsJSON;
|
|
26
26
|
exports.notContainsPII = notContainsPII;
|
|
27
|
+
exports.hasPII = hasPII;
|
|
27
28
|
exports.hasSentiment = hasSentiment;
|
|
28
29
|
exports.similarTo = similarTo;
|
|
29
30
|
exports.withinRange = withinRange;
|
|
@@ -56,6 +57,28 @@ class Expectation {
|
|
|
56
57
|
constructor(value) {
|
|
57
58
|
this.value = value;
|
|
58
59
|
}
|
|
60
|
+
/**
|
|
61
|
+
* Negate the next assertion — inverts `passed` on any chained method.
|
|
62
|
+
* @example expect('drop table').not.toContain('drop table')
|
|
63
|
+
*/
|
|
64
|
+
get not() {
|
|
65
|
+
const value = this.value;
|
|
66
|
+
return new Proxy(new Expectation(value), {
|
|
67
|
+
get(target, prop) {
|
|
68
|
+
const orig = target[prop];
|
|
69
|
+
if (typeof orig === "function" && prop !== "constructor") {
|
|
70
|
+
return (...args) => {
|
|
71
|
+
const result = orig.call(target, ...args);
|
|
72
|
+
if (result && typeof result === "object" && "passed" in result) {
|
|
73
|
+
return { ...result, passed: !result.passed };
|
|
74
|
+
}
|
|
75
|
+
return result;
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
return orig;
|
|
79
|
+
},
|
|
80
|
+
});
|
|
81
|
+
}
|
|
59
82
|
/**
|
|
60
83
|
* Assert value equals expected
|
|
61
84
|
* @example expect(output).toEqual("Hello")
|
|
@@ -539,17 +562,35 @@ function containsJSON(text) {
|
|
|
539
562
|
return false;
|
|
540
563
|
}
|
|
541
564
|
}
|
|
565
|
+
/**
|
|
566
|
+
* Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
|
|
567
|
+
*
|
|
568
|
+
* @example
|
|
569
|
+
* if (!notContainsPII(response)) throw new Error("PII leak detected");
|
|
570
|
+
* // Or use the clearer alias:
|
|
571
|
+
* if (hasPII(response)) throw new Error("PII leak detected");
|
|
572
|
+
*/
|
|
542
573
|
function notContainsPII(text) {
|
|
543
574
|
// Simple PII detection patterns
|
|
544
575
|
const piiPatterns = [
|
|
545
576
|
/\b\d{3}-\d{2}-\d{4}\b/, // SSN
|
|
546
577
|
/\b\d{3}\.\d{3}\.\d{4}\b/, // SSN with dots
|
|
547
|
-
/\b\d{
|
|
548
|
-
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-
|
|
578
|
+
/\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b/, // Phone (various formats)
|
|
579
|
+
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/, // Email
|
|
549
580
|
/\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/, // IP address
|
|
550
581
|
];
|
|
551
582
|
return !piiPatterns.some((pattern) => pattern.test(text));
|
|
552
583
|
}
|
|
584
|
+
/**
|
|
585
|
+
* Returns `true` when PII is detected in the text (unsafe), `false` when safe.
|
|
586
|
+
* This is the semantic inverse of `notContainsPII` and may be easier to reason about.
|
|
587
|
+
*
|
|
588
|
+
* @example
|
|
589
|
+
* if (hasPII(response)) throw new Error("PII leak");
|
|
590
|
+
*/
|
|
591
|
+
function hasPII(text) {
|
|
592
|
+
return !notContainsPII(text);
|
|
593
|
+
}
|
|
553
594
|
function hasSentiment(text, expected) {
|
|
554
595
|
// This is a simplified implementation
|
|
555
596
|
const positiveWords = ["good", "great", "excellent", "awesome"];
|
package/dist/cli/baseline.js
CHANGED
|
@@ -126,7 +126,6 @@ function runBaselineInit(cwd) {
|
|
|
126
126
|
}
|
|
127
127
|
// ── baseline update ──
|
|
128
128
|
function runBaselineUpdate(cwd) {
|
|
129
|
-
// Check if eval:baseline-update script exists in package.json
|
|
130
129
|
const pkgPath = path.join(cwd, "package.json");
|
|
131
130
|
if (!fs.existsSync(pkgPath)) {
|
|
132
131
|
console.error("❌ No package.json found. Run this from your project root.");
|
|
@@ -140,13 +139,39 @@ function runBaselineUpdate(cwd) {
|
|
|
140
139
|
console.error("❌ Failed to parse package.json");
|
|
141
140
|
return 1;
|
|
142
141
|
}
|
|
143
|
-
if
|
|
144
|
-
|
|
145
|
-
console.
|
|
142
|
+
// Use custom script if available
|
|
143
|
+
if (pkg.scripts?.["eval:baseline-update"]) {
|
|
144
|
+
console.log("📊 Running baseline update (custom script)...\n");
|
|
145
|
+
return runScript(cwd, "eval:baseline-update");
|
|
146
|
+
}
|
|
147
|
+
// Self-contained built-in mode: run the test suite then stamp the baseline
|
|
148
|
+
console.log("📊 Running baseline update (built-in mode)...\n");
|
|
149
|
+
const pm = detectPackageManager(cwd);
|
|
150
|
+
const isWin = process.platform === "win32";
|
|
151
|
+
const testResult = (0, node_child_process_1.spawnSync)(pm, ["test"], {
|
|
152
|
+
cwd,
|
|
153
|
+
stdio: "inherit",
|
|
154
|
+
shell: isWin,
|
|
155
|
+
});
|
|
156
|
+
const baselinePath = path.join(cwd, BASELINE_REL);
|
|
157
|
+
if (!fs.existsSync(baselinePath)) {
|
|
158
|
+
console.error("❌ No baseline found. Run 'evalgate baseline init' first.");
|
|
159
|
+
return 1;
|
|
160
|
+
}
|
|
161
|
+
try {
|
|
162
|
+
const baseline = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
|
|
163
|
+
baseline.updatedAt = new Date().toISOString();
|
|
164
|
+
baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
|
|
165
|
+
baseline.confidenceTests = baseline.confidenceTests ?? {};
|
|
166
|
+
baseline.confidenceTests.unitPassed = testResult.status === 0;
|
|
167
|
+
fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
|
|
168
|
+
console.log("\n✅ Baseline updated successfully");
|
|
169
|
+
}
|
|
170
|
+
catch {
|
|
171
|
+
console.error("❌ Failed to update baseline file");
|
|
146
172
|
return 1;
|
|
147
173
|
}
|
|
148
|
-
|
|
149
|
-
return runScript(cwd, "eval:baseline-update");
|
|
174
|
+
return testResult.status ?? 1;
|
|
150
175
|
}
|
|
151
176
|
// ── baseline router ──
|
|
152
177
|
function runBaseline(argv) {
|
package/dist/cli/discover.js
CHANGED
|
@@ -59,6 +59,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
59
59
|
exports.discoverSpecs = discoverSpecs;
|
|
60
60
|
exports.printDiscoveryResults = printDiscoveryResults;
|
|
61
61
|
exports.runDiscover = runDiscover;
|
|
62
|
+
const crypto = __importStar(require("node:crypto"));
|
|
62
63
|
const fs = __importStar(require("node:fs/promises"));
|
|
63
64
|
const path = __importStar(require("node:path"));
|
|
64
65
|
const execution_mode_1 = require("../runtime/execution-mode");
|
|
@@ -145,8 +146,8 @@ async function analyzeSpecifications(specFiles) {
|
|
|
145
146
|
for (const filePath of specFiles) {
|
|
146
147
|
try {
|
|
147
148
|
const content = await fs.readFile(filePath, "utf-8");
|
|
148
|
-
const
|
|
149
|
-
specs.push(
|
|
149
|
+
const fileSpecs = analyzeSpecFile(filePath, content);
|
|
150
|
+
specs.push(...fileSpecs);
|
|
150
151
|
}
|
|
151
152
|
catch (error) {
|
|
152
153
|
console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
@@ -155,20 +156,40 @@ async function analyzeSpecifications(specFiles) {
|
|
|
155
156
|
return specs;
|
|
156
157
|
}
|
|
157
158
|
/**
|
|
158
|
-
*
|
|
159
|
+
* Extract all spec names from file content (handles both call forms)
|
|
160
|
+
*/
|
|
161
|
+
function extractSpecNames(content) {
|
|
162
|
+
const names = [];
|
|
163
|
+
// Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
|
|
164
|
+
const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
|
|
165
|
+
let m = stringArgPattern.exec(content);
|
|
166
|
+
while (m !== null) {
|
|
167
|
+
names.push(m[1]);
|
|
168
|
+
m = stringArgPattern.exec(content);
|
|
169
|
+
}
|
|
170
|
+
if (names.length > 0)
|
|
171
|
+
return names;
|
|
172
|
+
// Form 2: defineEval({ name: "..." }) — object-first form
|
|
173
|
+
const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
|
|
174
|
+
m = objNamePattern.exec(content);
|
|
175
|
+
while (m !== null) {
|
|
176
|
+
names.push(m[1]);
|
|
177
|
+
m = objNamePattern.exec(content);
|
|
178
|
+
}
|
|
179
|
+
return names;
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Analyze a single specification file — returns one SpecAnalysis per defineEval call
|
|
159
183
|
*/
|
|
160
184
|
function analyzeSpecFile(filePath, content) {
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
// Extract tags
|
|
185
|
+
const specNames = extractSpecNames(content);
|
|
186
|
+
// Fallback: file matched as a spec file but we couldn't parse names
|
|
187
|
+
if (specNames.length === 0) {
|
|
188
|
+
specNames.push(path.basename(filePath, path.extname(filePath)));
|
|
189
|
+
}
|
|
190
|
+
// Shared analysis for the file
|
|
168
191
|
const tags = extractTags(content);
|
|
169
|
-
// Analyze complexity
|
|
170
192
|
const complexity = analyzeComplexity(content);
|
|
171
|
-
// Check for models and tools
|
|
172
193
|
const usesModels = content.includes("model:") ||
|
|
173
194
|
content.includes("model=") ||
|
|
174
195
|
content.includes("openai") ||
|
|
@@ -176,22 +197,20 @@ function analyzeSpecFile(filePath, content) {
|
|
|
176
197
|
const usesTools = content.includes("tool:") ||
|
|
177
198
|
content.includes("function.") ||
|
|
178
199
|
content.includes("call(");
|
|
179
|
-
// Check for assertions
|
|
180
200
|
const hasAssertions = content.includes("assert") ||
|
|
181
201
|
content.includes("expect") ||
|
|
182
202
|
content.includes("should");
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
file: path.relative(process.cwd(), filePath),
|
|
203
|
+
const relFile = path.relative(process.cwd(), filePath);
|
|
204
|
+
return specNames.map((name, idx) => ({
|
|
205
|
+
id: generateSpecId(filePath, name, idx),
|
|
206
|
+
name,
|
|
207
|
+
file: relFile,
|
|
189
208
|
tags,
|
|
190
209
|
hasAssertions,
|
|
191
210
|
usesModels,
|
|
192
211
|
usesTools,
|
|
193
212
|
complexity,
|
|
194
|
-
};
|
|
213
|
+
}));
|
|
195
214
|
}
|
|
196
215
|
/**
|
|
197
216
|
* Extract tags from specification content
|
|
@@ -263,15 +282,14 @@ function analyzeComplexity(content) {
|
|
|
263
282
|
return "complex";
|
|
264
283
|
}
|
|
265
284
|
/**
|
|
266
|
-
* Generate specification ID from file path
|
|
285
|
+
* Generate specification ID from file path + name + index (unique per defineEval call)
|
|
267
286
|
*/
|
|
268
|
-
function generateSpecId(filePath) {
|
|
269
|
-
const relativePath = path
|
|
270
|
-
|
|
271
|
-
.
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
return hash;
|
|
287
|
+
function generateSpecId(filePath, name, index) {
|
|
288
|
+
const relativePath = path
|
|
289
|
+
.relative(process.cwd(), filePath)
|
|
290
|
+
.replace(/\\/g, "/");
|
|
291
|
+
const key = `${relativePath}|${name}|${index}`;
|
|
292
|
+
return crypto.createHash("sha256").update(key).digest("hex").slice(0, 16);
|
|
275
293
|
}
|
|
276
294
|
/**
|
|
277
295
|
* Calculate discovery statistics
|
package/dist/cli/doctor.js
CHANGED
|
@@ -96,7 +96,7 @@ function parseFlags(argv) {
|
|
|
96
96
|
const baseUrl = raw.baseUrl ||
|
|
97
97
|
process.env.EVALGATE_BASE_URL ||
|
|
98
98
|
process.env.EVALAI_BASE_URL ||
|
|
99
|
-
"
|
|
99
|
+
"https://api.evalgate.com";
|
|
100
100
|
const apiKey = raw.apiKey ||
|
|
101
101
|
process.env.EVALGATE_API_KEY ||
|
|
102
102
|
process.env.EVALAI_API_KEY ||
|
package/dist/cli/explain.js
CHANGED
|
@@ -84,6 +84,8 @@ const REPORT_SEARCH_PATHS = [
|
|
|
84
84
|
"evals/regression-report.json",
|
|
85
85
|
".evalgate/last-report.json",
|
|
86
86
|
".evalgate/last_report.json",
|
|
87
|
+
".evalgate/last-run.json",
|
|
88
|
+
".evalgate/runs/latest.json",
|
|
87
89
|
];
|
|
88
90
|
function findReport(cwd, explicitPath) {
|
|
89
91
|
if (explicitPath) {
|
|
@@ -354,13 +356,78 @@ function suggestFixes(causes) {
|
|
|
354
356
|
}
|
|
355
357
|
// ── Build explain output ──
|
|
356
358
|
function buildExplainOutput(report, reportPath) {
|
|
357
|
-
// Support
|
|
359
|
+
// Support RunResult (from evalgate run) — has schemaVersion + results[] + summary
|
|
360
|
+
const isRunResult = "results" in report &&
|
|
361
|
+
Array.isArray(report.results) &&
|
|
362
|
+
"summary" in report &&
|
|
363
|
+
report.summary !== null &&
|
|
364
|
+
typeof report.summary === "object";
|
|
365
|
+
if (isRunResult) {
|
|
366
|
+
return buildFromRunResult(report, reportPath);
|
|
367
|
+
}
|
|
368
|
+
// Support BuiltinReport (from evalgate gate)
|
|
358
369
|
const isBuiltinReport = "category" in report && "deltas" in report;
|
|
359
370
|
if (isBuiltinReport) {
|
|
360
371
|
return buildFromBuiltinReport(report, reportPath);
|
|
361
372
|
}
|
|
362
373
|
return buildFromCheckReport(report, reportPath);
|
|
363
374
|
}
|
|
375
|
+
function buildFromRunResult(report, reportPath) {
|
|
376
|
+
const summary = report.summary;
|
|
377
|
+
const results = report.results ?? [];
|
|
378
|
+
const passed = summary.failed === 0;
|
|
379
|
+
// Top failures
|
|
380
|
+
const failures = results.filter((r) => r.result.status === "failed");
|
|
381
|
+
const topFailures = failures.slice(0, 3).map((r, i) => ({
|
|
382
|
+
rank: i + 1,
|
|
383
|
+
name: r.name,
|
|
384
|
+
filePath: r.filePath,
|
|
385
|
+
reason: r.result.error,
|
|
386
|
+
}));
|
|
387
|
+
// Changes: pass rate
|
|
388
|
+
const changes = [
|
|
389
|
+
{
|
|
390
|
+
metric: "Pass rate",
|
|
391
|
+
baseline: "—",
|
|
392
|
+
current: `${Math.round(summary.passRate * 100)}%`,
|
|
393
|
+
direction: passed ? "same" : "worse",
|
|
394
|
+
},
|
|
395
|
+
];
|
|
396
|
+
// For passing runs, emit nothing so no misleading "Run doctor" suggestions appear
|
|
397
|
+
if (passed) {
|
|
398
|
+
return {
|
|
399
|
+
verdict: "pass",
|
|
400
|
+
reasonMessage: `All ${summary.passed} spec${summary.passed === 1 ? "" : "s"} passed`,
|
|
401
|
+
topFailures: [],
|
|
402
|
+
totalFailures: 0,
|
|
403
|
+
changes,
|
|
404
|
+
rootCauses: [],
|
|
405
|
+
suggestedFixes: [],
|
|
406
|
+
reportPath,
|
|
407
|
+
};
|
|
408
|
+
}
|
|
409
|
+
// Classify root cause by inspecting error messages
|
|
410
|
+
const errorText = failures
|
|
411
|
+
.map((r) => (r.result.error ?? "").toLowerCase())
|
|
412
|
+
.join(" ");
|
|
413
|
+
const rootCauses = [];
|
|
414
|
+
if (errorText.includes("pii") || errorText.includes("safety"))
|
|
415
|
+
rootCauses.push("safety_regression");
|
|
416
|
+
if (errorText.includes("tool") || errorText.includes("function_call"))
|
|
417
|
+
rootCauses.push("tool_use_drift");
|
|
418
|
+
if (rootCauses.length === 0)
|
|
419
|
+
rootCauses.push("prompt_drift");
|
|
420
|
+
return {
|
|
421
|
+
verdict: "fail",
|
|
422
|
+
reasonMessage: `${summary.failed} of ${results.length} spec${results.length === 1 ? "" : "s"} failed`,
|
|
423
|
+
topFailures,
|
|
424
|
+
totalFailures: failures.length,
|
|
425
|
+
changes,
|
|
426
|
+
rootCauses,
|
|
427
|
+
suggestedFixes: suggestFixes(rootCauses),
|
|
428
|
+
reportPath,
|
|
429
|
+
};
|
|
430
|
+
}
|
|
364
431
|
function buildFromCheckReport(report, reportPath) {
|
|
365
432
|
const failedCases = report.failedCases ?? [];
|
|
366
433
|
// Top failures (up to 3)
|
|
@@ -430,6 +497,7 @@ function buildFromBuiltinReport(report, reportPath) {
|
|
|
430
497
|
}));
|
|
431
498
|
const topFailures = failures.slice(0, 3).map((f, i) => ({
|
|
432
499
|
rank: i + 1,
|
|
500
|
+
name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
|
|
433
501
|
reason: f,
|
|
434
502
|
}));
|
|
435
503
|
// Simple root cause for builtin reports
|
|
package/dist/cli/impact-analysis.js
CHANGED
|
@@ -109,7 +109,18 @@ async function getChangedFiles(baseBranch) {
|
|
|
109
109
|
});
|
|
110
110
|
git.on("close", (code) => {
|
|
111
111
|
if (code !== 0) {
|
|
112
|
-
|
|
112
|
+
const lowerError = error.toLowerCase();
|
|
113
|
+
if (lowerError.includes("not a git repository") ||
|
|
114
|
+
lowerError.includes("fatal: not a git")) {
|
|
115
|
+
reject(new Error("Not a git repository. Run 'git init' or run evalgate from inside a git repo."));
|
|
116
|
+
}
|
|
117
|
+
else if (lowerError.includes("unknown revision") ||
|
|
118
|
+
lowerError.includes("bad revision")) {
|
|
119
|
+
reject(new Error(`Base branch '${baseBranch}' not found. Fetch it first: git fetch origin ${baseBranch}`));
|
|
120
|
+
}
|
|
121
|
+
else {
|
|
122
|
+
reject(new Error(`Git diff failed (exit ${code}). Ensure git is installed and '${baseBranch}' exists.`));
|
|
123
|
+
}
|
|
113
124
|
return;
|
|
114
125
|
}
|
|
115
126
|
const files = output
|
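package/dist/cli/print-config.js
CHANGED
|
(hunk missing from this capture; the file list above records +1 -1 for this file)
|
package/dist/cli/regression-gate.js
CHANGED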
|
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
|
|
|
94
94
|
}
|
|
95
95
|
return "unknown";
|
|
96
96
|
}
|
|
97
|
+
function hasTestScript(cwd) {
|
|
98
|
+
try {
|
|
99
|
+
const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
|
|
100
|
+
const script = pkg.scripts?.test ?? "";
|
|
101
|
+
return !!script && script !== 'echo "Error: no test specified" && exit 1';
|
|
102
|
+
}
|
|
103
|
+
catch {
|
|
104
|
+
return false;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
97
107
|
function runBuiltinGate(cwd) {
|
|
98
108
|
const t0 = Date.now();
|
|
99
109
|
const baselinePath = path.join(cwd, BASELINE_REL);
|
|
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
|
|
|
101
111
|
const pm = detectPackageManager(cwd);
|
|
102
112
|
const command = `${pm} test`;
|
|
103
113
|
const runner = detectRunner(cwd);
|
|
114
|
+
const projectHasTestScript = hasTestScript(cwd);
|
|
104
115
|
// Load baseline
|
|
105
116
|
if (!fs.existsSync(baselinePath)) {
|
|
106
117
|
return {
|
|
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
|
|
|
165
176
|
const baselineTotal = baselineData.confidenceTests?.total ?? 0;
|
|
166
177
|
const failures = [];
|
|
167
178
|
const deltas = [];
|
|
168
|
-
// Delta: tests passing
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
179
|
+
// Delta: tests passing — only meaningful when a test script exists
|
|
180
|
+
if (projectHasTestScript) {
|
|
181
|
+
deltas.push({
|
|
182
|
+
metric: "tests_passing",
|
|
183
|
+
baseline: baselinePassed,
|
|
184
|
+
current: testsPassed,
|
|
185
|
+
delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
|
|
186
|
+
status: testsPassed ? "pass" : "fail",
|
|
187
|
+
});
|
|
188
|
+
if (!testsPassed && baselinePassed) {
|
|
189
|
+
failures.push("Tests were passing in baseline but are now failing");
|
|
190
|
+
}
|
|
178
191
|
}
|
|
179
192
|
// Delta: test count (only if we captured counts)
|
|
180
193
|
if (testCount > 0 || baselineTotal > 0) {
|
package/dist/cli/run.js
CHANGED
|
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
|
|
|
52
52
|
const node_child_process_1 = require("node:child_process");
|
|
53
53
|
const fs = __importStar(require("node:fs/promises"));
|
|
54
54
|
const path = __importStar(require("node:path"));
|
|
55
|
+
const registry_1 = require("../runtime/registry");
|
|
55
56
|
const impact_analysis_1 = require("./impact-analysis");
|
|
56
57
|
/**
|
|
57
58
|
* Generate deterministic run ID
|
|
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
|
|
|
138
139
|
}
|
|
139
140
|
}
|
|
140
141
|
/**
|
|
141
|
-
* Execute specifications
|
|
142
|
+
* Execute specifications — grouped by file to avoid redundant loads
|
|
142
143
|
*/
|
|
143
144
|
async function executeSpecs(specs) {
|
|
144
|
-
|
|
145
|
+
// Group specs by their absolute file path
|
|
146
|
+
const specsByFile = new Map();
|
|
145
147
|
for (const spec of specs) {
|
|
146
|
-
const
|
|
147
|
-
|
|
148
|
+
const abs = path.isAbsolute(spec.filePath)
|
|
149
|
+
? spec.filePath
|
|
150
|
+
: path.join(process.cwd(), spec.filePath);
|
|
151
|
+
const group = specsByFile.get(abs) ?? [];
|
|
152
|
+
group.push(spec);
|
|
153
|
+
specsByFile.set(abs, group);
|
|
148
154
|
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
try {
|
|
157
|
-
// For now, simulate execution
|
|
158
|
-
// In a real implementation, this would:
|
|
159
|
-
// 1. Load the spec file
|
|
160
|
-
// 2. Execute the defineEval function
|
|
161
|
-
// 3. Capture the result
|
|
162
|
-
// Simulate some work
|
|
163
|
-
await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
|
|
164
|
-
// Simulate success/failure (90% success rate for demo)
|
|
165
|
-
const success = Math.random() > 0.1;
|
|
166
|
-
const duration = Date.now() - startTime;
|
|
167
|
-
if (success) {
|
|
168
|
-
return {
|
|
169
|
-
specId: spec.id,
|
|
170
|
-
name: spec.name,
|
|
171
|
-
filePath: spec.filePath,
|
|
172
|
-
result: {
|
|
173
|
-
status: "passed",
|
|
174
|
-
score: Math.random() * 0.3 + 0.7, // 0.7-1.0
|
|
175
|
-
duration,
|
|
176
|
-
},
|
|
177
|
-
};
|
|
155
|
+
const results = [];
|
|
156
|
+
for (const [absPath, fileSpecs] of specsByFile) {
|
|
157
|
+
// Fresh runtime per file to avoid cross-file contamination
|
|
158
|
+
(0, registry_1.disposeActiveRuntime)();
|
|
159
|
+
try {
|
|
160
|
+
// Bust require cache so the file re-executes its defineEval calls
|
|
161
|
+
delete require.cache[require.resolve(absPath)];
|
|
178
162
|
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
163
|
+
catch {
|
|
164
|
+
// Not in cache yet — fine
|
|
165
|
+
}
|
|
166
|
+
try {
|
|
167
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
168
|
+
require(absPath);
|
|
169
|
+
}
|
|
170
|
+
catch (loadError) {
|
|
171
|
+
const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
|
|
172
|
+
const msg = isTs &&
|
|
173
|
+
loadError instanceof Error &&
|
|
174
|
+
(loadError.message.includes("Unknown file extension") ||
|
|
175
|
+
loadError.message.includes("SyntaxError"))
|
|
176
|
+
? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
|
|
177
|
+
: loadError instanceof Error
|
|
178
|
+
? loadError.message
|
|
179
|
+
: String(loadError);
|
|
180
|
+
for (const spec of fileSpecs) {
|
|
181
|
+
results.push(makeErrorResult(spec, msg, 0));
|
|
182
|
+
}
|
|
183
|
+
continue;
|
|
184
|
+
}
|
|
185
|
+
const runtime = (0, registry_1.getActiveRuntime)();
|
|
186
|
+
const registered = runtime.list();
|
|
187
|
+
for (const spec of fileSpecs) {
|
|
188
|
+
const registeredSpec = registered.find((r) => r.name === spec.name);
|
|
189
|
+
if (!registeredSpec) {
|
|
190
|
+
results.push({
|
|
191
|
+
specId: spec.id,
|
|
192
|
+
name: spec.name,
|
|
193
|
+
filePath: spec.filePath,
|
|
194
|
+
result: {
|
|
195
|
+
status: "skipped",
|
|
196
|
+
error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
|
|
197
|
+
duration: 0,
|
|
198
|
+
},
|
|
199
|
+
});
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
const startTime = Date.now();
|
|
203
|
+
try {
|
|
204
|
+
const evalResult = await registeredSpec.executor({ input: "" });
|
|
205
|
+
results.push({
|
|
206
|
+
specId: spec.id,
|
|
207
|
+
name: spec.name,
|
|
208
|
+
filePath: spec.filePath,
|
|
209
|
+
result: {
|
|
210
|
+
status: evalResult.pass ? "passed" : "failed",
|
|
211
|
+
score: typeof evalResult.score === "number"
|
|
212
|
+
? evalResult.score / 100
|
|
213
|
+
: undefined,
|
|
214
|
+
error: evalResult.error,
|
|
215
|
+
duration: Date.now() - startTime,
|
|
216
|
+
},
|
|
217
|
+
});
|
|
218
|
+
}
|
|
219
|
+
catch (execError) {
|
|
220
|
+
results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
|
|
221
|
+
}
|
|
190
222
|
}
|
|
191
223
|
}
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
},
|
|
202
|
-
};
|
|
203
|
-
}
|
|
224
|
+
return results;
|
|
225
|
+
}
|
|
226
|
+
function makeErrorResult(spec, error, duration) {
|
|
227
|
+
return {
|
|
228
|
+
specId: spec.id,
|
|
229
|
+
name: spec.name,
|
|
230
|
+
filePath: spec.filePath,
|
|
231
|
+
result: { status: "failed", error, duration },
|
|
232
|
+
};
|
|
204
233
|
}
|
|
205
234
|
/**
|
|
206
235
|
* Calculate summary statistics
|
|
@@ -348,7 +377,8 @@ function printHumanResults(result) {
|
|
|
348
377
|
console.log(` ❌ Failed: ${result.summary.failed}`);
|
|
349
378
|
console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
|
|
350
379
|
console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
|
|
351
|
-
|
|
380
|
+
const hasScores = result.results.some((r) => r.result.score !== undefined);
|
|
381
|
+
console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
|
|
352
382
|
for (const spec of result.results) {
|
|
353
383
|
const status = spec.result.status === "passed"
|
|
354
384
|
? "✅"
|
package/dist/index.d.ts
CHANGED
|
@@ -10,7 +10,7 @@ export { AIEvalClient } from "./client";
|
|
|
10
10
|
import { AuthenticationError, EvalGateError, NetworkError, RateLimitError, SDKError } from "./errors";
|
|
11
11
|
export { EvalGateError, RateLimitError, AuthenticationError, SDKError as ValidationError, // Using SDKError as ValidationError for backward compatibility
|
|
12
12
|
NetworkError, };
|
|
13
|
-
export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
|
|
13
|
+
export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasPII, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
|
|
14
14
|
import { createContext, EvalContext, getCurrentContext, withContext } from "./context";
|
|
15
15
|
export { createContext, getCurrentContext as getContext, withContext, EvalContext as ContextManager, };
|
|
16
16
|
export { cloneContext, mergeContexts, validateContext, } from "./runtime/context";
|
package/dist/index.js
CHANGED
|
@@ -8,8 +8,8 @@
|
|
|
8
8
|
* @packageDocumentation
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
-
exports.
|
|
12
|
-
exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = void 0;
|
|
11
|
+
exports.SpecRegistrationError = exports.SpecExecutionError = exports.RuntimeError = exports.EvalRuntimeError = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.defaultLocalExecutor = exports.createLocalExecutor = exports.evalai = exports.defineSuite = exports.defineEval = exports.createResult = exports.createEvalContext = exports.validateContext = exports.mergeContexts = exports.cloneContext = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasPII = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalGateError = exports.AIEvalClient = void 0;
|
|
12
|
+
exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = void 0;
|
|
13
13
|
// Main SDK exports
|
|
14
14
|
var client_1 = require("./client");
|
|
15
15
|
Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
|
|
@@ -32,6 +32,7 @@ Object.defineProperty(exports, "hasFactualAccuracy", { enumerable: true, get: fu
|
|
|
32
32
|
Object.defineProperty(exports, "hasLength", { enumerable: true, get: function () { return assertions_1.hasLength; } });
|
|
33
33
|
Object.defineProperty(exports, "hasNoHallucinations", { enumerable: true, get: function () { return assertions_1.hasNoHallucinations; } });
|
|
34
34
|
Object.defineProperty(exports, "hasNoToxicity", { enumerable: true, get: function () { return assertions_1.hasNoToxicity; } });
|
|
35
|
+
Object.defineProperty(exports, "hasPII", { enumerable: true, get: function () { return assertions_1.hasPII; } });
|
|
35
36
|
Object.defineProperty(exports, "hasReadabilityScore", { enumerable: true, get: function () { return assertions_1.hasReadabilityScore; } });
|
|
36
37
|
Object.defineProperty(exports, "hasSentiment", { enumerable: true, get: function () { return assertions_1.hasSentiment; } });
|
|
37
38
|
Object.defineProperty(exports, "hasValidCodeSyntax", { enumerable: true, get: function () { return assertions_1.hasValidCodeSyntax; } });
|
package/dist/runtime/eval.d.ts
CHANGED
|
@@ -18,10 +18,19 @@ export declare const evalai: {
|
|
|
18
18
|
test: DefineEvalFunction;
|
|
19
19
|
};
|
|
20
20
|
/**
|
|
21
|
-
* Suite definition for grouping related specifications
|
|
22
|
-
*
|
|
21
|
+
* Suite definition for grouping related specifications.
|
|
22
|
+
* Accepts both a positional form and an object form:
|
|
23
|
+
*
|
|
24
|
+
* @example Positional form:
|
|
25
|
+
* defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
|
|
26
|
+
*
|
|
27
|
+
* @example Object form:
|
|
28
|
+
* defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
|
|
23
29
|
*/
|
|
24
|
-
export declare function defineSuite(
|
|
30
|
+
export declare function defineSuite(nameOrConfig: string | {
|
|
31
|
+
name: string;
|
|
32
|
+
specs: (() => void)[];
|
|
33
|
+
}, specsArg?: (() => void)[]): void;
|
|
25
34
|
/**
|
|
26
35
|
* Helper function to create specification contexts
|
|
27
36
|
* Useful for testing and manual execution
|
package/dist/runtime/eval.js
CHANGED
|
@@ -204,13 +204,22 @@ exports.evalai = {
|
|
|
204
204
|
test: exports.defineEval,
|
|
205
205
|
};
|
|
206
206
|
/**
|
|
207
|
-
* Suite definition for grouping related specifications
|
|
208
|
-
*
|
|
207
|
+
* Suite definition for grouping related specifications.
|
|
208
|
+
* Accepts both a positional form and an object form:
|
|
209
|
+
*
|
|
210
|
+
* @example Positional form:
|
|
211
|
+
* defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
|
|
212
|
+
*
|
|
213
|
+
* @example Object form:
|
|
214
|
+
* defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
|
|
209
215
|
*/
|
|
210
|
-
function defineSuite(
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
216
|
+
function defineSuite(nameOrConfig, specsArg) {
|
|
217
|
+
const specFns = typeof nameOrConfig === "string"
|
|
218
|
+
? (specsArg ?? [])
|
|
219
|
+
: (nameOrConfig.specs ?? []);
|
|
220
|
+
// Execute each spec function to register its defineEval calls
|
|
221
|
+
// In Layer 3, this will also build the dependency graph
|
|
222
|
+
for (const specFn of specFns) {
|
|
214
223
|
specFn();
|
|
215
224
|
}
|
|
216
225
|
}
|
package/dist/snapshot.d.ts
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
|
|
10
10
|
*
|
|
11
11
|
* const output = await generateText('Write a haiku about coding');
|
|
12
|
-
* await snapshot(
|
|
12
|
+
* await snapshot('haiku-test', output);
|
|
13
13
|
*
|
|
14
14
|
* // Later, compare with snapshot
|
|
15
15
|
* const saved = await loadSnapshot('haiku-test');
|
|
@@ -135,10 +135,10 @@ export declare class SnapshotManager {
|
|
|
135
135
|
* @example
|
|
136
136
|
* ```typescript
|
|
137
137
|
* const output = await generateText('Write a haiku');
|
|
138
|
-
* await snapshot(
|
|
138
|
+
* await snapshot('haiku-test', output);
|
|
139
139
|
* ```
|
|
140
140
|
*/
|
|
141
|
-
export declare function snapshot(
|
|
141
|
+
export declare function snapshot(name: string, output: string, options?: {
|
|
142
142
|
tags?: string[];
|
|
143
143
|
metadata?: Record<string, unknown>;
|
|
144
144
|
overwrite?: boolean;
|
package/dist/snapshot.js
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
|
|
11
11
|
*
|
|
12
12
|
* const output = await generateText('Write a haiku about coding');
|
|
13
|
-
* await snapshot(
|
|
13
|
+
* await snapshot('haiku-test', output);
|
|
14
14
|
*
|
|
15
15
|
* // Later, compare with snapshot
|
|
16
16
|
* const saved = await loadSnapshot('haiku-test');
|
|
@@ -271,10 +271,10 @@ function getSnapshotManager(dir) {
|
|
|
271
271
|
* @example
|
|
272
272
|
* ```typescript
|
|
273
273
|
* const output = await generateText('Write a haiku');
|
|
274
|
-
* await snapshot(
|
|
274
|
+
* await snapshot('haiku-test', output);
|
|
275
275
|
* ```
|
|
276
276
|
*/
|
|
277
|
-
async function snapshot(
|
|
277
|
+
async function snapshot(name, output, options) {
|
|
278
278
|
const manager = getSnapshotManager(options?.dir);
|
|
279
279
|
return manager.save(name, output, options);
|
|
280
280
|
}
|
package/dist/version.d.ts
CHANGED
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
* X-EvalGate-SDK-Version: SDK package version
|
|
4
4
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
5
5
|
*/
|
|
6
|
-
export declare const SDK_VERSION = "2.
|
|
7
|
-
export declare const SPEC_VERSION = "2.
|
|
6
|
+
export declare const SDK_VERSION = "2.2.0";
|
|
7
|
+
export declare const SPEC_VERSION = "2.2.0";
|
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalGate-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "2.
|
|
10
|
-
exports.SPEC_VERSION = "2.
|
|
9
|
+
exports.SDK_VERSION = "2.2.0";
|
|
10
|
+
exports.SPEC_VERSION = "2.2.0";
|