@evalgate/sdk 2.2.2 → 2.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +40 -1
- package/dist/assertions.d.ts +194 -10
- package/dist/assertions.js +525 -73
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +5 -1
- package/dist/cache.js +5 -1
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/upgrade.js +5 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/errors.js +7 -0
- package/dist/export.js +2 -2
- package/dist/index.d.ts +10 -9
- package/dist/index.js +24 -7
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +84 -61
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +8 -3
- package/dist/runtime/registry.js +15 -4
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/snapshot.d.ts +12 -0
- package/dist/snapshot.js +24 -1
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +8 -1
package/dist/runtime/registry.js
CHANGED
|
@@ -39,6 +39,7 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
39
39
|
};
|
|
40
40
|
})();
|
|
41
41
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports._registerDefineEval = _registerDefineEval;
|
|
42
43
|
exports.createEvalRuntime = createEvalRuntime;
|
|
43
44
|
exports.withRuntime = withRuntime;
|
|
44
45
|
exports.getActiveRuntime = getActiveRuntime;
|
|
@@ -47,6 +48,12 @@ exports.disposeActiveRuntime = disposeActiveRuntime;
|
|
|
47
48
|
const crypto = __importStar(require("node:crypto"));
|
|
48
49
|
const path = __importStar(require("node:path"));
|
|
49
50
|
const types_1 = require("./types");
|
|
51
|
+
// Registration pattern to break circular dependency (eval.ts imports from registry.ts)
|
|
52
|
+
let _registeredDefineEval = null;
|
|
53
|
+
/** @internal Called by eval.ts to register defineEval without circular import */
|
|
54
|
+
function _registerDefineEval(fn) {
|
|
55
|
+
_registeredDefineEval = fn;
|
|
56
|
+
}
|
|
50
57
|
/**
|
|
51
58
|
* Runtime registry implementation
|
|
52
59
|
* Scoped lifecycle with proper memory management
|
|
@@ -315,7 +322,10 @@ class EvalRuntimeImpl {
|
|
|
315
322
|
* Create a new scoped runtime with lifecycle management
|
|
316
323
|
* Returns a handle for proper resource management
|
|
317
324
|
*/
|
|
318
|
-
function createEvalRuntime(projectRoot = process.cwd()) {
|
|
325
|
+
function createEvalRuntime(projectRootOrConfig = process.cwd()) {
|
|
326
|
+
const projectRoot = typeof projectRootOrConfig === "string"
|
|
327
|
+
? projectRootOrConfig
|
|
328
|
+
: (projectRootOrConfig.projectRoot ?? process.cwd());
|
|
319
329
|
const runtime = new EvalRuntimeImpl(projectRoot);
|
|
320
330
|
// Create bound defineEval function
|
|
321
331
|
const boundDefineEval = ((nameOrConfig, executor, options) => {
|
|
@@ -323,9 +333,10 @@ function createEvalRuntime(projectRoot = process.cwd()) {
|
|
|
323
333
|
const previousRuntime = activeRuntime;
|
|
324
334
|
activeRuntime = runtime;
|
|
325
335
|
try {
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
336
|
+
if (!_registeredDefineEval) {
|
|
337
|
+
throw new types_1.RuntimeError("defineEval not registered. Ensure eval.ts is imported before calling createEvalRuntime.");
|
|
338
|
+
}
|
|
339
|
+
return _registeredDefineEval(nameOrConfig, executor, options);
|
|
329
340
|
}
|
|
330
341
|
finally {
|
|
331
342
|
// Restore previous runtime
|
|
package/dist/runtime/run-report.d.ts
CHANGED
|
@@ -159,7 +159,7 @@ export declare class RunReportBuilder {
|
|
|
159
159
|
addResult(testId: string, testName: string, filePath: string, position: {
|
|
160
160
|
line: number;
|
|
161
161
|
column: number;
|
|
162
|
-
}, input: string, result: EnhancedEvalResult): void;
|
|
162
|
+
}, input: string, result: EnhancedEvalResult, tags?: string[]): void;
|
|
163
163
|
/**
|
|
164
164
|
* Update summary statistics
|
|
165
165
|
*/
|
|
package/dist/runtime/run-report.js
CHANGED
|
@@ -77,7 +77,7 @@ class RunReportBuilder {
|
|
|
77
77
|
/**
|
|
78
78
|
* Add a test result to the report
|
|
79
79
|
*/
|
|
80
|
-
addResult(testId, testName, filePath, position, input, result) {
|
|
80
|
+
addResult(testId, testName, filePath, position, input, result, tags) {
|
|
81
81
|
const runResult = {
|
|
82
82
|
testId,
|
|
83
83
|
testName,
|
|
@@ -88,7 +88,7 @@ class RunReportBuilder {
|
|
|
88
88
|
score: result.score,
|
|
89
89
|
durationMs: result.durationMs || 0,
|
|
90
90
|
metadata: result.metadata,
|
|
91
|
-
tags: [],
|
|
91
|
+
tags: tags ?? [],
|
|
92
92
|
assertions: result.assertions?.map((assertion, index) => ({
|
|
93
93
|
name: assertion.name || `assertion-${index}`,
|
|
94
94
|
passed: assertion.passed,
|
|
@@ -182,8 +182,11 @@ class RunReportBuilder {
|
|
|
182
182
|
// Set completion timestamp
|
|
183
183
|
this.report.finishedAt = new Date().toISOString();
|
|
184
184
|
const finalReport = this.report;
|
|
185
|
-
// Add toJSON method
|
|
186
|
-
finalReport.toJSON = () =>
|
|
185
|
+
// Add toJSON method (spread to avoid circular reference via toJSON itself)
|
|
186
|
+
finalReport.toJSON = () => {
|
|
187
|
+
const { toJSON: _, ...data } = finalReport;
|
|
188
|
+
return JSON.stringify(data, null, 2);
|
|
189
|
+
};
|
|
187
190
|
return finalReport;
|
|
188
191
|
}
|
|
189
192
|
/**
|
package/dist/runtime/types.d.ts
CHANGED
|
@@ -36,6 +36,8 @@ export interface EvalSpec {
|
|
|
36
36
|
budget?: string;
|
|
37
37
|
model?: string | "auto";
|
|
38
38
|
};
|
|
39
|
+
/** Filtering mode: skip = registered but never executed, only = exclusive execution */
|
|
40
|
+
mode?: "normal" | "skip" | "only";
|
|
39
41
|
}
|
|
40
42
|
/**
|
|
41
43
|
* Specification execution context
|
|
@@ -81,6 +83,10 @@ export interface EvalResult {
|
|
|
81
83
|
durationMs?: number;
|
|
82
84
|
/** Execution error if failed */
|
|
83
85
|
error?: string;
|
|
86
|
+
/** Generated output text */
|
|
87
|
+
output?: string;
|
|
88
|
+
/** Token count consumed */
|
|
89
|
+
tokens?: number;
|
|
84
90
|
}
|
|
85
91
|
/**
|
|
86
92
|
* Scoped runtime context - prevents cross-run contamination
|
|
@@ -183,6 +189,38 @@ export interface DefineEvalFunction {
|
|
|
183
189
|
* @param config - Complete specification configuration
|
|
184
190
|
*/
|
|
185
191
|
(config: SpecConfig): void;
|
|
192
|
+
/**
|
|
193
|
+
* Register a specification but skip it during execution.
|
|
194
|
+
* Follows the vitest/jest `.skip` convention.
|
|
195
|
+
*/
|
|
196
|
+
skip: DefineEvalFunction;
|
|
197
|
+
/**
|
|
198
|
+
* Register a specification for exclusive execution.
|
|
199
|
+
* If any spec is marked `.only`, only those specs run.
|
|
200
|
+
* Follows the vitest/jest `.only` convention.
|
|
201
|
+
*/
|
|
202
|
+
only: DefineEvalFunction;
|
|
203
|
+
/**
|
|
204
|
+
* Load a JSONL or CSV dataset and register one spec per row.
|
|
205
|
+
* Each row is passed as `context.input` (the parsed row object) to the executor.
|
|
206
|
+
*
|
|
207
|
+
* @param name - Base name for specs (each gets " [row N]" suffix)
|
|
208
|
+
* @param datasetPath - Path to a .jsonl or .csv file
|
|
209
|
+
* @param executor - Receives the parsed row as input
|
|
210
|
+
* @param options - Optional spec configuration applied to all rows
|
|
211
|
+
*
|
|
212
|
+
* @example
|
|
213
|
+
* ```ts
|
|
214
|
+
* defineEval.fromDataset("rag-accuracy", "./evals/golden.jsonl", async (ctx) => {
|
|
215
|
+
* const row = ctx.input; // { question: string, expected: string }
|
|
216
|
+
* const answer = await myRag(row.question);
|
|
217
|
+
* return createResult({ pass: answer.includes(row.expected), score: 100 });
|
|
218
|
+
* });
|
|
219
|
+
* ```
|
|
220
|
+
*/
|
|
221
|
+
fromDataset: <TRow extends Record<string, unknown> = Record<string, unknown>>(name: string, datasetPath: string, executor: (context: EvalContext & {
|
|
222
|
+
input: TRow;
|
|
223
|
+
}) => Promise<EvalResult>, options?: SpecOptions) => void;
|
|
186
224
|
}
|
|
187
225
|
/**
|
|
188
226
|
* Specification definition options
|
package/dist/snapshot.d.ts
CHANGED
|
@@ -166,6 +166,18 @@ export declare function loadSnapshot(name: string, dir?: string): Promise<Snapsh
|
|
|
166
166
|
* ```
|
|
167
167
|
*/
|
|
168
168
|
export declare function compareWithSnapshot(name: string, currentOutput: unknown, dir?: string): Promise<SnapshotComparison>;
|
|
169
|
+
/**
|
|
170
|
+
* Compare two saved snapshots by name (convenience function)
|
|
171
|
+
*
|
|
172
|
+
* @example
|
|
173
|
+
* ```typescript
|
|
174
|
+
* const comparison = await compareSnapshots('baseline', 'current');
|
|
175
|
+
* if (!comparison.matches) {
|
|
176
|
+
* console.log('Snapshots differ!', comparison.differences);
|
|
177
|
+
* }
|
|
178
|
+
* ```
|
|
179
|
+
*/
|
|
180
|
+
export declare function compareSnapshots(nameA: string, nameB: string, dir?: string): Promise<SnapshotComparison>;
|
|
169
181
|
/**
|
|
170
182
|
* Delete a snapshot (convenience function)
|
|
171
183
|
*/
|
package/dist/snapshot.js
CHANGED
|
@@ -55,6 +55,7 @@ exports.SnapshotManager = void 0;
|
|
|
55
55
|
exports.snapshot = snapshot;
|
|
56
56
|
exports.loadSnapshot = loadSnapshot;
|
|
57
57
|
exports.compareWithSnapshot = compareWithSnapshot;
|
|
58
|
+
exports.compareSnapshots = compareSnapshots;
|
|
58
59
|
exports.deleteSnapshot = deleteSnapshot;
|
|
59
60
|
exports.listSnapshots = listSnapshots;
|
|
60
61
|
// Environment check
|
|
@@ -130,7 +131,13 @@ class SnapshotManager {
|
|
|
130
131
|
if (!options?.overwrite && fs.existsSync(filePath)) {
|
|
131
132
|
throw new Error(`Snapshot '${name}' already exists. Use overwrite: true to update.`);
|
|
132
133
|
}
|
|
133
|
-
const serialized =
|
|
134
|
+
const serialized = output === undefined
|
|
135
|
+
? "undefined"
|
|
136
|
+
: output === null
|
|
137
|
+
? "null"
|
|
138
|
+
: typeof output === "string"
|
|
139
|
+
? output
|
|
140
|
+
: JSON.stringify(output);
|
|
134
141
|
const snapshotData = {
|
|
135
142
|
output: serialized,
|
|
136
143
|
metadata: {
|
|
@@ -310,6 +317,22 @@ async function compareWithSnapshot(name, currentOutput, dir) {
|
|
|
310
317
|
const manager = getSnapshotManager(dir);
|
|
311
318
|
return manager.compare(name, currentOutput);
|
|
312
319
|
}
|
|
320
|
+
/**
|
|
321
|
+
* Compare two saved snapshots by name (convenience function)
|
|
322
|
+
*
|
|
323
|
+
* @example
|
|
324
|
+
* ```typescript
|
|
325
|
+
* const comparison = await compareSnapshots('baseline', 'current');
|
|
326
|
+
* if (!comparison.matches) {
|
|
327
|
+
* console.log('Snapshots differ!', comparison.differences);
|
|
328
|
+
* }
|
|
329
|
+
* ```
|
|
330
|
+
*/
|
|
331
|
+
async function compareSnapshots(nameA, nameB, dir) {
|
|
332
|
+
const manager = getSnapshotManager(dir);
|
|
333
|
+
const snapshotB = await manager.load(nameB);
|
|
334
|
+
return manager.compare(nameA, snapshotB.output);
|
|
335
|
+
}
|
|
313
336
|
/**
|
|
314
337
|
* Delete a snapshot (convenience function)
|
|
315
338
|
*/
|
package/dist/testing.d.ts
CHANGED
|
@@ -51,8 +51,16 @@ export interface TestSuiteConfig {
|
|
|
51
51
|
stopOnFailure?: boolean;
|
|
52
52
|
/** Timeout per test case in ms (default: 30000) */
|
|
53
53
|
timeout?: number;
|
|
54
|
+
/** Alias for stopOnFailure — fail the entire suite on the first failing case. Useful in pre-commit hooks. */
|
|
55
|
+
strict?: boolean;
|
|
54
56
|
/** Retry failing cases N times (default: 0). Only failing cases are retried. */
|
|
55
57
|
retries?: number;
|
|
58
|
+
/** Base delay between retries in ms (default: 500). Exponential backoff: delay * 2^attempt. */
|
|
59
|
+
retryDelayMs?: number;
|
|
60
|
+
/** Add random jitter up to this fraction of the delay (default: 0.5 = ±50%). Set 0 to disable. */
|
|
61
|
+
retryJitter?: number;
|
|
62
|
+
/** Seed for deterministic case ordering. When set, cases are shuffled using this seed for reproducible runs. */
|
|
63
|
+
seed?: number;
|
|
56
64
|
}
|
|
57
65
|
export interface TestSuiteCaseResult {
|
|
58
66
|
/** Test case ID */
|
package/dist/testing.js
CHANGED
|
@@ -50,6 +50,26 @@ class TestSuite {
|
|
|
50
50
|
async run() {
|
|
51
51
|
const startTime = Date.now();
|
|
52
52
|
const results = [];
|
|
53
|
+
// Deterministic shuffle when seed is provided
|
|
54
|
+
const orderedCases = this.config.cases.map((c, i) => ({
|
|
55
|
+
case: c,
|
|
56
|
+
originalIndex: i,
|
|
57
|
+
}));
|
|
58
|
+
if (this.config.seed !== undefined) {
|
|
59
|
+
// mulberry32 seeded PRNG
|
|
60
|
+
let s = this.config.seed | 0;
|
|
61
|
+
const rand = () => {
|
|
62
|
+
s = (s + 0x6d2b79f5) | 0;
|
|
63
|
+
let t = Math.imul(s ^ (s >>> 15), 1 | s);
|
|
64
|
+
t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
|
|
65
|
+
return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
|
|
66
|
+
};
|
|
67
|
+
// Fisher-Yates shuffle
|
|
68
|
+
for (let i = orderedCases.length - 1; i > 0; i--) {
|
|
69
|
+
const j = Math.floor(rand() * (i + 1));
|
|
70
|
+
[orderedCases[i], orderedCases[j]] = [orderedCases[j], orderedCases[i]];
|
|
71
|
+
}
|
|
72
|
+
}
|
|
53
73
|
const runTestCase = async (testCase, index) => {
|
|
54
74
|
const caseStartTime = Date.now();
|
|
55
75
|
const id = testCase.id || `case-${index}`;
|
|
@@ -114,37 +134,52 @@ class TestSuite {
|
|
|
114
134
|
};
|
|
115
135
|
}
|
|
116
136
|
};
|
|
117
|
-
// Run tests
|
|
137
|
+
// Run tests (using orderedCases which may be seeded-shuffled)
|
|
118
138
|
if (this.config.parallel) {
|
|
119
|
-
results.push(...(await Promise.all(
|
|
139
|
+
results.push(...(await Promise.all(orderedCases.map((oc) => runTestCase(oc.case, oc.originalIndex)))));
|
|
120
140
|
}
|
|
121
141
|
else {
|
|
122
|
-
for (
|
|
123
|
-
const result = await runTestCase(
|
|
142
|
+
for (const oc of orderedCases) {
|
|
143
|
+
const result = await runTestCase(oc.case, oc.originalIndex);
|
|
124
144
|
results.push(result);
|
|
125
|
-
if (this.config.stopOnFailure
|
|
145
|
+
if ((this.config.stopOnFailure || this.config.strict) &&
|
|
146
|
+
!result.passed) {
|
|
126
147
|
break;
|
|
127
148
|
}
|
|
128
149
|
}
|
|
129
150
|
}
|
|
130
151
|
const retriedCases = [];
|
|
131
152
|
const retries = this.config.retries ?? 0;
|
|
153
|
+
const baseDelay = this.config.retryDelayMs ?? 500;
|
|
154
|
+
const jitterFraction = this.config.retryJitter ?? 0.5;
|
|
132
155
|
if (retries > 0 && results.length > 0) {
|
|
133
156
|
const failingIndices = results
|
|
134
157
|
.map((r, i) => (r.passed ? -1 : i))
|
|
135
158
|
.filter((i) => i >= 0);
|
|
136
159
|
for (let attempt = 0; attempt < retries && failingIndices.length > 0; attempt++) {
|
|
160
|
+
// Exponential backoff with jitter before each retry round
|
|
161
|
+
const delay = baseDelay * 2 ** attempt;
|
|
162
|
+
const jitter = jitterFraction > 0
|
|
163
|
+
? delay * jitterFraction * (Math.random() * 2 - 1)
|
|
164
|
+
: 0;
|
|
165
|
+
const waitMs = Math.max(0, Math.round(delay + jitter));
|
|
166
|
+
if (waitMs > 0) {
|
|
167
|
+
await new Promise((resolve) => setTimeout(resolve, waitMs));
|
|
168
|
+
}
|
|
137
169
|
const toRetry = [...failingIndices];
|
|
138
170
|
failingIndices.length = 0;
|
|
139
|
-
for (const
|
|
140
|
-
const tc =
|
|
141
|
-
const
|
|
171
|
+
for (const idx of toRetry) {
|
|
172
|
+
const tc = results[idx]; // retry based on result index
|
|
173
|
+
const originalCase = orderedCases.find((oc) => (oc.case.id || `case-${oc.originalIndex}`) === tc.id);
|
|
174
|
+
if (!originalCase)
|
|
175
|
+
continue;
|
|
176
|
+
const retryResult = await runTestCase(originalCase.case, originalCase.originalIndex);
|
|
142
177
|
if (retryResult.passed) {
|
|
143
|
-
results[
|
|
178
|
+
results[idx] = retryResult;
|
|
144
179
|
retriedCases.push(retryResult.id);
|
|
145
180
|
}
|
|
146
181
|
else {
|
|
147
|
-
failingIndices.push(
|
|
182
|
+
failingIndices.push(idx);
|
|
148
183
|
}
|
|
149
184
|
}
|
|
150
185
|
}
|
package/dist/version.d.ts
CHANGED
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
* X-EvalGate-SDK-Version: SDK package version
|
|
4
4
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
5
5
|
*/
|
|
6
|
-
export declare const SDK_VERSION = "2.2.
|
|
7
|
-
export declare const SPEC_VERSION = "2.2.
|
|
6
|
+
export declare const SDK_VERSION = "2.2.4";
|
|
7
|
+
export declare const SPEC_VERSION = "2.2.3";
|
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalGate-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "2.2.
|
|
10
|
-
exports.SPEC_VERSION = "2.2.
|
|
9
|
+
exports.SDK_VERSION = "2.2.4";
|
|
10
|
+
exports.SPEC_VERSION = "2.2.3";
|
package/dist/workflows.d.ts
CHANGED
|
@@ -170,6 +170,8 @@ export interface WorkflowTracerOptions {
|
|
|
170
170
|
captureFullPayloads?: boolean;
|
|
171
171
|
/** Debug mode */
|
|
172
172
|
debug?: boolean;
|
|
173
|
+
/** Offline mode — skip all API calls, keep in-memory state only */
|
|
174
|
+
offline?: boolean;
|
|
173
175
|
}
|
|
174
176
|
/**
|
|
175
177
|
* Agent span context
|