@evalgate/sdk 2.2.3 → 2.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +5 -0
  2. package/README.md +38 -1
  3. package/dist/assertions.d.ts +185 -5
  4. package/dist/assertions.js +496 -61
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +4 -0
  7. package/dist/cache.js +4 -0
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/validate.d.ts +37 -0
  24. package/dist/cli/validate.js +252 -0
  25. package/dist/cli/watch.d.ts +19 -0
  26. package/dist/cli/watch.js +175 -0
  27. package/dist/client.js +6 -13
  28. package/dist/constants.d.ts +2 -0
  29. package/dist/constants.js +5 -0
  30. package/dist/index.d.ts +7 -6
  31. package/dist/index.js +22 -6
  32. package/dist/integrations/openai.js +83 -60
  33. package/dist/logger.d.ts +3 -1
  34. package/dist/logger.js +2 -1
  35. package/dist/otel.d.ts +130 -0
  36. package/dist/otel.js +309 -0
  37. package/dist/runtime/eval.d.ts +14 -4
  38. package/dist/runtime/eval.js +127 -2
  39. package/dist/runtime/registry.d.ts +4 -2
  40. package/dist/runtime/registry.js +11 -3
  41. package/dist/runtime/run-report.d.ts +1 -1
  42. package/dist/runtime/run-report.js +7 -4
  43. package/dist/runtime/types.d.ts +38 -0
  44. package/dist/testing.d.ts +8 -0
  45. package/dist/testing.js +45 -10
  46. package/dist/version.d.ts +1 -1
  47. package/dist/version.js +1 -1
  48. package/dist/workflows.d.ts +2 -0
  49. package/dist/workflows.js +184 -102
  50. package/package.json +8 -1
@@ -39,13 +39,21 @@ var __importStar = (this && this.__importStar) || (function () {
39
39
  };
40
40
  })();
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
- exports.evalai = exports.defineEval = void 0;
42
+ exports.createLocalExecutor = exports.withRuntime = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.evalai = exports.defineEval = void 0;
43
+ exports.getFilteredSpecs = getFilteredSpecs;
43
44
  exports.defineSuite = defineSuite;
44
45
  exports.createContext = createContext;
46
+ exports.createEvalContext = createContext;
45
47
  exports.createResult = createResult;
46
48
  const crypto = __importStar(require("node:crypto"));
49
+ const fs = __importStar(require("node:fs"));
47
50
  const path = __importStar(require("node:path"));
48
51
  const registry_1 = require("./registry");
52
+ Object.defineProperty(exports, "createEvalRuntime", { enumerable: true, get: function () { return registry_1.createEvalRuntime; } });
53
+ Object.defineProperty(exports, "disposeActiveRuntime", { enumerable: true, get: function () { return registry_1.disposeActiveRuntime; } });
54
+ Object.defineProperty(exports, "getActiveRuntime", { enumerable: true, get: function () { return registry_1.getActiveRuntime; } });
55
+ Object.defineProperty(exports, "setActiveRuntime", { enumerable: true, get: function () { return registry_1.setActiveRuntime; } });
56
+ Object.defineProperty(exports, "withRuntime", { enumerable: true, get: function () { return registry_1.withRuntime; } });
49
57
  const types_1 = require("./types");
50
58
  /**
51
59
  * Extract AST position from call stack
@@ -159,7 +167,7 @@ function createSpecConfig(nameOrConfig, executor, options) {
159
167
  /**
160
168
  * Core defineEval function implementation
161
169
  */
162
- function defineEvalImpl(nameOrConfig, executor, options) {
170
+ function defineEvalWithMode(mode, nameOrConfig, executor, options) {
163
171
  // Get caller position for identity
164
172
  const callerPosition = getCallerPosition();
165
173
  // Create specification configuration
@@ -187,15 +195,124 @@ function defineEvalImpl(nameOrConfig, executor, options) {
187
195
  budget: config.budget,
188
196
  model: config.model,
189
197
  },
198
+ mode,
190
199
  };
191
200
  // Register specification
192
201
  runtime.register(spec);
193
202
  }
203
+ function defineEvalImpl(nameOrConfig, executor, options) {
204
+ defineEvalWithMode("normal", nameOrConfig, executor, options);
205
+ }
206
+ function defineEvalSkipImpl(nameOrConfig, executor, options) {
207
+ defineEvalWithMode("skip", nameOrConfig, executor, options);
208
+ }
209
+ function defineEvalOnlyImpl(nameOrConfig, executor, options) {
210
+ defineEvalWithMode("only", nameOrConfig, executor, options);
211
+ }
194
212
  /**
195
213
  * Export the defineEval function with proper typing
196
214
  * This is the main DSL entry point
197
215
  */
198
216
  exports.defineEval = defineEvalImpl;
217
+ // Attach .skip and .only modifiers (vitest/jest convention)
218
+ exports.defineEval.skip = defineEvalSkipImpl;
219
+ exports.defineEval.only = defineEvalOnlyImpl;
220
+ /**
221
+ * Parse a JSONL file into an array of row objects.
222
+ * Each line must be a valid JSON object; blank lines are skipped.
223
+ */
224
+ function parseJsonl(content) {
225
+ return content
226
+ .split("\n")
227
+ .map((line) => line.trim())
228
+ .filter((line) => line.length > 0)
229
+ .map((line, i) => {
230
+ try {
231
+ return JSON.parse(line);
232
+ }
233
+ catch {
234
+ throw new types_1.SpecRegistrationError(`Invalid JSON on line ${i + 1} of dataset`);
235
+ }
236
+ });
237
+ }
238
+ /**
239
+ * Parse a simple CSV file into an array of row objects.
240
+ * First line is treated as headers. Values are unquoted strings.
241
+ * For complex CSV (quoted fields, escapes), use a dedicated library.
242
+ */
243
+ function parseCsv(content) {
244
+ const lines = content
245
+ .split("\n")
246
+ .map((l) => l.trim())
247
+ .filter((l) => l.length > 0);
248
+ if (lines.length < 2)
249
+ return [];
250
+ const headers = lines[0].split(",").map((h) => h.trim());
251
+ return lines.slice(1).map((line) => {
252
+ const values = line.split(",").map((v) => v.trim());
253
+ const row = {};
254
+ for (let i = 0; i < headers.length; i++) {
255
+ row[headers[i]] = values[i] ?? "";
256
+ }
257
+ return row;
258
+ });
259
+ }
260
+ /**
261
+ * Load a JSONL/NDJSON, CSV, or JSON dataset and register one spec per row.
262
+ */
263
+ function fromDatasetImpl(name, datasetPath, executor, options) {
264
+ const resolvedPath = path.isAbsolute(datasetPath)
265
+ ? datasetPath
266
+ : path.resolve(process.cwd(), datasetPath);
267
+ if (!fs.existsSync(resolvedPath)) {
268
+ throw new types_1.SpecRegistrationError(`Dataset file not found: ${resolvedPath}`);
269
+ }
270
+ const content = fs.readFileSync(resolvedPath, "utf8");
271
+ const ext = path.extname(resolvedPath).toLowerCase();
272
+ let rows;
273
+ if (ext === ".jsonl" || ext === ".ndjson") {
274
+ rows = parseJsonl(content);
275
+ }
276
+ else if (ext === ".csv") {
277
+ rows = parseCsv(content);
278
+ }
279
+ else if (ext === ".json") {
280
+ const parsed = JSON.parse(content);
281
+ rows = Array.isArray(parsed) ? parsed : [parsed];
282
+ }
283
+ else {
284
+ throw new types_1.SpecRegistrationError(`Unsupported dataset format: ${ext}. Use .jsonl, .ndjson, .csv, or .json`);
285
+ }
286
+ if (rows.length === 0) {
287
+ throw new types_1.SpecRegistrationError(`Dataset is empty: ${resolvedPath}`);
288
+ }
289
+ for (let i = 0; i < rows.length; i++) {
290
+ const row = rows[i];
291
+ const specName = `${name} - row ${i + 1}`;
292
+ const wrappedExecutor = (context) => executor({ ...context, input: row });
293
+ defineEvalWithMode("normal", specName, wrappedExecutor, {
294
+ ...options,
295
+ metadata: {
296
+ ...options?.metadata,
297
+ datasetPath: resolvedPath,
298
+ datasetRow: i + 1,
299
+ },
300
+ });
301
+ }
302
+ }
303
+ exports.defineEval.fromDataset = fromDatasetImpl;
304
+ /**
305
+ * Filter a list of specs according to skip/only semantics:
306
+ * - If any spec has mode === "only", return only those specs
307
+ * - Otherwise, return all specs except those with mode === "skip"
308
+ */
309
+ function getFilteredSpecs(specs) {
310
+ const onlySpecs = specs.filter((s) => s.mode === "only");
311
+ if (onlySpecs.length > 0) {
312
+ return onlySpecs;
313
+ }
314
+ return specs.filter((s) => s.mode !== "skip");
315
+ }
199
316
  /**
200
317
  * Convenience export for evalai.test() alias (backward compatibility)
201
318
  * Provides alternative naming that matches the original roadmap vision
@@ -245,9 +362,17 @@ function createResult(config) {
245
362
  assertions: config.assertions,
246
363
  metadata: config.metadata,
247
364
  error: config.error,
365
+ output: config.output,
366
+ durationMs: config.durationMs,
367
+ tokens: config.tokens,
248
368
  };
249
369
  }
250
370
  /**
251
371
  * Default export for convenience
252
372
  */
373
+ // Register defineEval with registry to break circular dependency
374
+ (0, registry_1._registerDefineEval)(exports.defineEval);
375
+ // Re-export createLocalExecutor from executor.ts
376
+ var executor_1 = require("./executor");
377
+ Object.defineProperty(exports, "createLocalExecutor", { enumerable: true, get: function () { return executor_1.createLocalExecutor; } });
253
378
  exports.default = exports.defineEval;
@@ -4,7 +4,9 @@
4
4
  * Scoped registry with proper lifecycle management.
5
5
  * Prevents cross-run contamination and memory leaks.
6
6
  */
7
- import type { EvalRuntime } from "./types";
7
+ import type { DefineEvalFunction, EvalRuntime } from "./types";
8
+ /** @internal Called by eval.ts to register defineEval without circular import */
9
+ export declare function _registerDefineEval(fn: (...args: unknown[]) => unknown): void;
8
10
  /**
9
11
  * Runtime interface with lifecycle management
10
12
  * Ensures proper cleanup and prevents resource leaks
@@ -13,7 +15,7 @@ export interface RuntimeHandle {
13
15
  /** Runtime instance */
14
16
  runtime: EvalRuntime;
15
17
  /** defineEval function bound to this runtime */
16
- defineEval: typeof import("./eval").defineEval;
18
+ defineEval: DefineEvalFunction;
17
19
  /** Dispose runtime and clean up resources */
18
20
  dispose(): void;
19
21
  /** Create runtime snapshot for persistence */
@@ -39,6 +39,7 @@ var __importStar = (this && this.__importStar) || (function () {
39
39
  };
40
40
  })();
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
+ exports._registerDefineEval = _registerDefineEval;
42
43
  exports.createEvalRuntime = createEvalRuntime;
43
44
  exports.withRuntime = withRuntime;
44
45
  exports.getActiveRuntime = getActiveRuntime;
@@ -47,6 +48,12 @@ exports.disposeActiveRuntime = disposeActiveRuntime;
47
48
  const crypto = __importStar(require("node:crypto"));
48
49
  const path = __importStar(require("node:path"));
49
50
  const types_1 = require("./types");
51
+ // Registration pattern to break circular dependency (eval.ts imports from registry.ts)
52
+ let _registeredDefineEval = null;
53
+ /** @internal Called by eval.ts to register defineEval without circular import */
54
+ function _registerDefineEval(fn) {
55
+ _registeredDefineEval = fn;
56
+ }
50
57
  /**
51
58
  * Runtime registry implementation
52
59
  * Scoped lifecycle with proper memory management
@@ -326,9 +333,10 @@ function createEvalRuntime(projectRootOrConfig = process.cwd()) {
326
333
  const previousRuntime = activeRuntime;
327
334
  activeRuntime = runtime;
328
335
  try {
329
- // Import and call defineEval
330
- const { defineEval } = require("./eval");
331
- return defineEval(nameOrConfig, executor, options);
336
+ if (!_registeredDefineEval) {
337
+ throw new types_1.RuntimeError("defineEval not registered. Ensure eval.ts is imported before calling createEvalRuntime.");
338
+ }
339
+ return _registeredDefineEval(nameOrConfig, executor, options);
332
340
  }
333
341
  finally {
334
342
  // Restore previous runtime
@@ -159,7 +159,7 @@ export declare class RunReportBuilder {
159
159
  addResult(testId: string, testName: string, filePath: string, position: {
160
160
  line: number;
161
161
  column: number;
162
- }, input: string, result: EnhancedEvalResult): void;
162
+ }, input: string, result: EnhancedEvalResult, tags?: string[]): void;
163
163
  /**
164
164
  * Update summary statistics
165
165
  */
@@ -77,7 +77,7 @@ class RunReportBuilder {
77
77
  /**
78
78
  * Add a test result to the report
79
79
  */
80
- addResult(testId, testName, filePath, position, input, result) {
80
+ addResult(testId, testName, filePath, position, input, result, tags) {
81
81
  const runResult = {
82
82
  testId,
83
83
  testName,
@@ -88,7 +88,7 @@ class RunReportBuilder {
88
88
  score: result.score,
89
89
  durationMs: result.durationMs || 0,
90
90
  metadata: result.metadata,
91
- tags: [], // TODO: Extract from spec
91
+ tags: tags ?? [],
92
92
  assertions: result.assertions?.map((assertion, index) => ({
93
93
  name: assertion.name || `assertion-${index}`,
94
94
  passed: assertion.passed,
@@ -182,8 +182,11 @@ class RunReportBuilder {
182
182
  // Set completion timestamp
183
183
  this.report.finishedAt = new Date().toISOString();
184
184
  const finalReport = this.report;
185
- // Add toJSON method
186
- finalReport.toJSON = () => JSON.stringify(finalReport, null, 2);
185
+ // Add toJSON method (spread to avoid circular reference via toJSON itself)
186
+ finalReport.toJSON = () => {
187
+ const { toJSON: _, ...data } = finalReport;
188
+ return JSON.stringify(data, null, 2);
189
+ };
187
190
  return finalReport;
188
191
  }
189
192
  /**
@@ -36,6 +36,8 @@ export interface EvalSpec {
36
36
  budget?: string;
37
37
  model?: string | "auto";
38
38
  };
39
+ /** Filtering mode: skip = registered but never executed, only = exclusive execution */
40
+ mode?: "normal" | "skip" | "only";
39
41
  }
40
42
  /**
41
43
  * Specification execution context
@@ -81,6 +83,10 @@ export interface EvalResult {
81
83
  durationMs?: number;
82
84
  /** Execution error if failed */
83
85
  error?: string;
86
+ /** Generated output text */
87
+ output?: string;
88
+ /** Token count consumed */
89
+ tokens?: number;
84
90
  }
85
91
  /**
86
92
  * Scoped runtime context - prevents cross-run contamination
@@ -183,6 +189,38 @@ export interface DefineEvalFunction {
183
189
  * @param config - Complete specification configuration
184
190
  */
185
191
  (config: SpecConfig): void;
192
+ /**
193
+ * Register a specification but skip it during execution.
194
+ * Follows the vitest/jest `.skip` convention.
195
+ */
196
+ skip: DefineEvalFunction;
197
+ /**
198
+ * Register a specification for exclusive execution.
199
+ * If any spec is marked `.only`, only those specs run.
200
+ * Follows the vitest/jest `.only` convention.
201
+ */
202
+ only: DefineEvalFunction;
203
+ /**
204
+ * Load a JSONL/NDJSON, CSV, or JSON dataset and register one spec per row.
205
+ * Each row is passed as `context.input` (the parsed row object) to the executor.
206
+ *
207
+ * @param name - Base name for specs (each gets a " - row N" suffix, N starting at 1)
208
+ * @param datasetPath - Path to a .jsonl, .ndjson, .csv, or .json file (relative paths resolve against process.cwd())
209
+ * @param executor - Receives the parsed row as input
210
+ * @param options - Optional spec configuration applied to all rows
211
+ *
212
+ * @example
213
+ * ```ts
214
+ * defineEval.fromDataset("rag-accuracy", "./evals/golden.jsonl", async (ctx) => {
215
+ * const row = ctx.input; // { question: string, expected: string }
216
+ * const answer = await myRag(row.question);
217
+ * return createResult({ pass: answer.includes(row.expected), score: 100 });
218
+ * });
219
+ * ```
220
+ */
221
+ fromDataset: <TRow extends Record<string, unknown> = Record<string, unknown>>(name: string, datasetPath: string, executor: (context: EvalContext & {
222
+ input: TRow;
223
+ }) => Promise<EvalResult>, options?: SpecOptions) => void;
186
224
  }
187
225
  /**
188
226
  * Specification definition options
package/dist/testing.d.ts CHANGED
@@ -51,8 +51,16 @@ export interface TestSuiteConfig {
51
51
  stopOnFailure?: boolean;
52
52
  /** Timeout per test case in ms (default: 30000) */
53
53
  timeout?: number;
54
+ /** Alias for stopOnFailure — fail the entire suite on the first failing case. Useful in pre-commit hooks. */
55
+ strict?: boolean;
54
56
  /** Retry failing cases N times (default: 0). Only failing cases are retried. */
55
57
  retries?: number;
58
+ /** Base delay between retries in ms (default: 500). Exponential backoff: delay * 2^attempt. */
59
+ retryDelayMs?: number;
60
+ /** Add random jitter up to this fraction of the delay (default: 0.5 = ±50%). Set 0 to disable. */
61
+ retryJitter?: number;
62
+ /** Seed for deterministic case ordering. When set, cases are shuffled using this seed for reproducible runs. */
63
+ seed?: number;
56
64
  }
57
65
  export interface TestSuiteCaseResult {
58
66
  /** Test case ID */
package/dist/testing.js CHANGED
@@ -50,6 +50,26 @@ class TestSuite {
50
50
  async run() {
51
51
  const startTime = Date.now();
52
52
  const results = [];
53
+ // Deterministic shuffle when seed is provided
54
+ const orderedCases = this.config.cases.map((c, i) => ({
55
+ case: c,
56
+ originalIndex: i,
57
+ }));
58
+ if (this.config.seed !== undefined) {
59
+ // mulberry32 seeded PRNG
60
+ let s = this.config.seed | 0;
61
+ const rand = () => {
62
+ s = (s + 0x6d2b79f5) | 0;
63
+ let t = Math.imul(s ^ (s >>> 15), 1 | s);
64
+ t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
65
+ return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
66
+ };
67
+ // Fisher-Yates shuffle
68
+ for (let i = orderedCases.length - 1; i > 0; i--) {
69
+ const j = Math.floor(rand() * (i + 1));
70
+ [orderedCases[i], orderedCases[j]] = [orderedCases[j], orderedCases[i]];
71
+ }
72
+ }
53
73
  const runTestCase = async (testCase, index) => {
54
74
  const caseStartTime = Date.now();
55
75
  const id = testCase.id || `case-${index}`;
@@ -114,37 +134,52 @@ class TestSuite {
114
134
  };
115
135
  }
116
136
  };
117
- // Run tests
137
+ // Run tests (using orderedCases which may be seeded-shuffled)
118
138
  if (this.config.parallel) {
119
- results.push(...(await Promise.all(this.config.cases.map((tc, i) => runTestCase(tc, i)))));
139
+ results.push(...(await Promise.all(orderedCases.map((oc) => runTestCase(oc.case, oc.originalIndex)))));
120
140
  }
121
141
  else {
122
- for (let i = 0; i < this.config.cases.length; i++) {
123
- const result = await runTestCase(this.config.cases[i], i);
142
+ for (const oc of orderedCases) {
143
+ const result = await runTestCase(oc.case, oc.originalIndex);
124
144
  results.push(result);
125
- if (this.config.stopOnFailure && !result.passed) {
145
+ if ((this.config.stopOnFailure || this.config.strict) &&
146
+ !result.passed) {
126
147
  break;
127
148
  }
128
149
  }
129
150
  }
130
151
  const retriedCases = [];
131
152
  const retries = this.config.retries ?? 0;
153
+ const baseDelay = this.config.retryDelayMs ?? 500;
154
+ const jitterFraction = this.config.retryJitter ?? 0.5;
132
155
  if (retries > 0 && results.length > 0) {
133
156
  const failingIndices = results
134
157
  .map((r, i) => (r.passed ? -1 : i))
135
158
  .filter((i) => i >= 0);
136
159
  for (let attempt = 0; attempt < retries && failingIndices.length > 0; attempt++) {
160
+ // Exponential backoff with jitter before each retry round
161
+ const delay = baseDelay * 2 ** attempt;
162
+ const jitter = jitterFraction > 0
163
+ ? delay * jitterFraction * (Math.random() * 2 - 1)
164
+ : 0;
165
+ const waitMs = Math.max(0, Math.round(delay + jitter));
166
+ if (waitMs > 0) {
167
+ await new Promise((resolve) => setTimeout(resolve, waitMs));
168
+ }
137
169
  const toRetry = [...failingIndices];
138
170
  failingIndices.length = 0;
139
- for (const i of toRetry) {
140
- const tc = this.config.cases[i];
141
- const retryResult = await runTestCase(tc, i);
171
+ for (const idx of toRetry) {
172
+ const tc = results[idx]; // retry based on result index
173
+ const originalCase = orderedCases.find((oc) => (oc.case.id || `case-${oc.originalIndex}`) === tc.id);
174
+ if (!originalCase)
175
+ continue;
176
+ const retryResult = await runTestCase(originalCase.case, originalCase.originalIndex);
142
177
  if (retryResult.passed) {
143
- results[i] = retryResult;
178
+ results[idx] = retryResult;
144
179
  retriedCases.push(retryResult.id);
145
180
  }
146
181
  else {
147
- failingIndices.push(i);
182
+ failingIndices.push(idx);
148
183
  }
149
184
  }
150
185
  }
package/dist/version.d.ts CHANGED
@@ -3,5 +3,5 @@
3
3
  * X-EvalGate-SDK-Version: SDK package version
4
4
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
5
5
  */
6
- export declare const SDK_VERSION = "2.2.3";
6
+ export declare const SDK_VERSION = "2.2.4";
7
7
  export declare const SPEC_VERSION = "2.2.3";
package/dist/version.js CHANGED
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
6
6
  * X-EvalGate-SDK-Version: SDK package version
7
7
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
8
8
  */
9
- exports.SDK_VERSION = "2.2.3";
9
+ exports.SDK_VERSION = "2.2.4";
10
10
  exports.SPEC_VERSION = "2.2.3";
@@ -170,6 +170,8 @@ export interface WorkflowTracerOptions {
170
170
  captureFullPayloads?: boolean;
171
171
  /** Debug mode */
172
172
  debug?: boolean;
173
+ /** Offline mode — skip all API calls, keep in-memory state only */
174
+ offline?: boolean;
173
175
  }
174
176
  /**
175
177
  * Agent span context