@evalgate/sdk 2.2.2 → 2.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +40 -1
  3. package/dist/assertions.d.ts +194 -10
  4. package/dist/assertions.js +525 -73
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +5 -1
  7. package/dist/cache.js +5 -1
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/upgrade.js +5 -0
  24. package/dist/cli/validate.d.ts +37 -0
  25. package/dist/cli/validate.js +252 -0
  26. package/dist/cli/watch.d.ts +19 -0
  27. package/dist/cli/watch.js +175 -0
  28. package/dist/client.js +6 -13
  29. package/dist/constants.d.ts +2 -0
  30. package/dist/constants.js +5 -0
  31. package/dist/errors.js +7 -0
  32. package/dist/export.js +2 -2
  33. package/dist/index.d.ts +10 -9
  34. package/dist/index.js +24 -7
  35. package/dist/integrations/anthropic.js +6 -6
  36. package/dist/integrations/openai.js +84 -61
  37. package/dist/logger.d.ts +3 -1
  38. package/dist/logger.js +2 -1
  39. package/dist/otel.d.ts +130 -0
  40. package/dist/otel.js +309 -0
  41. package/dist/pagination.d.ts +13 -2
  42. package/dist/pagination.js +28 -2
  43. package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
  44. package/dist/runtime/eval.d.ts +14 -4
  45. package/dist/runtime/eval.js +127 -2
  46. package/dist/runtime/executor.d.ts +3 -2
  47. package/dist/runtime/executor.js +3 -2
  48. package/dist/runtime/registry.d.ts +8 -3
  49. package/dist/runtime/registry.js +15 -4
  50. package/dist/runtime/run-report.d.ts +1 -1
  51. package/dist/runtime/run-report.js +7 -4
  52. package/dist/runtime/types.d.ts +38 -0
  53. package/dist/snapshot.d.ts +12 -0
  54. package/dist/snapshot.js +24 -1
  55. package/dist/testing.d.ts +8 -0
  56. package/dist/testing.js +45 -10
  57. package/dist/version.d.ts +2 -2
  58. package/dist/version.js +2 -2
  59. package/dist/workflows.d.ts +2 -0
  60. package/dist/workflows.js +184 -102
  61. package/package.json +8 -1
package/dist/otel.js ADDED
@@ -0,0 +1,309 @@
1
+ "use strict";
2
+ /**
3
+ * OpenTelemetry Export for WorkflowTracer
4
+ *
5
+ * Converts WorkflowTracer spans, decisions, and costs into
6
+ * OpenTelemetry-compatible span data for export to any OTEL collector.
7
+ *
8
+ * Usage:
9
+ * import { OTelExporter } from "@evalgate/sdk/otel";
10
+ *
11
+ * const exporter = new OTelExporter({ endpoint: "http://localhost:4318/v1/traces" });
12
+ * const tracer = new WorkflowTracer(client, { debug: true });
13
+ * // ... run workflow ...
14
+ * await exporter.send(exporter.exportFromTracer(tracer));
15
+ */
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.OTelExporter = void 0;
18
+ exports.createOTelExporter = createOTelExporter;
/**
 * Build a trace identifier: 16 pseudo-random bytes rendered as a
 * 32-character lowercase hexadecimal string.
 */
function generateTraceId() {
    let hex = "";
    for (let n = 0; n < 16; n++) {
        // One Math.random() draw per byte, scaled into 0..255.
        const byte = Math.floor(Math.random() * 256);
        hex += byte.toString(16).padStart(2, "0");
    }
    return hex;
}
/**
 * Build a span identifier: 8 pseudo-random bytes rendered as a
 * 16-character lowercase hexadecimal string.
 */
function generateSpanId() {
    const hexPairs = Array.from({ length: 8 }, () => {
        // One Math.random() draw per byte, scaled into 0..255.
        const byte = Math.floor(Math.random() * 256);
        return byte.toString(16).padStart(2, "0");
    });
    return hexPairs.join("");
}
/**
 * Convert a millisecond timestamp into a decimal nanosecond string.
 *
 * Integer inputs (and anything that is not a JS number) take the direct
 * `BigInt(ms) * 1_000_000` path. Fractional millisecond values — e.g. a start
 * time plus a high-resolution duration — are also accepted: `BigInt()` rejects
 * non-integer numbers, so the whole and fractional parts are converted
 * separately, which also avoids losing precision on epoch-sized values.
 *
 * @param ms Milliseconds (may be fractional).
 * @returns The equivalent number of nanoseconds as a base-10 string.
 * @throws {RangeError} If `ms` is NaN or infinite.
 */
function msToNano(ms) {
    const NANOS_PER_MS = 1000000;
    if (typeof ms !== "number" || Number.isInteger(ms)) {
        return `${BigInt(ms) * BigInt(NANOS_PER_MS)}`;
    }
    if (!Number.isFinite(ms)) {
        throw new RangeError(`msToNano: expected a finite number of milliseconds, got ${String(ms)}`);
    }
    const wholeMs = Math.trunc(ms);
    // The remainder is < 1 ms in magnitude, so scaling it cannot overflow.
    const remainderNanos = Math.round((ms - wholeMs) * NANOS_PER_MS);
    return `${BigInt(wholeMs) * BigInt(NANOS_PER_MS) + BigInt(remainderNanos)}`;
}
/**
 * Create an OTLP/JSON attribute (`{ key, value: AnyValue }`).
 *
 * Mapping by runtime type of `value`:
 * - string            -> `stringValue`
 * - safe integer      -> `intValue` (decimal string)
 * - other finite num  -> `doubleValue`
 * - NaN / +-Infinity  -> `doubleValue` as the string "NaN" / "Infinity" /
 *                        "-Infinity" (JSON has no literal for these)
 * - boolean           -> `boolValue`
 * - bigint            -> `intValue` (decimal string)
 * - null / undefined  -> empty value object
 * - anything else     -> `stringValue` holding its JSON (or String()) form
 *
 * @param key Attribute name.
 * @param value Attribute value.
 * @returns An object of the form `{ key, value: { <kind>Value: ... } }`.
 */
function attr(key, value) {
    switch (typeof value) {
        case "string":
            return { key, value: { stringValue: value } };
        case "number":
            // Only safe integers go out as intValue; String(1e21) would be
            // "1e+21", which is not a valid integer literal.
            if (Number.isSafeInteger(value)) {
                return { key, value: { intValue: String(value) } };
            }
            if (!Number.isFinite(value)) {
                return { key, value: { doubleValue: String(value) } };
            }
            return { key, value: { doubleValue: value } };
        case "boolean":
            return { key, value: { boolValue: value } };
        case "bigint":
            return { key, value: { intValue: value.toString() } };
        case "undefined":
            return { key, value: {} };
        default: {
            if (value === null) {
                return { key, value: {} };
            }
            // Objects, arrays, symbols, functions: never label them as booleans.
            let text;
            try {
                text = JSON.stringify(value);
            }
            catch {
                text = undefined;
            }
            return { key, value: { stringValue: text ?? String(value) } };
        }
    }
}
/**
 * OpenTelemetry exporter for EvalGate WorkflowTracer data and run results.
 *
 * The `export*` methods only build an OTLP/JSON-shaped payload
 * (`{ resourceSpans: [...] }`) and return it synchronously; nothing leaves the
 * process until that payload is handed to `send()`.
 *
 * Numeric codes used in spans follow the OTLP trace proto:
 * span `kind: 1` is INTERNAL, status `code: 1` is OK and `code: 2` is ERROR.
 */
class OTelExporter {
    /**
     * @param options Optional settings. Recognised keys: `endpoint` (full URL
     *   that `send()` POSTs to, default `http://localhost:4318/v1/traces`),
     *   `serviceName`, `resourceAttributes`, `sdkVersion`, `headers`.
     */
    constructor(options = {}) {
        // A default parameter only covers `undefined`; tolerate `null` as well.
        const opts = options ?? {};
        this.options = {
            endpoint: opts.endpoint ?? "http://localhost:4318/v1/traces",
            serviceName: opts.serviceName ?? "evalgate",
            resourceAttributes: opts.resourceAttributes ?? {},
            sdkVersion: opts.sdkVersion ?? "2.2.4",
            headers: opts.headers ?? {},
        };
    }
    /**
     * Build an OTLP payload from a WorkflowTracer instance.
     *
     * Reads the tracer's current workflow, handoffs, decisions and costs.
     * When a workflow is active it becomes the root span and every other span
     * is parented to it; when none is active the spans are emitted without a
     * parent instead of pointing at a root span that is not in the payload.
     *
     * @param tracer Object exposing `getCurrentWorkflow()`, `getHandoffs()`,
     *   `getDecisions()` and `getCosts()`.
     * @returns The OTLP payload (not sent).
     */
    exportFromTracer(tracer) {
        const workflow = tracer.getCurrentWorkflow();
        const handoffs = tracer.getHandoffs();
        const decisions = tracer.getDecisions();
        const costs = tracer.getCosts();
        const traceId = generateTraceId();
        const rootSpanId = generateSpanId();
        const now = Date.now();
        const spans = [];
        const parentSpanId = workflow ? rootSpanId : undefined;
        // Root workflow span
        if (workflow) {
            const startedMs = new Date(workflow.startedAt).getTime();
            spans.push({
                traceId,
                spanId: rootSpanId,
                name: `workflow.${workflow.name}`,
                kind: 1,
                // An unparseable start time collapses to "now" rather than throwing.
                startTimeUnixNano: msToNano(Number.isNaN(startedMs) ? now : startedMs),
                endTimeUnixNano: msToNano(now),
                attributes: [
                    attr("evalgate.workflow.name", workflow.name),
                    attr("evalgate.workflow.id", workflow.id),
                    attr("evalgate.workflow.trace_id", workflow.traceId),
                ],
                status: { code: 1 },
                events: [],
            });
        }
        // Decision spans: no timestamp is read from a decision, so they are
        // laid out 1 ms apart, ending just before `now`, preserving order.
        for (let i = 0; i < decisions.length; i++) {
            const decision = decisions[i];
            const spanId = generateSpanId();
            spans.push(this.decisionToSpan(traceId, spanId, parentSpanId, decision, now - decisions.length + i));
        }
        // Handoff spans (timestamped from the handoff itself)
        for (let i = 0; i < handoffs.length; i++) {
            const handoff = handoffs[i];
            const spanId = generateSpanId();
            spans.push(this.handoffToSpan(traceId, spanId, parentSpanId, handoff));
        }
        // Cost spans: synthetic timestamps, same scheme as decisions.
        for (let i = 0; i < costs.length; i++) {
            const cost = costs[i];
            const spanId = generateSpanId();
            spans.push(this.costToSpan(traceId, spanId, parentSpanId, cost, now - costs.length + i));
        }
        return this.buildPayload(spans);
    }
    /**
     * Build an OTLP payload from a run result: one root span for the run and
     * one child span per spec.
     *
     * Spec spans are laid end to end starting at `metadata.startedAt`, each
     * lasting `spec.result.duration` milliseconds; no per-spec start time is
     * read.
     *
     * @param runResult Object with `runId`, `metadata`, `summary` and `results`.
     * @returns The OTLP payload (not sent).
     */
    exportRunResult(runResult) {
        const traceId = generateTraceId();
        const rootSpanId = generateSpanId();
        const spans = [];
        // Root run span
        spans.push({
            traceId,
            spanId: rootSpanId,
            name: `evalgate.run.${runResult.runId}`,
            kind: 1,
            startTimeUnixNano: msToNano(runResult.metadata.startedAt),
            endTimeUnixNano: msToNano(runResult.metadata.completedAt),
            attributes: [
                attr("evalgate.run.id", runResult.runId),
                attr("evalgate.run.mode", runResult.metadata.mode),
                attr("evalgate.run.duration_ms", runResult.metadata.duration),
                attr("evalgate.run.pass_rate", runResult.summary.passRate),
                attr("evalgate.run.passed", runResult.summary.passed),
                attr("evalgate.run.failed", runResult.summary.failed),
            ],
            status: {
                code: runResult.summary.failed > 0 ? 2 : 1,
            },
            events: [],
        });
        // Per-spec child spans
        let offset = 0;
        for (const spec of runResult.results) {
            const spanId = generateSpanId();
            const specStart = runResult.metadata.startedAt + offset;
            const specEnd = specStart + spec.result.duration;
            offset += spec.result.duration;
            const attributes = [
                attr("evalgate.spec.id", spec.specId),
                attr("evalgate.spec.name", spec.name),
                attr("evalgate.spec.file", spec.filePath),
                attr("evalgate.spec.status", spec.result.status),
                attr("evalgate.spec.duration_ms", spec.result.duration),
            ];
            if (spec.result.score !== undefined) {
                attributes.push(attr("evalgate.spec.score", spec.result.score));
            }
            spans.push({
                traceId,
                spanId,
                parentSpanId: rootSpanId,
                name: `evalgate.spec.${spec.name}`,
                kind: 1,
                // Durations may be fractional; round to whole milliseconds so
                // the BigInt conversion cannot reject the value.
                startTimeUnixNano: msToNano(Math.round(specStart)),
                endTimeUnixNano: msToNano(Math.round(specEnd)),
                attributes,
                status: {
                    code: spec.result.status === "passed" ? 1 : 2,
                    message: spec.result.error,
                },
                events: [],
            });
        }
        return this.buildPayload(spans);
    }
    /**
     * POST a payload as JSON to `options.endpoint`, merging `options.headers`
     * over the default `Content-Type`.
     *
     * Never throws: transport failures and non-2xx responses are both logged
     * via `console.warn` and reported as `false`.
     *
     * @param payload Value returned by `exportFromTracer` / `exportRunResult`.
     * @returns `true` when the collector answered with a 2xx status.
     */
    async send(payload) {
        try {
            const response = await fetch(this.options.endpoint, {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    ...this.options.headers,
                },
                body: JSON.stringify(payload),
            });
            if (!response.ok) {
                console.warn(`[OTelExporter] Collector responded with HTTP ${response.status} ${response.statusText}`);
            }
            return response.ok;
        }
        catch (err) {
            console.warn(`[OTelExporter] Failed to send: ${err instanceof Error ? err.message : String(err)}`);
            return false;
        }
    }
    /** Turn one decision record into a 1 ms span starting at `timestampMs`. */
    decisionToSpan(traceId, spanId, parentSpanId, decision, timestampMs) {
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `decision.${decision.agent}.${decision.chosen}`,
            kind: 1,
            startTimeUnixNano: msToNano(timestampMs),
            endTimeUnixNano: msToNano(timestampMs + 1),
            attributes: [
                attr("evalgate.decision.agent", decision.agent),
                attr("evalgate.decision.type", decision.type),
                attr("evalgate.decision.chosen", decision.chosen),
                // Report 0 alternatives rather than crashing when the list is absent.
                attr("evalgate.decision.alternatives", decision.alternatives?.length ?? 0),
                ...(decision.confidence !== undefined
                    ? [attr("evalgate.decision.confidence", decision.confidence)]
                    : []),
                ...(decision.reasoning
                    ? [attr("evalgate.decision.reasoning", decision.reasoning)]
                    : []),
            ],
            status: { code: 1 },
            events: [],
        };
    }
    /**
     * Turn one handoff record into a 1 ms span starting at `handoff.timestamp`.
     * A missing `fromAgent` is reported as "start".
     */
    handoffToSpan(traceId, spanId, parentSpanId, handoff) {
        const parsedTs = new Date(handoff.timestamp).getTime();
        // An unparseable timestamp falls back to the current time instead of throwing.
        const ts = Number.isNaN(parsedTs) ? Date.now() : parsedTs;
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `handoff.${handoff.fromAgent ?? "start"}.${handoff.toAgent}`,
            kind: 1,
            startTimeUnixNano: msToNano(ts),
            endTimeUnixNano: msToNano(ts + 1),
            attributes: [
                attr("evalgate.handoff.from", handoff.fromAgent ?? "start"),
                attr("evalgate.handoff.to", handoff.toAgent),
                attr("evalgate.handoff.type", handoff.handoffType),
            ],
            status: { code: 1 },
            events: [],
        };
    }
    /** Turn one cost record into a 1 ms span starting at `timestampMs`. */
    costToSpan(traceId, spanId, parentSpanId, cost, timestampMs) {
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `cost.${cost.provider}.${cost.model}`,
            kind: 1,
            startTimeUnixNano: msToNano(timestampMs),
            endTimeUnixNano: msToNano(timestampMs + 1),
            attributes: [
                attr("evalgate.cost.provider", cost.provider),
                attr("evalgate.cost.model", cost.model),
                attr("evalgate.cost.input_tokens", cost.inputTokens),
                attr("evalgate.cost.output_tokens", cost.outputTokens),
                attr("evalgate.cost.total_tokens", cost.totalTokens),
                attr("evalgate.cost.total_usd", cost.totalCost),
            ],
            status: { code: 1 },
            events: [],
        };
    }
    /**
     * Wrap spans in the OTLP envelope: a single resource (service name, SDK
     * identity, plus any user-supplied `resourceAttributes`) containing a
     * single "evalgate" scope.
     */
    buildPayload(spans) {
        const resourceAttrs = [
            attr("service.name", this.options.serviceName),
            attr("telemetry.sdk.name", "evalgate"),
            attr("telemetry.sdk.version", this.options.sdkVersion),
            attr("telemetry.sdk.language", "nodejs"),
        ];
        for (const [key, value] of Object.entries(this.options.resourceAttributes)) {
            resourceAttrs.push(attr(key, value));
        }
        return {
            resourceSpans: [
                {
                    resource: { attributes: resourceAttrs },
                    scopeSpans: [
                        {
                            scope: {
                                name: "evalgate",
                                version: this.options.sdkVersion,
                            },
                            spans,
                        },
                    ],
                },
            ],
        };
    }
}
303
+ exports.OTelExporter = OTelExporter;
/**
 * Factory shorthand: construct an OTelExporter from the given options
 * without the caller writing `new`.
 */
function createOTelExporter(options) {
    const exporter = new OTelExporter(options);
    return exporter;
}
@@ -50,9 +50,20 @@ export declare function createPaginatedIterator<T>(fetchFn: (offset: number, lim
50
50
  hasMore: boolean;
51
51
  }>, limit?: number): PaginatedIterator<T>;
52
52
  /**
53
- * Auto-paginate helper that fetches all pages automatically
53
+ * Auto-paginate helper that fetches all pages and returns a flat array.
54
+ * @example
55
+ * ```typescript
56
+ * const allItems = await autoPaginate(
57
+ * (offset, limit) => client.traces.list({ offset, limit }),
58
+ * );
59
+ * ```
54
60
  */
55
- export declare function autoPaginate<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number): AsyncGenerator<T, void, unknown>;
61
+ export declare function autoPaginate<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number): Promise<T[]>;
62
+ /**
63
+ * Streaming auto-paginate generator — yields individual items one at a time.
64
+ * Use this when you want to process items as they arrive rather than waiting for all pages.
65
+ */
66
+ export declare function autoPaginateGenerator<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number): AsyncGenerator<T, void, unknown>;
56
67
  /**
57
68
  * Encode cursor for pagination (base64)
58
69
  */
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.PaginatedIterator = void 0;
7
7
  exports.createPaginatedIterator = createPaginatedIterator;
8
8
  exports.autoPaginate = autoPaginate;
9
+ exports.autoPaginateGenerator = autoPaginateGenerator;
9
10
  exports.encodeCursor = encodeCursor;
10
11
  exports.decodeCursor = decodeCursor;
11
12
  exports.createPaginationMeta = createPaginationMeta;
@@ -56,9 +57,34 @@ function createPaginatedIterator(fetchFn, limit = 50) {
56
57
  return new PaginatedIterator(fetchFn, limit);
57
58
  }
58
59
  /**
59
- * Auto-paginate helper that fetches all pages automatically
60
+ * Auto-paginate helper that fetches all pages and returns a flat array.
61
+ * @example
62
+ * ```typescript
63
+ * const allItems = await autoPaginate(
64
+ * (offset, limit) => client.traces.list({ offset, limit }),
65
+ * );
66
+ * ```
60
67
  */
61
- async function* autoPaginate(fetchFn, limit = 50) {
68
+ async function autoPaginate(fetchFn, limit = 50) {
69
+ const result = [];
70
+ let offset = 0;
71
+ let hasMore = true;
72
+ while (hasMore) {
73
+ const items = await fetchFn(offset, limit);
74
+ if (items.length === 0) {
75
+ break;
76
+ }
77
+ result.push(...items);
78
+ hasMore = items.length === limit;
79
+ offset += limit;
80
+ }
81
+ return result;
82
+ }
83
+ /**
84
+ * Streaming auto-paginate generator — yields individual items one at a time.
85
+ * Use this when you want to process items as they arrive rather than waiting for all pages.
86
+ */
87
+ async function* autoPaginateGenerator(fetchFn, limit = 50) {
62
88
  let offset = 0;
63
89
  let hasMore = true;
64
90
  while (hasMore) {
@@ -208,12 +208,7 @@ function generateDefineEvalCode(suite, options = {}) {
208
208
  });
209
209
  const helperFunctions = generateHelperFunctionsForSuite(specs, options);
210
210
  const evaluationFunction = generateEvaluationFunction();
211
- return [
212
- ...imports,
213
- ...helperFunctions,
214
- ...evaluationFunction,
215
- ...specCode,
216
- ].join("\n");
211
+ return [...imports, helperFunctions, evaluationFunction, ...specCode].join("\n");
217
212
  }
218
213
  /**
219
214
  * Generate helper functions for a specific spec
@@ -4,12 +4,19 @@
4
4
  * The core DSL function for defining behavioral specifications.
5
5
  * Uses content-addressable identity with AST position for stability.
6
6
  */
7
- import type { DefineEvalFunction, EvalContext, EvalResult } from "./types";
7
+ import { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime } from "./registry";
8
+ import type { DefineEvalFunction, EvalContext, EvalResult, EvalSpec } from "./types";
8
9
  /**
9
10
  * Export the defineEval function with proper typing
10
11
  * This is the main DSL entry point
11
12
  */
12
13
  export declare const defineEval: DefineEvalFunction;
14
+ /**
15
+ * Filter a list of specs according to skip/only semantics:
16
+ * - If any spec has mode === "only", return only those specs
17
+ * - Otherwise, return all specs except those with mode === "skip"
18
+ */
19
+ export declare function getFilteredSpecs(specs: EvalSpec[]): EvalSpec[];
13
20
  /**
14
21
  * Convenience export for evalai.test() alias (backward compatibility)
15
22
  * Provides alternative naming that matches the original roadmap vision
@@ -48,8 +55,11 @@ export declare function createResult(config: {
48
55
  assertions?: EvalResult["assertions"];
49
56
  metadata?: Record<string, unknown>;
50
57
  error?: string;
58
+ output?: string;
59
+ durationMs?: number;
60
+ tokens?: number;
51
61
  }): EvalResult;
52
- /**
53
- * Default export for convenience
54
- */
62
+ export { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime, };
63
+ export { createContext as createEvalContext };
64
+ export { createLocalExecutor } from "./executor";
55
65
  export default defineEval;
@@ -39,13 +39,21 @@ var __importStar = (this && this.__importStar) || (function () {
39
39
  };
40
40
  })();
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
- exports.evalai = exports.defineEval = void 0;
42
+ exports.createLocalExecutor = exports.withRuntime = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.evalai = exports.defineEval = void 0;
43
+ exports.getFilteredSpecs = getFilteredSpecs;
43
44
  exports.defineSuite = defineSuite;
44
45
  exports.createContext = createContext;
46
+ exports.createEvalContext = createContext;
45
47
  exports.createResult = createResult;
46
48
  const crypto = __importStar(require("node:crypto"));
49
+ const fs = __importStar(require("node:fs"));
47
50
  const path = __importStar(require("node:path"));
48
51
  const registry_1 = require("./registry");
52
+ Object.defineProperty(exports, "createEvalRuntime", { enumerable: true, get: function () { return registry_1.createEvalRuntime; } });
53
+ Object.defineProperty(exports, "disposeActiveRuntime", { enumerable: true, get: function () { return registry_1.disposeActiveRuntime; } });
54
+ Object.defineProperty(exports, "getActiveRuntime", { enumerable: true, get: function () { return registry_1.getActiveRuntime; } });
55
+ Object.defineProperty(exports, "setActiveRuntime", { enumerable: true, get: function () { return registry_1.setActiveRuntime; } });
56
+ Object.defineProperty(exports, "withRuntime", { enumerable: true, get: function () { return registry_1.withRuntime; } });
49
57
  const types_1 = require("./types");
50
58
  /**
51
59
  * Extract AST position from call stack
@@ -159,7 +167,7 @@ function createSpecConfig(nameOrConfig, executor, options) {
159
167
  /**
160
168
  * Core defineEval function implementation
161
169
  */
162
- function defineEvalImpl(nameOrConfig, executor, options) {
170
+ function defineEvalWithMode(mode, nameOrConfig, executor, options) {
163
171
  // Get caller position for identity
164
172
  const callerPosition = getCallerPosition();
165
173
  // Create specification configuration
@@ -187,15 +195,124 @@ function defineEvalImpl(nameOrConfig, executor, options) {
187
195
  budget: config.budget,
188
196
  model: config.model,
189
197
  },
198
+ mode,
190
199
  };
191
200
  // Register specification
192
201
  runtime.register(spec);
193
202
  }
/**
 * Register a spec in "normal" mode by delegating to defineEvalWithMode.
 */
function defineEvalImpl(nameOrConfig, executor, options) {
    const mode = "normal";
    defineEvalWithMode(mode, nameOrConfig, executor, options);
}
/**
 * Register a spec in "skip" mode by delegating to defineEvalWithMode.
 */
function defineEvalSkipImpl(nameOrConfig, executor, options) {
    const mode = "skip";
    defineEvalWithMode(mode, nameOrConfig, executor, options);
}
/**
 * Register a spec in "only" mode by delegating to defineEvalWithMode.
 */
function defineEvalOnlyImpl(nameOrConfig, executor, options) {
    const mode = "only";
    defineEvalWithMode(mode, nameOrConfig, executor, options);
}
194
212
  /**
195
213
  * Export the defineEval function with proper typing
196
214
  * This is the main DSL entry point
197
215
  */
198
216
  exports.defineEval = defineEvalImpl;
217
+ // Attach .skip and .only modifiers (vitest/jest convention)
218
+ exports.defineEval.skip = defineEvalSkipImpl;
219
+ exports.defineEval.only = defineEvalOnlyImpl;
220
+ /**
221
+ * Parse a JSONL file into an array of row objects.
222
+ * Each line must be a valid JSON object; blank lines are skipped.
223
+ */
224
+ function parseJsonl(content) {
225
+ return content
226
+ .split("\n")
227
+ .map((line) => line.trim())
228
+ .filter((line) => line.length > 0)
229
+ .map((line, i) => {
230
+ try {
231
+ return JSON.parse(line);
232
+ }
233
+ catch {
234
+ throw new types_1.SpecRegistrationError(`Invalid JSON on line ${i + 1} of dataset`);
235
+ }
236
+ });
237
+ }
238
+ /**
239
+ * Parse a simple CSV file into an array of row objects.
240
+ * First line is treated as headers. Values are unquoted strings.
241
+ * For complex CSV (quoted fields, escapes), use a dedicated library.
242
+ */
243
+ function parseCsv(content) {
244
+ const lines = content
245
+ .split("\n")
246
+ .map((l) => l.trim())
247
+ .filter((l) => l.length > 0);
248
+ if (lines.length < 2)
249
+ return [];
250
+ const headers = lines[0].split(",").map((h) => h.trim());
251
+ return lines.slice(1).map((line) => {
252
+ const values = line.split(",").map((v) => v.trim());
253
+ const row = {};
254
+ for (let i = 0; i < headers.length; i++) {
255
+ row[headers[i]] = values[i] ?? "";
256
+ }
257
+ return row;
258
+ });
259
+ }
260
+ /**
261
+ * Load a JSONL or CSV dataset and register one spec per row.
262
+ */
263
+ function fromDatasetImpl(name, datasetPath, executor, options) {
264
+ const resolvedPath = path.isAbsolute(datasetPath)
265
+ ? datasetPath
266
+ : path.resolve(process.cwd(), datasetPath);
267
+ if (!fs.existsSync(resolvedPath)) {
268
+ throw new types_1.SpecRegistrationError(`Dataset file not found: ${resolvedPath}`);
269
+ }
270
+ const content = fs.readFileSync(resolvedPath, "utf8");
271
+ const ext = path.extname(resolvedPath).toLowerCase();
272
+ let rows;
273
+ if (ext === ".jsonl" || ext === ".ndjson") {
274
+ rows = parseJsonl(content);
275
+ }
276
+ else if (ext === ".csv") {
277
+ rows = parseCsv(content);
278
+ }
279
+ else if (ext === ".json") {
280
+ const parsed = JSON.parse(content);
281
+ rows = Array.isArray(parsed) ? parsed : [parsed];
282
+ }
283
+ else {
284
+ throw new types_1.SpecRegistrationError(`Unsupported dataset format: ${ext}. Use .jsonl, .ndjson, .csv, or .json`);
285
+ }
286
+ if (rows.length === 0) {
287
+ throw new types_1.SpecRegistrationError(`Dataset is empty: ${resolvedPath}`);
288
+ }
289
+ for (let i = 0; i < rows.length; i++) {
290
+ const row = rows[i];
291
+ const specName = `${name} - row ${i + 1}`;
292
+ const wrappedExecutor = (context) => executor({ ...context, input: row });
293
+ defineEvalWithMode("normal", specName, wrappedExecutor, {
294
+ ...options,
295
+ metadata: {
296
+ ...options?.metadata,
297
+ datasetPath: resolvedPath,
298
+ datasetRow: i + 1,
299
+ },
300
+ });
301
+ }
302
+ }
303
+ exports.defineEval.fromDataset = fromDatasetImpl;
/**
 * Apply skip/only selection to a list of specs.
 *
 * When at least one spec is marked `mode === "only"`, just those specs are
 * kept. Otherwise every spec is kept except the ones marked
 * `mode === "skip"`. The input array is not modified.
 */
function getFilteredSpecs(specs) {
    const anyOnly = specs.some((spec) => spec.mode === "only");
    const keep = anyOnly
        ? (spec) => spec.mode === "only"
        : (spec) => spec.mode !== "skip";
    return specs.filter(keep);
}
+ }
199
316
  /**
200
317
  * Convenience export for evalai.test() alias (backward compatibility)
201
318
  * Provides alternative naming that matches the original roadmap vision
@@ -245,9 +362,17 @@ function createResult(config) {
245
362
  assertions: config.assertions,
246
363
  metadata: config.metadata,
247
364
  error: config.error,
365
+ output: config.output,
366
+ durationMs: config.durationMs,
367
+ tokens: config.tokens,
248
368
  };
249
369
  }
250
370
  /**
251
371
  * Default export for convenience
252
372
  */
373
+ // Register defineEval with registry to break circular dependency
374
+ (0, registry_1._registerDefineEval)(exports.defineEval);
375
+ // Re-export createLocalExecutor from executor.ts
376
+ var executor_1 = require("./executor");
377
+ Object.defineProperty(exports, "createLocalExecutor", { enumerable: true, get: function () { return executor_1.createLocalExecutor; } });
253
378
  exports.default = exports.defineEval;
@@ -10,7 +10,8 @@ import type { LocalExecutor } from "./types";
10
10
  */
11
11
  export declare function createLocalExecutor(): LocalExecutor;
12
12
  /**
13
- * Default local executor instance
13
+ * Default local executor factory
14
+ * Call as defaultLocalExecutor() to get a new executor instance.
14
15
  * For convenience in simple use cases
15
16
  */
16
- export declare const defaultLocalExecutor: LocalExecutor;
17
+ export declare const defaultLocalExecutor: typeof createLocalExecutor;
@@ -146,7 +146,8 @@ function createLocalExecutor() {
146
146
  return new LocalExecutorImpl();
147
147
  }
148
148
  /**
149
- * Default local executor instance
149
+ * Default local executor factory
150
+ * Call as defaultLocalExecutor() to get a new executor instance.
150
151
  * For convenience in simple use cases
151
152
  */
152
- exports.defaultLocalExecutor = createLocalExecutor();
153
+ exports.defaultLocalExecutor = createLocalExecutor;
@@ -4,7 +4,9 @@
4
4
  * Scoped registry with proper lifecycle management.
5
5
  * Prevents cross-run contamination and memory leaks.
6
6
  */
7
- import type { EvalRuntime } from "./types";
7
+ import type { DefineEvalFunction, EvalRuntime } from "./types";
8
+ /** @internal Called by eval.ts to register defineEval without circular import */
9
+ export declare function _registerDefineEval(fn: (...args: unknown[]) => unknown): void;
8
10
  /**
9
11
  * Runtime interface with lifecycle management
10
12
  * Ensures proper cleanup and prevents resource leaks
@@ -13,7 +15,7 @@ export interface RuntimeHandle {
13
15
  /** Runtime instance */
14
16
  runtime: EvalRuntime;
15
17
  /** defineEval function bound to this runtime */
16
- defineEval: typeof import("./eval").defineEval;
18
+ defineEval: DefineEvalFunction;
17
19
  /** Dispose runtime and clean up resources */
18
20
  dispose(): void;
19
21
  /** Create runtime snapshot for persistence */
@@ -61,7 +63,10 @@ export interface SerializedSpec {
61
63
  * Create a new scoped runtime with lifecycle management
62
64
  * Returns a handle for proper resource management
63
65
  */
64
- export declare function createEvalRuntime(projectRoot?: string): RuntimeHandle;
66
+ export declare function createEvalRuntime(projectRootOrConfig?: string | {
67
+ name?: string;
68
+ projectRoot?: string;
69
+ }): RuntimeHandle;
65
70
  /**
66
71
  * Helper function for safe runtime execution with automatic cleanup
67
72
  * Ensures runtime is disposed even if an exception is thrown