@evalgate/sdk 2.2.2 → 2.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +40 -1
- package/dist/assertions.d.ts +194 -10
- package/dist/assertions.js +525 -73
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +5 -1
- package/dist/cache.js +5 -1
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/upgrade.js +5 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/errors.js +7 -0
- package/dist/export.js +2 -2
- package/dist/index.d.ts +10 -9
- package/dist/index.js +24 -7
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +84 -61
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +8 -3
- package/dist/runtime/registry.js +15 -4
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/snapshot.d.ts +12 -0
- package/dist/snapshot.js +24 -1
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +8 -1
package/dist/otel.js
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* OpenTelemetry Export for WorkflowTracer
|
|
4
|
+
*
|
|
5
|
+
* Converts WorkflowTracer spans, decisions, and costs into
|
|
6
|
+
* OpenTelemetry-compatible span data for export to any OTEL collector.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { OTelExporter } from "@evalgate/sdk/otel";
|
|
10
|
+
*
|
|
11
|
+
* const exporter = new OTelExporter({ endpoint: "http://localhost:4318/v1/traces" });
|
|
12
|
+
* const tracer = new WorkflowTracer(client, { debug: true });
|
|
13
|
+
* // ... run workflow ...
|
|
14
|
+
* await exporter.send(exporter.exportFromTracer(tracer));
|
|
15
|
+
*/
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.OTelExporter = void 0;
|
|
18
|
+
exports.createOTelExporter = createOTelExporter;
|
|
19
|
+
/**
 * Build a trace identifier: 16 pseudo-random bytes (drawn from Math.random)
 * rendered as 32 lowercase hexadecimal characters.
 *
 * @returns a 32-character hex string
 */
function generateTraceId() {
    let hex = "";
    for (let n = 0; n < 16; n++) {
        // One Math.random() draw per byte, scaled into 0..255.
        const byte = Math.floor(Math.random() * 256);
        hex += byte.toString(16).padStart(2, "0");
    }
    return hex;
}
|
|
31
|
+
/**
 * Build a span identifier: 8 pseudo-random bytes (drawn from Math.random)
 * rendered as 16 lowercase hexadecimal characters.
 *
 * @returns a 16-character hex string
 */
function generateSpanId() {
    const hexPairs = Array.from({ length: 8 }, () => {
        // One Math.random() draw per byte, scaled into 0..255.
        const byte = Math.floor(Math.random() * 256);
        return byte.toString(16).padStart(2, "0");
    });
    return hexPairs.join("");
}
|
|
43
|
+
/**
 * Convert a millisecond timestamp/duration to a decimal nanosecond string.
 *
 * The result is computed with BigInt so that epoch-scale values (which exceed
 * Number.MAX_SAFE_INTEGER once expressed in nanoseconds) keep full precision.
 *
 * @param ms Milliseconds. May be fractional (e.g. a start time plus a
 *           high-resolution duration); the fraction is rounded to the nearest
 *           nanosecond.
 * @returns the value in nanoseconds, as a base-10 string
 * @throws {RangeError} if `ms` is NaN or infinite
 */
function msToNano(ms) {
    if (typeof ms !== "number") {
        // Non-number inputs keep the previous behavior: let BigInt() coerce
        // (or reject) them directly.
        return `${BigInt(ms) * BigInt(1000000)}`;
    }
    if (!Number.isFinite(ms)) {
        throw new RangeError(`Cannot convert non-finite millisecond value to nanoseconds: ${ms}`);
    }
    // BigInt() rejects non-integers, so split the value into whole
    // milliseconds and a sub-millisecond remainder and convert each part.
    const wholeMs = Math.floor(ms);
    const remainderNs = Math.round((ms - wholeMs) * 1000000);
    return `${BigInt(wholeMs) * BigInt(1000000) + BigInt(remainderNs)}`;
}
|
|
49
|
+
/**
|
|
50
|
+
* Create an OTEL attribute
|
|
51
|
+
*/
|
|
52
|
+
function attr(key, value) {
|
|
53
|
+
if (typeof value === "string") {
|
|
54
|
+
return { key, value: { stringValue: value } };
|
|
55
|
+
}
|
|
56
|
+
if (typeof value === "number") {
|
|
57
|
+
if (Number.isInteger(value)) {
|
|
58
|
+
return { key, value: { intValue: String(value) } };
|
|
59
|
+
}
|
|
60
|
+
return { key, value: { doubleValue: value } };
|
|
61
|
+
}
|
|
62
|
+
return { key, value: { boolValue: value } };
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* OpenTelemetry Exporter for EvalGate WorkflowTracer
|
|
66
|
+
*/
|
|
67
|
+
class OTelExporter {
|
|
68
|
+
constructor(options = {}) {
|
|
69
|
+
this.options = {
|
|
70
|
+
endpoint: options.endpoint ?? "http://localhost:4318/v1/traces",
|
|
71
|
+
serviceName: options.serviceName ?? "evalgate",
|
|
72
|
+
resourceAttributes: options.resourceAttributes ?? {},
|
|
73
|
+
sdkVersion: options.sdkVersion ?? "2.2.4",
|
|
74
|
+
headers: options.headers ?? {},
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Export workflow data from a WorkflowTracer instance
|
|
79
|
+
*/
|
|
80
|
+
exportFromTracer(tracer) {
|
|
81
|
+
const workflow = tracer.getCurrentWorkflow();
|
|
82
|
+
const handoffs = tracer.getHandoffs();
|
|
83
|
+
const decisions = tracer.getDecisions();
|
|
84
|
+
const costs = tracer.getCosts();
|
|
85
|
+
const traceId = generateTraceId();
|
|
86
|
+
const rootSpanId = generateSpanId();
|
|
87
|
+
const now = Date.now();
|
|
88
|
+
const spans = [];
|
|
89
|
+
// Root workflow span
|
|
90
|
+
if (workflow) {
|
|
91
|
+
spans.push({
|
|
92
|
+
traceId,
|
|
93
|
+
spanId: rootSpanId,
|
|
94
|
+
name: `workflow.${workflow.name}`,
|
|
95
|
+
kind: 1,
|
|
96
|
+
startTimeUnixNano: msToNano(new Date(workflow.startedAt).getTime()),
|
|
97
|
+
endTimeUnixNano: msToNano(now),
|
|
98
|
+
attributes: [
|
|
99
|
+
attr("evalgate.workflow.name", workflow.name),
|
|
100
|
+
attr("evalgate.workflow.id", workflow.id),
|
|
101
|
+
attr("evalgate.workflow.trace_id", workflow.traceId),
|
|
102
|
+
],
|
|
103
|
+
status: { code: 1 },
|
|
104
|
+
events: [],
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
// Decision spans
|
|
108
|
+
for (let i = 0; i < decisions.length; i++) {
|
|
109
|
+
const decision = decisions[i];
|
|
110
|
+
const spanId = generateSpanId();
|
|
111
|
+
spans.push(this.decisionToSpan(traceId, spanId, rootSpanId, decision, now - decisions.length + i));
|
|
112
|
+
}
|
|
113
|
+
// Handoff events
|
|
114
|
+
for (let i = 0; i < handoffs.length; i++) {
|
|
115
|
+
const handoff = handoffs[i];
|
|
116
|
+
const spanId = generateSpanId();
|
|
117
|
+
spans.push(this.handoffToSpan(traceId, spanId, rootSpanId, handoff));
|
|
118
|
+
}
|
|
119
|
+
// Cost spans
|
|
120
|
+
for (let i = 0; i < costs.length; i++) {
|
|
121
|
+
const cost = costs[i];
|
|
122
|
+
const spanId = generateSpanId();
|
|
123
|
+
spans.push(this.costToSpan(traceId, spanId, rootSpanId, cost, now - costs.length + i));
|
|
124
|
+
}
|
|
125
|
+
return this.buildPayload(spans);
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Export a run result as OTEL spans
|
|
129
|
+
*/
|
|
130
|
+
exportRunResult(runResult) {
|
|
131
|
+
const traceId = generateTraceId();
|
|
132
|
+
const rootSpanId = generateSpanId();
|
|
133
|
+
const spans = [];
|
|
134
|
+
// Root run span
|
|
135
|
+
spans.push({
|
|
136
|
+
traceId,
|
|
137
|
+
spanId: rootSpanId,
|
|
138
|
+
name: `evalgate.run.${runResult.runId}`,
|
|
139
|
+
kind: 1,
|
|
140
|
+
startTimeUnixNano: msToNano(runResult.metadata.startedAt),
|
|
141
|
+
endTimeUnixNano: msToNano(runResult.metadata.completedAt),
|
|
142
|
+
attributes: [
|
|
143
|
+
attr("evalgate.run.id", runResult.runId),
|
|
144
|
+
attr("evalgate.run.mode", runResult.metadata.mode),
|
|
145
|
+
attr("evalgate.run.duration_ms", runResult.metadata.duration),
|
|
146
|
+
attr("evalgate.run.pass_rate", runResult.summary.passRate),
|
|
147
|
+
attr("evalgate.run.passed", runResult.summary.passed),
|
|
148
|
+
attr("evalgate.run.failed", runResult.summary.failed),
|
|
149
|
+
],
|
|
150
|
+
status: {
|
|
151
|
+
code: runResult.summary.failed > 0 ? 2 : 1,
|
|
152
|
+
},
|
|
153
|
+
events: [],
|
|
154
|
+
});
|
|
155
|
+
// Per-spec child spans
|
|
156
|
+
let offset = 0;
|
|
157
|
+
for (const spec of runResult.results) {
|
|
158
|
+
const spanId = generateSpanId();
|
|
159
|
+
const specStart = runResult.metadata.startedAt + offset;
|
|
160
|
+
const specEnd = specStart + spec.result.duration;
|
|
161
|
+
offset += spec.result.duration;
|
|
162
|
+
const attributes = [
|
|
163
|
+
attr("evalgate.spec.id", spec.specId),
|
|
164
|
+
attr("evalgate.spec.name", spec.name),
|
|
165
|
+
attr("evalgate.spec.file", spec.filePath),
|
|
166
|
+
attr("evalgate.spec.status", spec.result.status),
|
|
167
|
+
attr("evalgate.spec.duration_ms", spec.result.duration),
|
|
168
|
+
];
|
|
169
|
+
if (spec.result.score !== undefined) {
|
|
170
|
+
attributes.push(attr("evalgate.spec.score", spec.result.score));
|
|
171
|
+
}
|
|
172
|
+
spans.push({
|
|
173
|
+
traceId,
|
|
174
|
+
spanId,
|
|
175
|
+
parentSpanId: rootSpanId,
|
|
176
|
+
name: `evalgate.spec.${spec.name}`,
|
|
177
|
+
kind: 1,
|
|
178
|
+
startTimeUnixNano: msToNano(specStart),
|
|
179
|
+
endTimeUnixNano: msToNano(specEnd),
|
|
180
|
+
attributes,
|
|
181
|
+
status: {
|
|
182
|
+
code: spec.result.status === "passed" ? 1 : 2,
|
|
183
|
+
message: spec.result.error,
|
|
184
|
+
},
|
|
185
|
+
events: [],
|
|
186
|
+
});
|
|
187
|
+
}
|
|
188
|
+
return this.buildPayload(spans);
|
|
189
|
+
}
|
|
190
|
+
/**
|
|
191
|
+
* Send payload to OTEL collector via HTTP
|
|
192
|
+
*/
|
|
193
|
+
async send(payload) {
|
|
194
|
+
try {
|
|
195
|
+
const response = await fetch(this.options.endpoint, {
|
|
196
|
+
method: "POST",
|
|
197
|
+
headers: {
|
|
198
|
+
"Content-Type": "application/json",
|
|
199
|
+
...this.options.headers,
|
|
200
|
+
},
|
|
201
|
+
body: JSON.stringify(payload),
|
|
202
|
+
});
|
|
203
|
+
return response.ok;
|
|
204
|
+
}
|
|
205
|
+
catch (err) {
|
|
206
|
+
console.warn(`[OTelExporter] Failed to send: ${err instanceof Error ? err.message : String(err)}`);
|
|
207
|
+
return false;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
decisionToSpan(traceId, spanId, parentSpanId, decision, timestampMs) {
|
|
211
|
+
return {
|
|
212
|
+
traceId,
|
|
213
|
+
spanId,
|
|
214
|
+
parentSpanId,
|
|
215
|
+
name: `decision.${decision.agent}.${decision.chosen}`,
|
|
216
|
+
kind: 1,
|
|
217
|
+
startTimeUnixNano: msToNano(timestampMs),
|
|
218
|
+
endTimeUnixNano: msToNano(timestampMs + 1),
|
|
219
|
+
attributes: [
|
|
220
|
+
attr("evalgate.decision.agent", decision.agent),
|
|
221
|
+
attr("evalgate.decision.type", decision.type),
|
|
222
|
+
attr("evalgate.decision.chosen", decision.chosen),
|
|
223
|
+
attr("evalgate.decision.alternatives", decision.alternatives.length),
|
|
224
|
+
...(decision.confidence !== undefined
|
|
225
|
+
? [attr("evalgate.decision.confidence", decision.confidence)]
|
|
226
|
+
: []),
|
|
227
|
+
...(decision.reasoning
|
|
228
|
+
? [attr("evalgate.decision.reasoning", decision.reasoning)]
|
|
229
|
+
: []),
|
|
230
|
+
],
|
|
231
|
+
status: { code: 1 },
|
|
232
|
+
events: [],
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
handoffToSpan(traceId, spanId, parentSpanId, handoff) {
|
|
236
|
+
const ts = new Date(handoff.timestamp).getTime();
|
|
237
|
+
return {
|
|
238
|
+
traceId,
|
|
239
|
+
spanId,
|
|
240
|
+
parentSpanId,
|
|
241
|
+
name: `handoff.${handoff.fromAgent ?? "start"}.${handoff.toAgent}`,
|
|
242
|
+
kind: 1,
|
|
243
|
+
startTimeUnixNano: msToNano(ts),
|
|
244
|
+
endTimeUnixNano: msToNano(ts + 1),
|
|
245
|
+
attributes: [
|
|
246
|
+
attr("evalgate.handoff.from", handoff.fromAgent ?? "start"),
|
|
247
|
+
attr("evalgate.handoff.to", handoff.toAgent),
|
|
248
|
+
attr("evalgate.handoff.type", handoff.handoffType),
|
|
249
|
+
],
|
|
250
|
+
status: { code: 1 },
|
|
251
|
+
events: [],
|
|
252
|
+
};
|
|
253
|
+
}
|
|
254
|
+
costToSpan(traceId, spanId, parentSpanId, cost, timestampMs) {
|
|
255
|
+
return {
|
|
256
|
+
traceId,
|
|
257
|
+
spanId,
|
|
258
|
+
parentSpanId,
|
|
259
|
+
name: `cost.${cost.provider}.${cost.model}`,
|
|
260
|
+
kind: 1,
|
|
261
|
+
startTimeUnixNano: msToNano(timestampMs),
|
|
262
|
+
endTimeUnixNano: msToNano(timestampMs + 1),
|
|
263
|
+
attributes: [
|
|
264
|
+
attr("evalgate.cost.provider", cost.provider),
|
|
265
|
+
attr("evalgate.cost.model", cost.model),
|
|
266
|
+
attr("evalgate.cost.input_tokens", cost.inputTokens),
|
|
267
|
+
attr("evalgate.cost.output_tokens", cost.outputTokens),
|
|
268
|
+
attr("evalgate.cost.total_tokens", cost.totalTokens),
|
|
269
|
+
attr("evalgate.cost.total_usd", cost.totalCost),
|
|
270
|
+
],
|
|
271
|
+
status: { code: 1 },
|
|
272
|
+
events: [],
|
|
273
|
+
};
|
|
274
|
+
}
|
|
275
|
+
buildPayload(spans) {
|
|
276
|
+
const resourceAttrs = [
|
|
277
|
+
attr("service.name", this.options.serviceName),
|
|
278
|
+
attr("telemetry.sdk.name", "evalgate"),
|
|
279
|
+
attr("telemetry.sdk.version", this.options.sdkVersion),
|
|
280
|
+
attr("telemetry.sdk.language", "nodejs"),
|
|
281
|
+
];
|
|
282
|
+
for (const [key, value] of Object.entries(this.options.resourceAttributes)) {
|
|
283
|
+
resourceAttrs.push(attr(key, value));
|
|
284
|
+
}
|
|
285
|
+
return {
|
|
286
|
+
resourceSpans: [
|
|
287
|
+
{
|
|
288
|
+
resource: { attributes: resourceAttrs },
|
|
289
|
+
scopeSpans: [
|
|
290
|
+
{
|
|
291
|
+
scope: {
|
|
292
|
+
name: "evalgate",
|
|
293
|
+
version: this.options.sdkVersion,
|
|
294
|
+
},
|
|
295
|
+
spans,
|
|
296
|
+
},
|
|
297
|
+
],
|
|
298
|
+
},
|
|
299
|
+
],
|
|
300
|
+
};
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
exports.OTelExporter = OTelExporter;
|
|
304
|
+
/**
|
|
305
|
+
* Convenience factory
|
|
306
|
+
*/
|
|
307
|
+
function createOTelExporter(options) {
|
|
308
|
+
return new OTelExporter(options);
|
|
309
|
+
}
|
package/dist/pagination.d.ts
CHANGED
|
@@ -50,9 +50,20 @@ export declare function createPaginatedIterator<T>(fetchFn: (offset: number, lim
|
|
|
50
50
|
hasMore: boolean;
|
|
51
51
|
}>, limit?: number): PaginatedIterator<T>;
|
|
52
52
|
/**
|
|
53
|
-
* Auto-paginate helper that fetches all pages
|
|
53
|
+
* Auto-paginate helper that fetches all pages and returns a flat array.
|
|
54
|
+
* @example
|
|
55
|
+
* ```typescript
|
|
56
|
+
* const allItems = await autoPaginate(
|
|
57
|
+
* (offset, limit) => client.traces.list({ offset, limit }),
|
|
58
|
+
* );
|
|
59
|
+
* ```
|
|
54
60
|
*/
|
|
55
|
-
export declare function autoPaginate<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number):
|
|
61
|
+
export declare function autoPaginate<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number): Promise<T[]>;
|
|
62
|
+
/**
|
|
63
|
+
* Streaming auto-paginate generator — yields individual items one at a time.
|
|
64
|
+
* Use this when you want to process items as they arrive rather than waiting for all pages.
|
|
65
|
+
*/
|
|
66
|
+
export declare function autoPaginateGenerator<T>(fetchFn: (offset: number, limit: number) => Promise<T[]>, limit?: number): AsyncGenerator<T, void, unknown>;
|
|
56
67
|
/**
|
|
57
68
|
* Encode cursor for pagination (base64)
|
|
58
69
|
*/
|
package/dist/pagination.js
CHANGED
|
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
6
6
|
exports.PaginatedIterator = void 0;
|
|
7
7
|
exports.createPaginatedIterator = createPaginatedIterator;
|
|
8
8
|
exports.autoPaginate = autoPaginate;
|
|
9
|
+
exports.autoPaginateGenerator = autoPaginateGenerator;
|
|
9
10
|
exports.encodeCursor = encodeCursor;
|
|
10
11
|
exports.decodeCursor = decodeCursor;
|
|
11
12
|
exports.createPaginationMeta = createPaginationMeta;
|
|
@@ -56,9 +57,34 @@ function createPaginatedIterator(fetchFn, limit = 50) {
|
|
|
56
57
|
return new PaginatedIterator(fetchFn, limit);
|
|
57
58
|
}
|
|
58
59
|
/**
|
|
59
|
-
* Auto-paginate helper that fetches all pages
|
|
60
|
+
* Auto-paginate helper that fetches all pages and returns a flat array.
|
|
61
|
+
* @example
|
|
62
|
+
* ```typescript
|
|
63
|
+
* const allItems = await autoPaginate(
|
|
64
|
+
* (offset, limit) => client.traces.list({ offset, limit }),
|
|
65
|
+
* );
|
|
66
|
+
* ```
|
|
60
67
|
*/
|
|
61
|
-
async function
|
|
68
|
+
async function autoPaginate(fetchFn, limit = 50) {
|
|
69
|
+
const result = [];
|
|
70
|
+
let offset = 0;
|
|
71
|
+
let hasMore = true;
|
|
72
|
+
while (hasMore) {
|
|
73
|
+
const items = await fetchFn(offset, limit);
|
|
74
|
+
if (items.length === 0) {
|
|
75
|
+
break;
|
|
76
|
+
}
|
|
77
|
+
result.push(...items);
|
|
78
|
+
hasMore = items.length === limit;
|
|
79
|
+
offset += limit;
|
|
80
|
+
}
|
|
81
|
+
return result;
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Streaming auto-paginate generator — yields individual items one at a time.
|
|
85
|
+
* Use this when you want to process items as they arrive rather than waiting for all pages.
|
|
86
|
+
*/
|
|
87
|
+
async function* autoPaginateGenerator(fetchFn, limit = 50) {
|
|
62
88
|
let offset = 0;
|
|
63
89
|
let hasMore = true;
|
|
64
90
|
while (hasMore) {
|
|
@@ -208,12 +208,7 @@ function generateDefineEvalCode(suite, options = {}) {
|
|
|
208
208
|
});
|
|
209
209
|
const helperFunctions = generateHelperFunctionsForSuite(specs, options);
|
|
210
210
|
const evaluationFunction = generateEvaluationFunction();
|
|
211
|
-
return [
|
|
212
|
-
...imports,
|
|
213
|
-
...helperFunctions,
|
|
214
|
-
...evaluationFunction,
|
|
215
|
-
...specCode,
|
|
216
|
-
].join("\n");
|
|
211
|
+
return [...imports, helperFunctions, evaluationFunction, ...specCode].join("\n");
|
|
217
212
|
}
|
|
218
213
|
/**
|
|
219
214
|
* Generate helper functions for a specific spec
|
package/dist/runtime/eval.d.ts
CHANGED
|
@@ -4,12 +4,19 @@
|
|
|
4
4
|
* The core DSL function for defining behavioral specifications.
|
|
5
5
|
* Uses content-addressable identity with AST position for stability.
|
|
6
6
|
*/
|
|
7
|
-
import
|
|
7
|
+
import { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime } from "./registry";
|
|
8
|
+
import type { DefineEvalFunction, EvalContext, EvalResult, EvalSpec } from "./types";
|
|
8
9
|
/**
|
|
9
10
|
* Export the defineEval function with proper typing
|
|
10
11
|
* This is the main DSL entry point
|
|
11
12
|
*/
|
|
12
13
|
export declare const defineEval: DefineEvalFunction;
|
|
14
|
+
/**
|
|
15
|
+
* Filter a list of specs according to skip/only semantics:
|
|
16
|
+
* - If any spec has mode === "only", return only those specs
|
|
17
|
+
* - Otherwise, return all specs except those with mode === "skip"
|
|
18
|
+
*/
|
|
19
|
+
export declare function getFilteredSpecs(specs: EvalSpec[]): EvalSpec[];
|
|
13
20
|
/**
|
|
14
21
|
* Convenience export for evalai.test() alias (backward compatibility)
|
|
15
22
|
* Provides alternative naming that matches the original roadmap vision
|
|
@@ -48,8 +55,11 @@ export declare function createResult(config: {
|
|
|
48
55
|
assertions?: EvalResult["assertions"];
|
|
49
56
|
metadata?: Record<string, unknown>;
|
|
50
57
|
error?: string;
|
|
58
|
+
output?: string;
|
|
59
|
+
durationMs?: number;
|
|
60
|
+
tokens?: number;
|
|
51
61
|
}): EvalResult;
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
62
|
+
export { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime, };
|
|
63
|
+
export { createContext as createEvalContext };
|
|
64
|
+
export { createLocalExecutor } from "./executor";
|
|
55
65
|
export default defineEval;
|
package/dist/runtime/eval.js
CHANGED
|
@@ -39,13 +39,21 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
39
39
|
};
|
|
40
40
|
})();
|
|
41
41
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
-
exports.evalai = exports.defineEval = void 0;
|
|
42
|
+
exports.createLocalExecutor = exports.withRuntime = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.evalai = exports.defineEval = void 0;
|
|
43
|
+
exports.getFilteredSpecs = getFilteredSpecs;
|
|
43
44
|
exports.defineSuite = defineSuite;
|
|
44
45
|
exports.createContext = createContext;
|
|
46
|
+
exports.createEvalContext = createContext;
|
|
45
47
|
exports.createResult = createResult;
|
|
46
48
|
const crypto = __importStar(require("node:crypto"));
|
|
49
|
+
const fs = __importStar(require("node:fs"));
|
|
47
50
|
const path = __importStar(require("node:path"));
|
|
48
51
|
const registry_1 = require("./registry");
|
|
52
|
+
Object.defineProperty(exports, "createEvalRuntime", { enumerable: true, get: function () { return registry_1.createEvalRuntime; } });
|
|
53
|
+
Object.defineProperty(exports, "disposeActiveRuntime", { enumerable: true, get: function () { return registry_1.disposeActiveRuntime; } });
|
|
54
|
+
Object.defineProperty(exports, "getActiveRuntime", { enumerable: true, get: function () { return registry_1.getActiveRuntime; } });
|
|
55
|
+
Object.defineProperty(exports, "setActiveRuntime", { enumerable: true, get: function () { return registry_1.setActiveRuntime; } });
|
|
56
|
+
Object.defineProperty(exports, "withRuntime", { enumerable: true, get: function () { return registry_1.withRuntime; } });
|
|
49
57
|
const types_1 = require("./types");
|
|
50
58
|
/**
|
|
51
59
|
* Extract AST position from call stack
|
|
@@ -159,7 +167,7 @@ function createSpecConfig(nameOrConfig, executor, options) {
|
|
|
159
167
|
/**
|
|
160
168
|
* Core defineEval function implementation
|
|
161
169
|
*/
|
|
162
|
-
function
|
|
170
|
+
function defineEvalWithMode(mode, nameOrConfig, executor, options) {
|
|
163
171
|
// Get caller position for identity
|
|
164
172
|
const callerPosition = getCallerPosition();
|
|
165
173
|
// Create specification configuration
|
|
@@ -187,15 +195,124 @@ function defineEvalImpl(nameOrConfig, executor, options) {
|
|
|
187
195
|
budget: config.budget,
|
|
188
196
|
model: config.model,
|
|
189
197
|
},
|
|
198
|
+
mode,
|
|
190
199
|
};
|
|
191
200
|
// Register specification
|
|
192
201
|
runtime.register(spec);
|
|
193
202
|
}
|
|
203
|
+
function defineEvalImpl(nameOrConfig, executor, options) {
|
|
204
|
+
defineEvalWithMode("normal", nameOrConfig, executor, options);
|
|
205
|
+
}
|
|
206
|
+
function defineEvalSkipImpl(nameOrConfig, executor, options) {
|
|
207
|
+
defineEvalWithMode("skip", nameOrConfig, executor, options);
|
|
208
|
+
}
|
|
209
|
+
function defineEvalOnlyImpl(nameOrConfig, executor, options) {
|
|
210
|
+
defineEvalWithMode("only", nameOrConfig, executor, options);
|
|
211
|
+
}
|
|
194
212
|
/**
|
|
195
213
|
* Export the defineEval function with proper typing
|
|
196
214
|
* This is the main DSL entry point
|
|
197
215
|
*/
|
|
198
216
|
exports.defineEval = defineEvalImpl;
|
|
217
|
+
// Attach .skip and .only modifiers (vitest/jest convention)
|
|
218
|
+
exports.defineEval.skip = defineEvalSkipImpl;
|
|
219
|
+
exports.defineEval.only = defineEvalOnlyImpl;
|
|
220
|
+
/**
|
|
221
|
+
* Parse a JSONL file into an array of row objects.
|
|
222
|
+
* Each line must be a valid JSON object; blank lines are skipped.
|
|
223
|
+
*/
|
|
224
|
+
function parseJsonl(content) {
|
|
225
|
+
return content
|
|
226
|
+
.split("\n")
|
|
227
|
+
.map((line) => line.trim())
|
|
228
|
+
.filter((line) => line.length > 0)
|
|
229
|
+
.map((line, i) => {
|
|
230
|
+
try {
|
|
231
|
+
return JSON.parse(line);
|
|
232
|
+
}
|
|
233
|
+
catch {
|
|
234
|
+
throw new types_1.SpecRegistrationError(`Invalid JSON on line ${i + 1} of dataset`);
|
|
235
|
+
}
|
|
236
|
+
});
|
|
237
|
+
}
|
|
238
|
+
/**
 * Minimal CSV reader: turns CSV text into an array of row objects.
 *
 * The first non-blank line supplies the column names; each later non-blank
 * line becomes one object keyed by those names. Lines and cells are trimmed,
 * cells are split on every comma (no quoting or escaping support — use a
 * dedicated library for that), missing cells become "" and surplus cells are
 * ignored. Fewer than two non-blank lines yields an empty array.
 */
function parseCsv(content) {
    const nonBlankLines = [];
    for (const rawLine of content.split("\n")) {
        const cleaned = rawLine.trim();
        if (cleaned.length > 0) {
            nonBlankLines.push(cleaned);
        }
    }
    if (nonBlankLines.length < 2) {
        return [];
    }
    const splitCells = (text) => text.split(",").map((cell) => cell.trim());
    const [headerLine, ...dataLines] = nonBlankLines;
    const headers = splitCells(headerLine);
    return dataLines.map((dataLine) => {
        const cells = splitCells(dataLine);
        const row = {};
        headers.forEach((header, column) => {
            row[header] = cells[column] ?? "";
        });
        return row;
    });
}
|
|
260
|
+
/**
|
|
261
|
+
* Load a JSONL or CSV dataset and register one spec per row.
|
|
262
|
+
*/
|
|
263
|
+
function fromDatasetImpl(name, datasetPath, executor, options) {
|
|
264
|
+
const resolvedPath = path.isAbsolute(datasetPath)
|
|
265
|
+
? datasetPath
|
|
266
|
+
: path.resolve(process.cwd(), datasetPath);
|
|
267
|
+
if (!fs.existsSync(resolvedPath)) {
|
|
268
|
+
throw new types_1.SpecRegistrationError(`Dataset file not found: ${resolvedPath}`);
|
|
269
|
+
}
|
|
270
|
+
const content = fs.readFileSync(resolvedPath, "utf8");
|
|
271
|
+
const ext = path.extname(resolvedPath).toLowerCase();
|
|
272
|
+
let rows;
|
|
273
|
+
if (ext === ".jsonl" || ext === ".ndjson") {
|
|
274
|
+
rows = parseJsonl(content);
|
|
275
|
+
}
|
|
276
|
+
else if (ext === ".csv") {
|
|
277
|
+
rows = parseCsv(content);
|
|
278
|
+
}
|
|
279
|
+
else if (ext === ".json") {
|
|
280
|
+
const parsed = JSON.parse(content);
|
|
281
|
+
rows = Array.isArray(parsed) ? parsed : [parsed];
|
|
282
|
+
}
|
|
283
|
+
else {
|
|
284
|
+
throw new types_1.SpecRegistrationError(`Unsupported dataset format: ${ext}. Use .jsonl, .ndjson, .csv, or .json`);
|
|
285
|
+
}
|
|
286
|
+
if (rows.length === 0) {
|
|
287
|
+
throw new types_1.SpecRegistrationError(`Dataset is empty: ${resolvedPath}`);
|
|
288
|
+
}
|
|
289
|
+
for (let i = 0; i < rows.length; i++) {
|
|
290
|
+
const row = rows[i];
|
|
291
|
+
const specName = `${name} - row ${i + 1}`;
|
|
292
|
+
const wrappedExecutor = (context) => executor({ ...context, input: row });
|
|
293
|
+
defineEvalWithMode("normal", specName, wrappedExecutor, {
|
|
294
|
+
...options,
|
|
295
|
+
metadata: {
|
|
296
|
+
...options?.metadata,
|
|
297
|
+
datasetPath: resolvedPath,
|
|
298
|
+
datasetRow: i + 1,
|
|
299
|
+
},
|
|
300
|
+
});
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
exports.defineEval.fromDataset = fromDatasetImpl;
|
|
304
|
+
/**
 * Apply skip/only semantics to a list of specs.
 *
 * When at least one spec has mode "only", just those specs are returned.
 * Otherwise every spec is returned except the ones with mode "skip".
 * The input array is not modified; a new array is returned.
 */
function getFilteredSpecs(specs) {
    const hasFocusedSpec = specs.some((spec) => spec.mode === "only");
    const shouldRun = hasFocusedSpec
        ? (spec) => spec.mode === "only"
        : (spec) => spec.mode !== "skip";
    return specs.filter(shouldRun);
}
|
|
199
316
|
/**
|
|
200
317
|
* Convenience export for evalai.test() alias (backward compatibility)
|
|
201
318
|
* Provides alternative naming that matches the original roadmap vision
|
|
@@ -245,9 +362,17 @@ function createResult(config) {
|
|
|
245
362
|
assertions: config.assertions,
|
|
246
363
|
metadata: config.metadata,
|
|
247
364
|
error: config.error,
|
|
365
|
+
output: config.output,
|
|
366
|
+
durationMs: config.durationMs,
|
|
367
|
+
tokens: config.tokens,
|
|
248
368
|
};
|
|
249
369
|
}
|
|
250
370
|
/**
|
|
251
371
|
* Default export for convenience
|
|
252
372
|
*/
|
|
373
|
+
// Register defineEval with registry to break circular dependency
|
|
374
|
+
(0, registry_1._registerDefineEval)(exports.defineEval);
|
|
375
|
+
// Re-export createLocalExecutor from executor.ts
|
|
376
|
+
var executor_1 = require("./executor");
|
|
377
|
+
Object.defineProperty(exports, "createLocalExecutor", { enumerable: true, get: function () { return executor_1.createLocalExecutor; } });
|
|
253
378
|
exports.default = exports.defineEval;
|
|
@@ -10,7 +10,8 @@ import type { LocalExecutor } from "./types";
|
|
|
10
10
|
*/
|
|
11
11
|
export declare function createLocalExecutor(): LocalExecutor;
|
|
12
12
|
/**
|
|
13
|
-
* Default local executor
|
|
13
|
+
* Default local executor factory
|
|
14
|
+
* Call as defaultLocalExecutor() to get a new executor instance.
|
|
14
15
|
* For convenience in simple use cases
|
|
15
16
|
*/
|
|
16
|
-
export declare const defaultLocalExecutor:
|
|
17
|
+
export declare const defaultLocalExecutor: typeof createLocalExecutor;
|
package/dist/runtime/executor.js
CHANGED
|
@@ -146,7 +146,8 @@ function createLocalExecutor() {
|
|
|
146
146
|
return new LocalExecutorImpl();
|
|
147
147
|
}
|
|
148
148
|
/**
|
|
149
|
-
* Default local executor
|
|
149
|
+
* Default local executor factory
|
|
150
|
+
* Call as defaultLocalExecutor() to get a new executor instance.
|
|
150
151
|
* For convenience in simple use cases
|
|
151
152
|
*/
|
|
152
|
-
exports.defaultLocalExecutor = createLocalExecutor
|
|
153
|
+
exports.defaultLocalExecutor = createLocalExecutor;
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
* Scoped registry with proper lifecycle management.
|
|
5
5
|
* Prevents cross-run contamination and memory leaks.
|
|
6
6
|
*/
|
|
7
|
-
import type { EvalRuntime } from "./types";
|
|
7
|
+
import type { DefineEvalFunction, EvalRuntime } from "./types";
|
|
8
|
+
/** @internal Called by eval.ts to register defineEval without circular import */
|
|
9
|
+
export declare function _registerDefineEval(fn: (...args: unknown[]) => unknown): void;
|
|
8
10
|
/**
|
|
9
11
|
* Runtime interface with lifecycle management
|
|
10
12
|
* Ensures proper cleanup and prevents resource leaks
|
|
@@ -13,7 +15,7 @@ export interface RuntimeHandle {
|
|
|
13
15
|
/** Runtime instance */
|
|
14
16
|
runtime: EvalRuntime;
|
|
15
17
|
/** defineEval function bound to this runtime */
|
|
16
|
-
defineEval:
|
|
18
|
+
defineEval: DefineEvalFunction;
|
|
17
19
|
/** Dispose runtime and clean up resources */
|
|
18
20
|
dispose(): void;
|
|
19
21
|
/** Create runtime snapshot for persistence */
|
|
@@ -61,7 +63,10 @@ export interface SerializedSpec {
|
|
|
61
63
|
* Create a new scoped runtime with lifecycle management
|
|
62
64
|
* Returns a handle for proper resource management
|
|
63
65
|
*/
|
|
64
|
-
export declare function createEvalRuntime(
|
|
66
|
+
export declare function createEvalRuntime(projectRootOrConfig?: string | {
|
|
67
|
+
name?: string;
|
|
68
|
+
projectRoot?: string;
|
|
69
|
+
}): RuntimeHandle;
|
|
65
70
|
/**
|
|
66
71
|
* Helper function for safe runtime execution with automatic cleanup
|
|
67
72
|
* Ensures runtime is disposed even if an exception is thrown
|