@deepagents/evals 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/dist/comparison/index.d.ts +41 -0
- package/dist/comparison/index.d.ts.map +1 -0
- package/dist/comparison/index.js +106 -0
- package/dist/comparison/index.js.map +7 -0
- package/dist/dataset/hf.d.ts +16 -0
- package/dist/dataset/hf.d.ts.map +1 -0
- package/dist/dataset/index.d.ts +17 -0
- package/dist/dataset/index.d.ts.map +1 -0
- package/dist/dataset/index.js +256 -0
- package/dist/dataset/index.js.map +7 -0
- package/dist/engine/index.d.ts +67 -0
- package/dist/engine/index.d.ts.map +1 -0
- package/dist/engine/index.js +332 -0
- package/dist/engine/index.js.map +7 -0
- package/dist/evaluate/index.d.ts +47 -0
- package/dist/evaluate/index.d.ts.map +1 -0
- package/dist/evaluate/index.js +977 -0
- package/dist/evaluate/index.js.map +7 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1763 -0
- package/dist/index.js.map +7 -0
- package/dist/reporters/console.d.ts +6 -0
- package/dist/reporters/console.d.ts.map +1 -0
- package/dist/reporters/csv.d.ts +6 -0
- package/dist/reporters/csv.d.ts.map +1 -0
- package/dist/reporters/format.d.ts +12 -0
- package/dist/reporters/format.d.ts.map +1 -0
- package/dist/reporters/html.d.ts +6 -0
- package/dist/reporters/html.d.ts.map +1 -0
- package/dist/reporters/index.d.ts +12 -0
- package/dist/reporters/index.d.ts.map +1 -0
- package/dist/reporters/index.js +447 -0
- package/dist/reporters/index.js.map +7 -0
- package/dist/reporters/json.d.ts +7 -0
- package/dist/reporters/json.d.ts.map +1 -0
- package/dist/reporters/markdown.d.ts +6 -0
- package/dist/reporters/markdown.d.ts.map +1 -0
- package/dist/reporters/shared.d.ts +11 -0
- package/dist/reporters/shared.d.ts.map +1 -0
- package/dist/reporters/types.d.ts +35 -0
- package/dist/reporters/types.d.ts.map +1 -0
- package/dist/scorers/index.d.ts +30 -0
- package/dist/scorers/index.d.ts.map +1 -0
- package/dist/scorers/index.js +175 -0
- package/dist/scorers/index.js.map +7 -0
- package/dist/store/index.d.ts +103 -0
- package/dist/store/index.d.ts.map +1 -0
- package/dist/store/index.js +361 -0
- package/dist/store/index.js.map +7 -0
- package/package.json +99 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1763 @@
|
|
|
1
|
+
// packages/evals/src/dataset/index.ts
|
|
2
|
+
import { createReadStream } from "node:fs";
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { extname } from "node:path";
|
|
5
|
+
import { createInterface } from "node:readline";
|
|
6
|
+
|
|
7
|
+
// packages/evals/src/dataset/hf.ts
|
|
8
|
+
// Endpoint of the Hugging Face datasets-server "rows" API that `buildUrl` targets.
var HF_BASE_URL = "https://datasets-server.huggingface.co/rows";
// Upper bound on the number of rows requested per page by `paginate`.
var PAGE_SIZE = 100;

/**
 * Creates a lazy async-iterable over a Hugging Face dataset.
 *
 * Nothing is fetched until the returned object is iterated; every new
 * iteration starts a fresh `paginate(options)` run.
 *
 * @param {object} options - Passed through untouched to `paginate`.
 * @returns {{ [Symbol.asyncIterator](): AsyncGenerator }} An async-iterable source.
 */
function hf(options) {
  const startIteration = () => paginate(options);
  return { [Symbol.asyncIterator]: startIteration };
}
|
|
17
|
+
/**
 * Streams dataset rows page by page.
 *
 * Each page is requested through `buildUrl` + `fetchPage`; the `row` property
 * of every entry in the page's `rows` array is yielded in order.
 *
 * @param {object} options
 * @param {string} options.dataset - Dataset identifier forwarded to `buildUrl`.
 * @param {string} options.config - Config name forwarded to `buildUrl`.
 * @param {string} options.split - Split name forwarded to `buildUrl`.
 * @param {number} [options.rows] - Maximum number of rows to yield; when
 *   nullish, rows are streamed until the server has no more.
 * @yields {*} The `row` payload of each page entry.
 */
async function* paginate(options) {
  const { dataset: dataset2, config, split, rows } = options;
  const maxRows = rows ?? Infinity;
  let cursor = 0;
  let emitted = 0;
  while (emitted < maxRows) {
    // `Infinity - emitted` is still Infinity, so an unbounded run always asks for a full page.
    const pageSize = Math.min(PAGE_SIZE, maxRows - emitted);
    const page = await fetchPage(buildUrl(dataset2, config, split, cursor, pageSize));
    if (page.rows.length === 0) return;
    for (const entry of page.rows) {
      yield entry.row;
      emitted += 1;
      if (emitted >= maxRows) return;
    }
    cursor += page.rows.length;
    // Stop on a short page or once the reported total has been consumed.
    const shortPage = page.rows.length < pageSize;
    if (shortPage || cursor >= page.num_rows_total) return;
  }
}
|
|
36
|
+
/**
 * Builds the request URL for one page of rows.
 *
 * @param {string} dataset2 - Value of the `dataset` query parameter.
 * @param {string} config - Value of the `config` query parameter.
 * @param {string} split - Value of the `split` query parameter.
 * @param {number} offset - Index of the first row of the page.
 * @param {number} length - Number of rows requested.
 * @returns {string} The serialized URL based on `HF_BASE_URL`.
 */
function buildUrl(dataset2, config, split, offset, length) {
  const target = new URL(HF_BASE_URL);
  const query = [
    ["dataset", dataset2],
    ["config", config],
    ["split", split],
    ["offset", String(offset)],
    ["length", String(length)]
  ];
  for (const [key, value] of query) {
    target.searchParams.set(key, value);
  }
  return target.toString();
}
|
|
45
|
+
/**
 * Fetches one page and parses its body as JSON.
 *
 * @param {string} url - Fully built request URL.
 * @returns {Promise<*>} The parsed JSON payload.
 * @throws {Error} When the response status is not OK (the message carries the
 *   status code plus the response body, or the status text if the body is
 *   empty or unreadable), or when the body is not valid JSON (the message
 *   carries the URL and the first 200 characters of the body).
 */
async function fetchPage(url) {
  const response = await fetch(url);
  if (response.ok) {
    const raw = await response.text();
    try {
      return JSON.parse(raw);
    } catch {
      throw new Error(
        `HuggingFace API returned non-JSON response from ${url}: ${raw.slice(0, 200)}`
      );
    }
  }
  // Reading the error body is best effort; fall back to the status text.
  const detail = await response.text().catch(() => "");
  throw new Error(
    `HuggingFace API error ${response.status}: ${detail || response.statusText}`
  );
}
|
|
62
|
+
|
|
63
|
+
// packages/evals/src/dataset/index.ts
|
|
64
|
+
/**
 * Lazy, chainable view over an async source of items.
 *
 * A Dataset wraps a zero-argument factory that returns an async iterable.
 * Every transformation returns a new Dataset and leaves the receiver
 * untouched; the source factory is only invoked when a Dataset is iterated.
 */
var Dataset = class _Dataset {
  #source;

  /**
   * @param {() => AsyncIterable<*>} source - Factory invoked once per iteration.
   */
  constructor(source) {
    this.#source = source;
  }

  /**
   * Transforms every item with `fn`.
   *
   * @param {(item: *) => *} fn - Mapping function.
   * @returns {_Dataset} A new lazy dataset of mapped items.
   */
  map(fn) {
    const source = this.#source;
    return new _Dataset(async function* () {
      for await (const item of source()) {
        yield fn(item);
      }
    });
  }

  /**
   * Keeps only the items for which `fn` returns a truthy value.
   *
   * @param {(item: *) => boolean} fn - Predicate.
   * @returns {_Dataset} A new lazy dataset of retained items.
   */
  filter(fn) {
    const source = this.#source;
    return new _Dataset(async function* () {
      for await (const item of source()) {
        if (fn(item)) yield item;
      }
    });
  }

  /**
   * Yields at most `n` items.
   *
   * The source is closed right after the n-th item is delivered, so no extra
   * item (or extra remote page) is ever pulled; for `n <= 0` the source is
   * not touched at all.
   *
   * @param {number} n - Maximum number of items.
   * @returns {_Dataset} A new lazy dataset truncated to `n` items.
   */
  limit(n) {
    const source = this.#source;
    return new _Dataset(async function* () {
      if (n <= 0) return;
      let count = 0;
      for await (const item of source()) {
        yield item;
        count++;
        // Check after yielding so the source is never asked for item n + 1.
        if (count >= n) return;
      }
    });
  }

  /**
   * Yields all items in random order (Fisher-Yates using `Math.random`).
   * The whole source is buffered in memory first.
   *
   * @returns {_Dataset} A new dataset with shuffled items.
   */
  shuffle() {
    const source = this.#source;
    return new _Dataset(async function* () {
      const items = [];
      for await (const item of source()) {
        items.push(item);
      }
      for (let i = items.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        const temp = items[i];
        items[i] = items[j];
        items[j] = temp;
      }
      yield* items;
    });
  }

  /**
   * Yields up to `n` randomly chosen distinct items.
   * The whole source is buffered in memory first.
   *
   * @param {number} n - Sample size; clamped to `[0, item count]` and rounded
   *   down so that array indices stay integral.
   * @returns {_Dataset} A new dataset containing the sample.
   */
  sample(n) {
    const source = this.#source;
    return new _Dataset(async function* () {
      const items = [];
      for await (const item of source()) {
        items.push(item);
      }
      const count = Math.min(Math.max(0, Math.floor(n)), items.length);
      // Partial Fisher-Yates: only the last `count` slots need to be randomized.
      for (let i = items.length - 1; i > items.length - count - 1; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        const temp = items[i];
        items[i] = items[j];
        items[j] = temp;
      }
      for (let i = items.length - count; i < items.length; i++) {
        yield items[i];
      }
    });
  }

  /**
   * Drains the dataset into an array.
   *
   * @returns {Promise<Array<*>>} All items in iteration order.
   */
  async toArray() {
    const result = [];
    for await (const item of this.#source()) {
      result.push(item);
    }
    return result;
  }

  /** Makes the dataset usable directly in `for await` loops. */
  [Symbol.asyncIterator]() {
    return this.#source()[Symbol.asyncIterator]();
  }
};
|
|
142
|
+
/**
 * Splits a single CSV line into its fields.
 *
 * A field is treated as quoted only when the double quote is its first
 * character; inside a quoted field `""` stands for one literal quote and
 * commas are ordinary characters. A quote appearing after other characters
 * of an unquoted field is kept verbatim.
 *
 * @param {string} line - One line of CSV text, without the line terminator.
 * @returns {string[]} The decoded field values (always at least one).
 */
function parseCSVLine(line) {
  const QUOTE = '"';
  const out = [];
  let buffer = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line[pos];
    if (!quoted) {
      if (ch === QUOTE && buffer === "") {
        quoted = true;
      } else if (ch === ",") {
        out.push(buffer);
        buffer = "";
      } else {
        buffer += ch;
      }
    } else if (ch !== QUOTE) {
      buffer += ch;
    } else if (line[pos + 1] === QUOTE) {
      // Doubled quote inside a quoted field: emit one quote and skip its twin.
      buffer += QUOTE;
      pos += 1;
    } else {
      quoted = false;
    }
    pos += 1;
  }
  out.push(buffer);
  return out;
}
|
|
173
|
+
/**
 * Creates a source factory that reads a JSON file holding a top-level array.
 *
 * The file is read and parsed only when the returned generator is iterated.
 *
 * @param {string} filePath - Path of the JSON file.
 * @returns {() => AsyncGenerator} Factory yielding each array element in order.
 *   Iteration throws when the parsed content is not an array.
 */
function loadJSON(filePath) {
  async function* readRecords() {
    const parsed = JSON.parse(await readFile(filePath, "utf-8"));
    if (Array.isArray(parsed)) {
      yield* parsed;
      return;
    }
    throw new Error(`JSON file "${filePath}" does not contain an array`);
  }
  return readRecords;
}
|
|
183
|
+
/**
 * Creates a source factory that streams a JSON Lines file.
 *
 * Each non-blank line is trimmed and parsed with `JSON.parse`; blank lines
 * are skipped. The file is opened only when the returned generator is
 * iterated.
 *
 * @param {string} filePath - Path of the `.jsonl` file.
 * @returns {() => AsyncGenerator} Factory yielding one parsed value per line.
 *   Iteration throws the `JSON.parse` error of the first malformed line.
 */
function loadJSONL(filePath) {
  return async function* () {
    const stream = createReadStream(filePath, "utf-8");
    const rl = createInterface({
      input: stream,
      crlfDelay: Infinity
    });
    try {
      for await (const line of rl) {
        const trimmed = line.trim();
        if (trimmed) {
          yield JSON.parse(trimmed);
        }
      }
    } finally {
      rl.close();
      // Closing the readline interface does not close its input; destroy the
      // stream so the file descriptor is released when the consumer stops
      // early or a line fails to parse.
      stream.destroy();
    }
  };
}
|
|
201
|
+
/**
 * Creates a source factory that streams a CSV file as row objects.
 *
 * The first non-blank line supplies the column names. Every later non-blank
 * line becomes an object keyed by those names; a missing trailing field is
 * filled with `""` and fields beyond the header count are ignored. Lines are
 * trimmed before parsing and split with `parseCSVLine`, so a quoted field
 * cannot span several lines.
 *
 * @param {string} filePath - Path of the `.csv` file.
 * @returns {() => AsyncGenerator} Factory yielding one object per data line.
 */
function loadCSV(filePath) {
  return async function* () {
    const stream = createReadStream(filePath, "utf-8");
    const rl = createInterface({
      input: stream,
      crlfDelay: Infinity
    });
    try {
      let headers;
      for await (const line of rl) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        const fields = parseCSVLine(trimmed);
        if (!headers) {
          headers = fields;
          continue;
        }
        const row = {};
        for (let i = 0; i < headers.length; i++) {
          row[headers[i]] = fields[i] ?? "";
        }
        yield row;
      }
    } finally {
      rl.close();
      // Closing the readline interface does not close its input; destroy the
      // stream so the file descriptor is released when the consumer stops early.
      stream.destroy();
    }
  };
}
|
|
228
|
+
/**
 * Builds a `Dataset` from an in-memory array, an async iterable, or a file path.
 *
 * @param {Array<*>|AsyncIterable<*>|string} source - Array of items, an object
 *   exposing `Symbol.asyncIterator`, or a path ending in `.json`, `.jsonl` or
 *   `.csv` (the extension is matched case-insensitively).
 * @returns {Dataset} A lazy dataset over the given source.
 * @throws {Error} When a file path has an unsupported extension.
 */
function dataset(source) {
  if (Array.isArray(source)) {
    const fromArray = async function* () {
      yield* source;
    };
    return new Dataset(fromArray);
  }
  const isAsyncIterable = typeof source === "object" && Symbol.asyncIterator in source;
  if (isAsyncIterable) {
    return new Dataset(() => source);
  }
  const ext = extname(source).toLowerCase();
  const loaders = /* @__PURE__ */ new Map([
    [".json", loadJSON],
    [".jsonl", loadJSONL],
    [".csv", loadCSV]
  ]);
  const loader = loaders.get(ext);
  if (loader === void 0) {
    throw new Error(
      `Unsupported file extension "${ext}" for dataset file "${source}". Supported: .json, .jsonl, .csv`
    );
  }
  return new Dataset(loader(source));
}
|
|
251
|
+
|
|
252
|
+
// packages/evals/src/scorers/index.ts
|
|
253
|
+
import { generateObject } from "ai";
|
|
254
|
+
import { z } from "zod";
|
|
255
|
+
/**
 * Scorer: 1 when `output` is strictly equal to the stringified `expected`
 * (a nullish `expected` counts as the empty string), otherwise 0 with a
 * reason quoting both values.
 *
 * @param {{ output: string, expected?: * }} args
 * @returns {Promise<{ score: number, reason?: string }>}
 */
var exactMatch = async ({ output, expected }) => {
  const wanted = String(expected ?? "");
  const matched = output === wanted;
  return matched
    ? { score: 1 }
    : {
        score: 0,
        reason: `Output does not exactly match expected. Expected "${wanted}" but got "${output}".`
      };
};
|
|
263
|
+
/**
 * Scorer: 1 when `output` contains the stringified `expected` as a substring
 * (a nullish `expected` becomes the empty string, which every output
 * contains), otherwise 0 with a reason naming the missing substring.
 *
 * @param {{ output: string, expected?: * }} args
 * @returns {Promise<{ score: number, reason?: string }>}
 */
var includes = async ({ output, expected }) => {
  const needle = String(expected ?? "");
  if (!output.includes(needle)) {
    return {
      score: 0,
      reason: `Output does not include expected substring "${needle}".`
    };
  }
  return { score: 1 };
};
|
|
271
|
+
/**
 * Creates a scorer that gives 1 when `output` matches `pattern`, else 0.
 *
 * @param {RegExp} pattern - Pattern to test each output against.
 * @returns {(args: { output: string }) => Promise<{ score: number }>}
 */
function regex(pattern) {
  return async ({ output }) => {
    // A /g or /y pattern resumes from `lastIndex` left by the previous call,
    // which would make the score depend on earlier cases; always start at 0.
    pattern.lastIndex = 0;
    return { score: pattern.test(output) ? 1 : 0 };
  };
}
|
|
276
|
+
/**
 * Computes the Levenshtein edit distance between two strings, comparing
 * UTF-16 code units, while keeping only two rows of the DP table.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit insertions, deletions and
 *   substitutions turning one string into the other.
 */
function levenshteinDistance(a, b) {
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;
  // Rows are as wide as the shorter string to keep the buffers small.
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length + 1;
  let previousRow = Array.from({ length: width }, (_, k) => k);
  let currentRow = new Array(width);
  for (let row = 1; row <= longer.length; row++) {
    const target = longer[row - 1];
    currentRow[0] = row;
    for (let col = 1; col < width; col++) {
      const substitution = previousRow[col - 1] + (shorter[col - 1] === target ? 0 : 1);
      const deletion = previousRow[col] + 1;
      const insertion = currentRow[col - 1] + 1;
      currentRow[col] = Math.min(deletion, insertion, substitution);
    }
    const finished = currentRow;
    currentRow = previousRow;
    previousRow = finished;
  }
  return previousRow[shorter.length];
}
|
|
292
|
+
/**
 * Scorer: normalized Levenshtein similarity between `output` and the
 * stringified `expected` (a nullish `expected` becomes the empty string).
 *
 * The score is `1 - distance / max(length)`, clamped at 0; two empty strings
 * score 1. A reason is attached whenever the score is below 1.
 *
 * @param {{ output: string, expected?: * }} args
 * @returns {Promise<{ score: number, reason?: string }>}
 */
var levenshtein = async ({ output, expected }) => {
  const reference = String(expected ?? "");
  const bothEmpty = output.length === 0 && reference.length === 0;
  if (bothEmpty) return { score: 1 };
  const longest = Math.max(output.length, reference.length);
  const distance = levenshteinDistance(output, reference);
  const score = Math.max(0, 1 - distance / longest);
  return score === 1
    ? { score }
    : {
        score,
        reason: `Levenshtein distance is ${distance} across max length ${longest}.`
      };
};
|
|
304
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Primitives compare with `===`; arrays must have equal length and pairwise
 * equal elements; plain objects must have the same own enumerable keys with
 * equal values (key order is irrelevant). An array never equals a non-array,
 * whichever side it is on.
 *
 * @param {*} a
 * @param {*} b
 * @returns {boolean} True when both values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) return true;
  if (a == null || b == null) return false;
  if (typeof a !== typeof b) return false;
  // Test both sides: otherwise `{}` vs `[]` (or `{0: x}` vs `[x]`) would fall
  // through to the key comparison below and be reported as equal.
  if (Array.isArray(a) || Array.isArray(b)) {
    if (!Array.isArray(a) || !Array.isArray(b) || a.length !== b.length) return false;
    return a.every((val, i) => deepEqual(val, b[i]));
  }
  if (typeof a === "object") {
    const keysA = Object.keys(a).sort();
    const keysB = Object.keys(b).sort();
    if (keysA.length !== keysB.length) return false;
    return keysA.every(
      (key, i) => keysB[i] === key && deepEqual(a[key], b[key])
    );
  }
  return false;
}
|
|
325
|
+
/**
 * Scorer: 1 when `output`, parsed as JSON, is structurally equal (per
 * `deepEqual`) to `expected`. A string `expected` is parsed as JSON first;
 * any other value is compared as is.
 *
 * Any exception raised while parsing or comparing yields score 0 with the
 * reason "Failed to parse JSON".
 *
 * @param {{ output: string, expected: * }} args
 * @returns {Promise<{ score: number, reason?: string }>}
 */
var jsonMatch = async ({ output, expected }) => {
  try {
    const actual = JSON.parse(output);
    const wanted = typeof expected === "string" ? JSON.parse(expected) : expected;
    return deepEqual(actual, wanted)
      ? { score: 1 }
      : { score: 0, reason: "JSON payload differs from expected JSON." };
  } catch {
    return { score: 0, reason: "Failed to parse JSON" };
  }
};
|
|
335
|
+
// Shape every LLM-backed scorer asks the model to produce:
// a score within [0, 1] plus a textual reason.
var llmScorerSchema = z.object({
  score: z.number().min(0).max(1),
  reason: z.string()
});

/**
 * Creates a scorer that asks a model to grade the output against free-form
 * criteria.
 *
 * @param {{ model: *, criteria: string }} config - `model` is handed to
 *   `generateObject`; `criteria` is interpolated into the prompt.
 * @returns {(args: { input: *, output: string, expected?: * }) => Promise<{ score: number, reason: string }>}
 */
function llmJudge(config) {
  return async ({ input, output, expected }) => {
    // Assembled line by line; the "Expected" line is left empty when there is no reference.
    const prompt = [
      "You are an expert evaluator. Grade the output based on the following criteria:",
      `${config.criteria}`,
      "",
      `Input: ${JSON.stringify(input)}`,
      `Output: ${output}`,
      expected != null ? `Expected: ${JSON.stringify(expected)}` : "",
      "",
      "Return a score from 0.0 to 1.0 and a brief reason."
    ].join("\n");
    const {
      object: { score, reason }
    } = await generateObject({
      model: config.model,
      schema: llmScorerSchema,
      prompt
    });
    return { score, reason };
  };
}
|
|
356
|
+
/**
 * Creates a scorer that asks a model whether the output is factually
 * consistent with the expected reference.
 *
 * @param {{ model: * }} config - `model` is handed to `generateObject`.
 * @returns {(args: { input: *, output: string, expected: * }) => Promise<{ score: number, reason: string }>}
 */
function factuality(config) {
  return async ({ input, output, expected }) => {
    const prompt = [
      "You are a factuality evaluator. Determine whether the output is factually consistent with the expected reference.",
      "",
      `Input: ${JSON.stringify(input)}`,
      `Output: ${output}`,
      `Expected reference: ${JSON.stringify(expected)}`,
      "",
      "Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.",
      "Return a score from 0.0 to 1.0 and a brief reason."
    ].join("\n");
    const {
      object: { score, reason }
    } = await generateObject({
      model: config.model,
      schema: llmScorerSchema,
      prompt
    });
    return { score, reason };
  };
}
|
|
373
|
+
/**
 * Combines scorers so that the result is as strict as the strictest one.
 *
 * All scorers run concurrently on the same arguments. The combined score is
 * the minimum individual score; the combined reason joins every non-empty
 * individual reason with "; ". With no scorers the score is 1.
 *
 * @param {...Function} scorers - Scorers to combine.
 * @returns {(args: object) => Promise<{ score: number, reason?: string }>}
 */
function all(...scorers) {
  return async (args) => {
    if (scorers.length === 0) return { score: 1 };
    const results = await Promise.all(scorers.map((s) => s(args)));
    let lowest = results[0];
    const notes = [];
    for (const result of results) {
      // Strict comparison keeps the earliest result among equal minima.
      if (result.score < lowest.score) lowest = result;
      if (result.reason) notes.push(result.reason);
    }
    const reason = notes.join("; ");
    return { score: lowest.score, reason: reason || void 0 };
  };
}
|
|
384
|
+
/**
 * Combines scorers so that the result is as lenient as the most lenient one.
 *
 * All scorers run concurrently on the same arguments. The combined result
 * carries the score and reason of the highest-scoring scorer (the earliest
 * one on ties). With no scorers the score is 0.
 *
 * @param {...Function} scorers - Scorers to combine.
 * @returns {(args: object) => Promise<{ score: number, reason?: string }>}
 */
function any(...scorers) {
  return async (args) => {
    if (scorers.length === 0) return { score: 0 };
    const results = await Promise.all(scorers.map((s) => s(args)));
    let best = results[0];
    for (const candidate of results) {
      if (candidate.score > best.score) best = candidate;
    }
    const { score, reason } = best;
    return { score, reason };
  };
}
|
|
394
|
+
/**
 * Combines named scorers into a weighted average.
 *
 * All scorers run concurrently on the same arguments. The score is
 * `sum(score * weight) / sum(weight)`, or 0 when the total weight is not
 * positive. The reason lists every scorer as `name: score (w=weight)` with
 * the score shown to two decimals.
 *
 * @param {Record<string, { scorer: Function, weight: number }>} config
 * @returns {(args: object) => Promise<{ score: number, reason?: string }>}
 */
function weighted(config) {
  return async (args) => {
    const scored = await Promise.all(
      Object.entries(config).map(async ([name, { scorer, weight }]) => {
        const result = await scorer(args);
        return { name, result, weight };
      })
    );
    let totalWeight = 0;
    let weightedSum = 0;
    const parts = [];
    for (const { name, result, weight } of scored) {
      totalWeight += weight;
      weightedSum += result.score * weight;
      parts.push(`${name}: ${result.score.toFixed(2)} (w=${weight})`);
    }
    const score = totalWeight > 0 ? weightedSum / totalWeight : 0;
    const reason = parts.join(", ");
    return { score, reason: reason || void 0 };
  };
}
|
|
414
|
+
|
|
415
|
+
// packages/evals/src/store/index.ts
|
|
416
|
+
import { mkdirSync } from "node:fs";
|
|
417
|
+
import { dirname } from "node:path";
|
|
418
|
+
import { DatabaseSync } from "node:sqlite";
|
|
419
|
+
|
|
420
|
+
// packages/evals/src/store/ddl.sqlite.sql
|
|
421
|
+
// Bundled SQLite schema, executed as-is by the RunStore constructor each time a store is opened.
// It sets the journal_mode, synchronous and foreign_keys pragmas, then creates (IF NOT EXISTS)
// the suites, runs, cases, scores and prompts tables together with their indexes.
// Note: `prompts` is declared here without a `version` column; RunStore's
// #migratePromptsTableIfNeeded rebuilds the table to add it.
var ddl_sqlite_default = "PRAGMA journal_mode = WAL;\nPRAGMA synchronous = NORMAL;\nPRAGMA foreign_keys = ON;\n\nCREATE TABLE IF NOT EXISTS suites (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE TABLE IF NOT EXISTS runs (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id);\nCREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);\n\nCREATE TABLE IF NOT EXISTS cases (\n id TEXT PRIMARY KEY,\n run_id TEXT NOT NULL,\n idx INTEGER NOT NULL,\n input TEXT NOT NULL,\n output TEXT,\n expected TEXT,\n latency_ms INTEGER,\n tokens_in INTEGER,\n tokens_out INTEGER,\n error TEXT,\n FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_cases_run_id ON cases(run_id);\n\nCREATE TABLE IF NOT EXISTS scores (\n id TEXT PRIMARY KEY,\n case_id TEXT NOT NULL,\n scorer_name TEXT NOT NULL,\n score REAL NOT NULL,\n reason TEXT,\n FOREIGN KEY (case_id) REFERENCES cases(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_scores_case_id ON scores(case_id);\n\nCREATE TABLE IF NOT EXISTS prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL UNIQUE,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at);\n";
|
|
422
|
+
|
|
423
|
+
// packages/evals/src/store/index.ts
|
|
424
|
+
/**
 * Synchronous SQLite-backed store for suites, runs, cases, scores and prompts.
 *
 * Opening a store applies the bundled schema and then runs two in-place
 * migrations: one that rebuilds `runs` so that `suite_id` is NOT NULL with an
 * ON DELETE CASCADE foreign key, and one that rebuilds `prompts` to add a
 * `version` column.
 */
var RunStore = class {
  #db;
  // Prepared statements keyed by their SQL text.
  #statements = /* @__PURE__ */ new Map();

  /** Returns the cached prepared statement for `sql`, preparing it on first use. */
  #stmt(sql) {
    let stmt = this.#statements.get(sql);
    if (!stmt) {
      stmt = this.#db.prepare(sql);
      this.#statements.set(sql, stmt);
    }
    return stmt;
  }

  /**
   * Runs `fn` inside a transaction: commits when it returns, rolls back and
   * rethrows when it throws.
   */
  #transaction(fn) {
    this.#db.exec("BEGIN TRANSACTION");
    try {
      const result = fn();
      this.#db.exec("COMMIT");
      return result;
    } catch (error) {
      try {
        this.#db.exec("ROLLBACK");
      } catch {
        // The rollback itself can fail (for example when SQLite has already
        // ended the transaction); the original error is the one worth reporting.
      }
      throw error;
    }
  }

  /** Maps a `runs` row to its public shape, decoding the JSON columns. */
  #toRun(row) {
    return {
      id: row.id,
      suite_id: row.suite_id,
      name: row.name,
      model: row.model,
      config: row.config ? JSON.parse(row.config) : null,
      started_at: row.started_at,
      finished_at: row.finished_at,
      status: row.status,
      summary: row.summary ? JSON.parse(row.summary) : null
    };
  }

  /** Maps a `cases` row to its public shape, decoding the JSON columns. */
  #toCase(row) {
    return {
      id: row.id,
      run_id: row.run_id,
      idx: row.idx,
      input: JSON.parse(row.input),
      output: row.output,
      expected: row.expected ? JSON.parse(row.expected) : null,
      latency_ms: row.latency_ms,
      tokens_in: row.tokens_in,
      tokens_out: row.tokens_out,
      error: row.error
    };
  }

  /**
   * @param {string|DatabaseSync} [pathOrDb] - An open `DatabaseSync`, or a
   *   database file path (default ".evals/store.db") whose parent directory
   *   is created when missing.
   */
  constructor(pathOrDb) {
    if (pathOrDb instanceof DatabaseSync) {
      this.#db = pathOrDb;
    } else {
      const dbPath = pathOrDb ?? ".evals/store.db";
      mkdirSync(dirname(dbPath), { recursive: true });
      this.#db = new DatabaseSync(dbPath);
    }
    this.#db.exec(ddl_sqlite_default);
    this.#migrateRunsTableToSuiteRequired();
    this.#migratePromptsTableIfNeeded();
    this.#db.exec(
      "CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)"
    );
  }

  /**
   * Rebuilds `prompts` with a `version` column when it is missing; every
   * existing prompt becomes version 1.
   */
  #migratePromptsTableIfNeeded() {
    const columns = this.#stmt("PRAGMA table_info(prompts)").all();
    if (columns.length === 0) return;
    if (columns.some((column) => column.name === "version")) return;
    this.#transaction(() => {
      this.#db.exec("ALTER TABLE prompts RENAME TO prompts_legacy");
      this.#db.exec(`
CREATE TABLE prompts (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
version INTEGER NOT NULL,
content TEXT NOT NULL,
created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000),
UNIQUE(name, version)
)
`);
      this.#db.exec(`
INSERT INTO prompts (id, name, version, content, created_at)
SELECT id, name, 1, content, created_at
FROM prompts_legacy
`);
      this.#db.exec("DROP TABLE prompts_legacy");
      this.#db.exec(
        "CREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at)"
      );
      this.#db.exec(
        "CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)"
      );
    });
  }

  /**
   * Rebuilds `runs` when `suite_id` is nullable or its foreign key lacks
   * ON DELETE CASCADE. Runs without an existing suite are removed together
   * with their cases and scores; every other run keeps its cases and scores.
   */
  #migrateRunsTableToSuiteRequired() {
    const runColumns = this.#stmt("PRAGMA table_info(runs)").all();
    if (runColumns.length === 0) return;
    const suiteColumn = runColumns.find((column) => column.name === "suite_id");
    const hasNonNullSuite = suiteColumn?.notnull === 1;
    const runForeignKeys = this.#stmt(
      "PRAGMA foreign_key_list(runs)"
    ).all();
    const suiteForeignKey = runForeignKeys.find(
      (fk) => fk.from === "suite_id" && fk.table === "suites"
    );
    const hasCascadeDelete = suiteForeignKey?.on_delete === "CASCADE";
    if (hasNonNullSuite && hasCascadeDelete) return;
    this.#statements.clear();
    // Step 1 (foreign keys still enforced): drop the runs that cannot be
    // migrated so the cascade also removes their cases and scores. This
    // covers both a NULL suite_id and a suite_id without a matching suite.
    this.#db.exec(
      "DELETE FROM runs WHERE NOT EXISTS (SELECT 1 FROM suites WHERE suites.id = runs.suite_id)"
    );
    // Step 2: rebuild the table with foreign keys disabled. With enforcement
    // on, DROP TABLE performs an implicit DELETE that fires ON DELETE CASCADE
    // and would wipe every case and score. The pragma is a no-op inside a
    // transaction, so it has to be toggled around it.
    this.#db.exec("PRAGMA foreign_keys = OFF");
    try {
      this.#transaction(() => {
        this.#db.exec(`
CREATE TABLE runs_next (
id TEXT PRIMARY KEY,
suite_id TEXT NOT NULL,
name TEXT NOT NULL,
model TEXT NOT NULL,
config TEXT,
started_at INTEGER NOT NULL,
finished_at INTEGER,
status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),
summary TEXT,
FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE
)
`);
        this.#db.exec(`
INSERT INTO runs_next (id, suite_id, name, model, config, started_at, finished_at, status, summary)
SELECT r.id, r.suite_id, r.name, r.model, r.config, r.started_at, r.finished_at, r.status, r.summary
FROM runs r
JOIN suites s ON s.id = r.suite_id
`);
        this.#db.exec("DROP TABLE runs");
        this.#db.exec("ALTER TABLE runs_next RENAME TO runs");
        this.#db.exec(
          "CREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id)"
        );
        this.#db.exec(
          "CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at)"
        );
      });
    } finally {
      // The bundled schema enables foreign keys; restore that state.
      this.#db.exec("PRAGMA foreign_keys = ON");
    }
    this.#statements.clear();
  }

  /**
   * Inserts a new suite.
   *
   * @param {string} name
   * @returns {{ id: string, name: string, created_at: number }} The stored suite.
   */
  createSuite(name) {
    const id = crypto.randomUUID();
    const now = Date.now();
    this.#stmt(
      "INSERT INTO suites (id, name, created_at) VALUES (?, ?, ?)"
    ).run(id, name, now);
    return { id, name, created_at: now };
  }

  /**
   * Inserts a new run, started now. A truthy `config` is stored as JSON.
   *
   * @param {{ suite_id: string, name: string, model: string, config?: * }} run
   * @returns {string} The generated run id.
   */
  createRun(run) {
    const id = crypto.randomUUID();
    const now = Date.now();
    this.#stmt(
      "INSERT INTO runs (id, suite_id, name, model, config, started_at) VALUES (?, ?, ?, ?, ?, ?)"
    ).run(
      id,
      run.suite_id,
      run.name,
      run.model,
      run.config ? JSON.stringify(run.config) : null,
      now
    );
    return id;
  }

  /**
   * Records the finish time, final status and optional summary of a run.
   *
   * @param {string} runId
   * @param {string} status - Must satisfy the table's CHECK constraint.
   * @param {*} [summary] - Stored as JSON when truthy.
   */
  finishRun(runId, status, summary) {
    this.#stmt(
      "UPDATE runs SET finished_at = ?, status = ?, summary = ? WHERE id = ?"
    ).run(Date.now(), status, summary ? JSON.stringify(summary) : null, runId);
  }

  /**
   * Inserts all given cases in a single transaction.
   *
   * @param {Iterable<object>} cases - Case records; `input` and a non-nullish
   *   `expected` are stored as JSON.
   */
  saveCases(cases) {
    this.#transaction(() => {
      const stmt = this.#stmt(
        "INSERT INTO cases (id, run_id, idx, input, output, expected, latency_ms, tokens_in, tokens_out, error) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
      );
      for (const c of cases) {
        stmt.run(
          c.id,
          c.run_id,
          c.idx,
          JSON.stringify(c.input),
          c.output,
          c.expected != null ? JSON.stringify(c.expected) : null,
          c.latency_ms,
          c.tokens_in,
          c.tokens_out,
          c.error ?? null
        );
      }
    });
  }

  /**
   * Inserts all given scores in a single transaction.
   *
   * @param {Iterable<object>} scores - Score records.
   */
  saveScores(scores) {
    this.#transaction(() => {
      const stmt = this.#stmt(
        "INSERT INTO scores (id, case_id, scorer_name, score, reason) VALUES (?, ?, ?, ?, ?)"
      );
      for (const s of scores) {
        stmt.run(s.id, s.case_id, s.scorer_name, s.score, s.reason ?? null);
      }
    });
  }

  /**
   * @param {string} runId
   * @returns {object|undefined} The run, or undefined when it does not exist.
   */
  getRun(runId) {
    const row = this.#stmt("SELECT * FROM runs WHERE id = ?").get(runId);
    if (!row) return void 0;
    return this.#toRun(row);
  }

  /**
   * Lists runs ordered by start time, optionally restricted to one suite.
   *
   * @param {string} [suiteId]
   * @returns {object[]}
   */
  listRuns(suiteId) {
    const sql = suiteId ? "SELECT * FROM runs WHERE suite_id = ? ORDER BY started_at" : "SELECT * FROM runs ORDER BY started_at";
    const rows = suiteId ? this.#stmt(sql).all(suiteId) : this.#stmt(sql).all();
    return rows.map((row) => this.#toRun(row));
  }

  /**
   * @param {string} runId
   * @returns {object[]} The cases of a run ordered by `idx`.
   */
  getCases(runId) {
    const rows = this.#stmt(
      "SELECT * FROM cases WHERE run_id = ? ORDER BY idx"
    ).all(runId);
    return rows.map((row) => this.#toCase(row));
  }

  /**
   * Returns the cases of a run that have at least one score below
   * `threshold`; each carries only its below-threshold scores.
   *
   * @param {string} runId
   * @param {number} [threshold=0.5]
   * @returns {object[]} Cases ordered by `idx`, each with a `scores` array.
   */
  getFailingCases(runId, threshold = 0.5) {
    const rows = this.#stmt(
      `SELECT c.*, s.scorer_name, s.score, s.reason as score_reason
FROM cases c
JOIN scores s ON s.case_id = c.id
WHERE c.run_id = ? AND s.score < ?
ORDER BY c.idx`
    ).all(runId, threshold);
    const caseMap = /* @__PURE__ */ new Map();
    for (const row of rows) {
      let c = caseMap.get(row.id);
      if (!c) {
        c = { ...this.#toCase(row), scores: [] };
        caseMap.set(row.id, c);
      }
      c.scores.push({
        scorer_name: row.scorer_name,
        score: row.score,
        reason: row.score_reason
      });
    }
    return Array.from(caseMap.values());
  }

  /**
   * Aggregates a run: case count, latency and token totals, mean score per
   * scorer, and pass/fail counts. A case passes when its lowest score is at
   * least `threshold`; cases without any score are counted in neither bucket.
   *
   * @param {string} runId
   * @param {number} [threshold=0.5]
   * @returns {object}
   */
  getRunSummary(runId, threshold = 0.5) {
    const totals = this.#stmt(
      `SELECT
COUNT(DISTINCT c.id) as totalCases,
COALESCE(SUM(c.latency_ms), 0) as totalLatencyMs,
COALESCE(SUM(c.tokens_in), 0) as totalTokensIn,
COALESCE(SUM(c.tokens_out), 0) as totalTokensOut
FROM cases c WHERE c.run_id = ?`
    ).get(runId);
    const scorerMeans = this.#stmt(
      `SELECT s.scorer_name, AVG(s.score) as meanScore
FROM scores s
JOIN cases c ON c.id = s.case_id
WHERE c.run_id = ?
GROUP BY s.scorer_name`
    ).all(runId);
    const meanScores = {};
    for (const row of scorerMeans) {
      meanScores[row.scorer_name] = row.meanScore;
    }
    const passFail = this.#stmt(
      `SELECT c.id,
MIN(s.score) as minScore
FROM cases c
JOIN scores s ON s.case_id = c.id
WHERE c.run_id = ?
GROUP BY c.id`
    ).all(runId);
    let passCount = 0;
    let failCount = 0;
    for (const row of passFail) {
      if (row.minScore >= threshold) passCount++;
      else failCount++;
    }
    return {
      totalCases: totals.totalCases,
      passCount,
      failCount,
      meanScores,
      totalLatencyMs: totals.totalLatencyMs,
      totalTokensIn: totals.totalTokensIn,
      totalTokensOut: totals.totalTokensOut
    };
  }

  /** @returns {object[]} All suites, newest first. */
  listSuites() {
    const rows = this.#stmt(
      "SELECT * FROM suites ORDER BY created_at DESC"
    ).all();
    return rows.map((row) => ({
      id: row.id,
      name: row.name,
      created_at: row.created_at
    }));
  }

  /**
   * Stores a new version of the prompt called `name`; the version number is
   * one more than the highest stored version for that name (starting at 1).
   *
   * @param {string} name
   * @param {string} content
   * @returns {{ id: string, name: string, version: number, content: string, created_at: number }}
   */
  createPrompt(name, content) {
    const id = crypto.randomUUID();
    const now = Date.now();
    const latest = this.#stmt(
      "SELECT MAX(version) as latestVersion FROM prompts WHERE name = ?"
    ).get(name);
    const version = (latest?.latestVersion ?? 0) + 1;
    this.#stmt(
      "INSERT INTO prompts (id, name, version, content, created_at) VALUES (?, ?, ?, ?, ?)"
    ).run(id, name, version, content, now);
    return { id, name, version, content, created_at: now };
  }

  /** @returns {object[]} All prompt versions, by name (case-insensitive) then newest version first. */
  listPrompts() {
    const rows = this.#stmt(
      "SELECT * FROM prompts ORDER BY name COLLATE NOCASE ASC, version DESC"
    ).all();
    return rows.map((row) => ({
      id: row.id,
      name: row.name,
      version: row.version,
      content: row.content,
      created_at: row.created_at
    }));
  }

  /**
   * @param {string} id
   * @returns {object|undefined} The prompt version, or undefined when it does not exist.
   */
  getPrompt(id) {
    const row = this.#stmt("SELECT * FROM prompts WHERE id = ?").get(id);
    if (!row) return void 0;
    return {
      id: row.id,
      name: row.name,
      version: row.version,
      content: row.content,
      created_at: row.created_at
    };
  }

  /**
   * Deletes one prompt version by id.
   *
   * @param {string} id
   */
  deletePrompt(id) {
    this.#stmt("DELETE FROM prompts WHERE id = ?").run(id);
  }
};
|
|
772
|
+
|
|
773
|
+
// packages/evals/src/engine/index.ts
|
|
774
|
+
import { EventEmitter } from "node:events";
|
|
775
|
+
/**
 * EventEmitter whose `on` and `emit` take exactly one payload argument.
 * Any extra arguments passed to `emit` are not forwarded to listeners.
 */
var EvalEmitter = class extends EventEmitter {
  /**
   * Register `listener` for `event`.
   * @returns {this} The emitter, for chaining.
   */
  on(event, listener) {
    const emitter = super.on(event, listener);
    return emitter;
  }

  /**
   * Emit `event` with a single `data` payload.
   * @returns {boolean} Whether the event had listeners.
   */
  emit(event, data) {
    const hadListeners = super.emit(event, data);
    return hadListeners;
  }
};
|
|
783
|
+
/**
 * Turn any thrown value into a short, human-readable message.
 *
 * @param {unknown} err - The thrown value.
 * @returns {string} `"Name: message"` for Error instances, the string itself
 *   for strings, `"Unknown error"` for null/undefined, otherwise a JSON or
 *   `String()` rendering of the value.
 */
function errorMessage(err) {
  if (err instanceof Error) {
    return `${err.name}: ${err.message}`;
  }
  if (typeof err === "string") return err;
  if (err == null) return "Unknown error";
  try {
    // JSON.stringify returns undefined (it does not throw) for functions and
    // symbols; fall back to String() so the result is always a string.
    return JSON.stringify(err) ?? String(err);
  } catch {
    // Circular structures and BigInt values make JSON.stringify throw.
    return String(err);
  }
}
|
|
795
|
+
/**
 * Serialize any thrown value to a JSON string suitable for storage.
 *
 * Error instances become `{name, message, stack, cause}`; an Error cause is
 * reduced to `{name, message}`. Strings and null/undefined are wrapped as
 * `{message}`. This function never throws on unserializable input and always
 * returns a string.
 *
 * @param {unknown} err - The thrown value.
 * @returns {string} JSON text describing the error.
 */
function serializeError(err) {
  if (err instanceof Error) {
    const { name, message, stack, cause } = err;
    const base = { name, message, stack };
    if (cause instanceof Error) {
      return JSON.stringify({
        ...base,
        cause: { name: cause.name, message: cause.message }
      });
    }
    try {
      return JSON.stringify({ ...base, cause });
    } catch {
      // A non-Error cause may be circular or contain BigInt; keep the error
      // itself and degrade only the cause to its string form.
      return JSON.stringify({ ...base, cause: String(cause) });
    }
  }
  if (typeof err === "string") return JSON.stringify({ message: err });
  if (err == null) return JSON.stringify({ message: "Unknown error" });
  try {
    // JSON.stringify returns undefined for functions and symbols.
    return JSON.stringify(err) ?? JSON.stringify({ message: String(err) });
  } catch {
    return JSON.stringify({ message: String(err) });
  }
}
|
|
815
|
+
/**
 * Build a zero score for every scorer after a task failure.
 *
 * @param {Iterable<string>} scorerNames - Names of the configured scorers.
 * @param {unknown} error - The value the task failed with.
 * @returns {Object<string, {score: number, reason: string}>} One entry per
 *   scorer, each with score 0 and the same "Task failed: ..." reason.
 */
function failureScores(scorerNames, error) {
  const reason = `Task failed: ${errorMessage(error)}`;
  const zeroed = {};
  for (const name of scorerNames) {
    zeroed[name] = { score: 0, reason };
  }
  return zeroed;
}
|
|
823
|
+
/**
 * Create a counting semaphore that limits concurrent holders.
 *
 * @param {number} maxConcurrency - Maximum number of simultaneous holders.
 *   Values that are not greater than zero (0, negatives, NaN) are treated
 *   as 1; otherwise no `acquire()` could ever resolve.
 * @returns {{acquire: function(): Promise<void>, release: function(): void}}
 *   `acquire()` resolves once a slot is held; `release()` frees the slot or
 *   hands it to the longest-waiting acquirer (FIFO).
 */
function createSemaphore(maxConcurrency) {
  const limit = maxConcurrency > 0 ? maxConcurrency : 1;
  let active = 0;
  const waiters = [];
  return {
    async acquire() {
      if (active < limit) {
        active++;
        return;
      }
      return new Promise((resolve) => waiters.push(resolve));
    },
    release() {
      const next = waiters.shift();
      if (next) {
        // The slot passes directly to the waiter, so `active` is unchanged.
        next();
        return;
      }
      active--;
    }
  };
}
|
|
844
|
+
/**
 * Run `task(input)` under a timeout, measuring latency and capturing failures.
 *
 * This function never rejects: any error, including the timeout, is returned
 * in the `error` field. The task itself is not cancelled on timeout.
 *
 * @param {function(*): *} task - Called with `input`; its result is expected
 *   to carry `output` and optionally `usage.inputTokens` / `usage.outputTokens`.
 * @param {*} input - Value handed to the task.
 * @param {number} timeoutMs - Milliseconds before the run is treated as failed.
 * @returns {Promise<{output: *, latencyMs: number, tokensIn: number,
 *   tokensOut: number, error?: *}>}
 */
async function wrapTask(task, input, timeoutMs) {
  const startedAt = performance.now();
  const elapsedMs = () => Math.round(performance.now() - startedAt);
  let timerId;
  try {
    const deadline = new Promise((_, reject) => {
      timerId = setTimeout(
        () => reject(new Error("timeout exceeded")),
        timeoutMs
      );
    });
    const result = await Promise.race([task(input), deadline]);
    const latencyMs = elapsedMs();
    const output = result.output;
    const usage = result.usage;
    return {
      output,
      latencyMs,
      tokensIn: usage?.inputTokens ?? 0,
      tokensOut: usage?.outputTokens ?? 0
    };
  } catch (err) {
    return {
      output: "",
      latencyMs: elapsedMs(),
      tokensIn: 0,
      tokensOut: 0,
      error: err
    };
  } finally {
    // Runs on both paths so the pending timer never keeps the process alive.
    clearTimeout(timerId);
  }
}
|
|
877
|
+
/**
 * Normalize a scorer result into the inclusive range 0..1.
 *
 * @param {number} score - Raw score returned by the scorer.
 * @param {string} scorerName - Scorer name, used only in warning messages.
 * @returns {number} The score unchanged when already in 0..1, clamped to the
 *   nearest bound when out of range, and 0 when it is NaN or not a number.
 */
function clampScore(score, scorerName) {
  // NaN fails both range comparisons below, so it would otherwise pass
  // through and poison every mean computed from it.
  if (typeof score !== "number" || Number.isNaN(score)) {
    console.warn(
      `Scorer "${scorerName}" returned non-numeric score ${String(score)}, treating as 0`
    );
    return 0;
  }
  if (score < 0 || score > 1) {
    console.warn(
      `Scorer "${scorerName}" returned out-of-range score ${score}, clamping to 0..1`
    );
    return Math.max(0, Math.min(1, score));
  }
  return score;
}
|
|
886
|
+
/**
 * Run an evaluation: execute `task` on every dataset item, score the outputs,
 * persist cases and scores to `store`, emit progress events, and return the
 * run summary.
 *
 * Events emitted: `run:start`, `case:start`, `case:error` (only when the task
 * failed), `case:scored`, `run:end`.
 *
 * @param {object} config
 * @param {string} config.name - Run name; also names the suite when `suiteId` is not given.
 * @param {*} config.model - Stored with the run and included in `run:start`.
 * @param {AsyncIterable|Iterable} config.dataset - Items to evaluate; each item's
 *   `expected` property is passed to scorers.
 * @param {function} config.task - Called with each item (see `wrapTask`).
 * @param {Object<string, function>} config.scorers - Scorers by name, each called with
 *   `{input, output, expected}` and returning `{score, reason}`.
 * @param {object} config.store - Persistence layer (`createSuite`, `createRun`,
 *   `saveCases`, `saveScores`, `finishRun`).
 * @param {*} [config.suiteId] - Existing suite id; a suite is created when omitted.
 * @param {number} [config.maxConcurrency=10] - Maximum cases in flight at once.
 * @param {number} [config.batchSize] - When set, items are processed in sequential batches of this size.
 * @param {number} [config.timeout=30000] - Per-task timeout in milliseconds.
 * @param {number} [config.trials=1] - When > 1, each item runs this many times and
 *   scores, latency and token counts are averaged.
 * @param {number} [config.threshold=0.5] - Pass threshold used for the summary.
 * @param {EvalEmitter} [config.emitter] - Emitter to use; a new one is created when omitted.
 * @param {*} [config.config] - Opaque value stored with the run.
 * @returns {Promise<object>} The summary produced by `computeSummary`.
 * @throws Rethrows any error from loading the dataset, scoring or persistence,
 *   after marking the run as "failed".
 */
async function runEval(config) {
  const {
    name,
    model,
    dataset: ds,
    task,
    scorers,
    store,
    suiteId,
    maxConcurrency = 10,
    batchSize,
    timeout = 3e4,
    trials = 1,
    threshold = 0.5
  } = config;
  const emitter = config.emitter ?? new EvalEmitter();
  const resolvedSuiteId = suiteId ?? store.createSuite(name).id;
  const runId = store.createRun({
    suite_id: resolvedSuiteId,
    name,
    model,
    config: config.config
  });
  const semaphore = createSemaphore(maxConcurrency);
  const scorerNames = Object.keys(scorers);
  const allCaseScores = [];

  // Run the task once and score its output (all-zero scores on task failure).
  const runTrial = async (input) => {
    const result = await wrapTask(task, input, timeout);
    if (result.error) {
      return { result, scores: failureScores(scorerNames, result.error) };
    }
    const scores = {};
    for (const [sName, scorer] of Object.entries(scorers)) {
      const sr = await scorer({
        input,
        output: result.output,
        expected: input.expected
      });
      scores[sName] = { score: clampScore(sr.score, sName), reason: sr.reason };
    }
    return { result, scores };
  };

  // Collapse several trials into one result: averaged metrics and scores, the
  // output of the last successful trial, and the reasons of the last trial.
  const aggregateTrials = (trialResults) => {
    const lastTrial = trialResults[trialResults.length - 1];
    const lastSuccessful = [...trialResults].reverse().find((t) => !t.result.error);
    const baseResult = lastSuccessful?.result ?? lastTrial.result;
    const mean = (pick) => trialResults.reduce((sum, t) => sum + pick(t), 0) / trials;
    const result = {
      output: baseResult.output,
      latencyMs: Math.round(mean((t) => t.result.latencyMs)),
      tokensIn: Math.round(mean((t) => t.result.tokensIn)),
      tokensOut: Math.round(mean((t) => t.result.tokensOut)),
      // The case only counts as errored when every trial failed.
      error: lastSuccessful ? void 0 : baseResult.error
    };
    const scores = {};
    for (const sName of scorerNames) {
      scores[sName] = {
        score: mean((t) => t.scores[sName].score),
        reason: lastTrial.scores[sName]?.reason
      };
    }
    return { result, scores };
  };

  // Evaluate one dataset item end to end while holding a semaphore slot.
  const processItem = async ({ index, input }) => {
    await semaphore.acquire();
    try {
      emitter.emit("case:start", { runId, index, input });
      let outcome;
      if (trials > 1) {
        const trialResults = [];
        for (let t = 0; t < trials; t++) {
          trialResults.push(await runTrial(input));
        }
        outcome = aggregateTrials(trialResults);
      } else {
        outcome = await runTrial(input);
      }
      const { result: finalResult, scores: finalScores } = outcome;

      const caseId = crypto.randomUUID();
      store.saveCases([
        {
          id: caseId,
          run_id: runId,
          idx: index,
          input,
          output: finalResult.output || null,
          expected: input.expected,
          latency_ms: finalResult.latencyMs,
          tokens_in: finalResult.tokensIn,
          tokens_out: finalResult.tokensOut,
          error: finalResult.error ? serializeError(finalResult.error) : void 0
        }
      ]);
      store.saveScores(
        scorerNames.map((sName) => ({
          id: crypto.randomUUID(),
          case_id: caseId,
          scorer_name: sName,
          score: finalScores[sName].score,
          reason: finalScores[sName].reason
        }))
      );
      allCaseScores.push({
        index,
        scores: Object.fromEntries(
          scorerNames.map((sName) => [sName, finalScores[sName].score])
        ),
        latencyMs: finalResult.latencyMs,
        tokensIn: finalResult.tokensIn,
        tokensOut: finalResult.tokensOut
      });

      if (finalResult.error) {
        emitter.emit("case:error", {
          runId,
          index,
          error: errorMessage(finalResult.error)
        });
      }
      emitter.emit("case:scored", {
        runId,
        index,
        input,
        output: finalResult.output,
        expected: input.expected,
        scores: finalScores,
        error: finalResult.error,
        latencyMs: finalResult.latencyMs,
        tokensIn: finalResult.tokensIn,
        tokensOut: finalResult.tokensOut
      });
    } finally {
      semaphore.release();
    }
  };

  try {
    // Loading the dataset is inside the try so that a failing iterator also
    // marks the already-created run as failed.
    const items = [];
    let idx = 0;
    for await (const item of ds) {
      items.push({ index: idx++, input: item });
    }
    emitter.emit("run:start", { runId, totalCases: items.length, name, model });

    const batches = batchSize
      ? Array.from(
          { length: Math.ceil(items.length / batchSize) },
          (_, i) => items.slice(i * batchSize, (i + 1) * batchSize)
        )
      : [items];
    for (const batch of batches) {
      await Promise.all(batch.map(processItem));
    }
  } catch (err) {
    store.finishRun(runId, "failed");
    throw err;
  }

  const summary = computeSummary(allCaseScores, scorerNames, threshold);
  store.finishRun(runId, "completed", summary);
  emitter.emit("run:end", { runId, summary });
  return summary;
}
|
|
1062
|
+
/**
 * Reduce per-case results into run-level totals.
 *
 * A case passes when every named scorer's score is >= `threshold`; a missing
 * score counts as 0. Mean scores are 0 when there are no cases.
 *
 * @param {Array<{scores: Object<string, number>, latencyMs: number,
 *   tokensIn: number, tokensOut: number}>} cases - Per-case results.
 * @param {string[]} scorerNames - Scorers to aggregate.
 * @param {number} threshold - Minimum score for a scorer to pass.
 * @returns {{totalCases: number, passCount: number, failCount: number,
 *   meanScores: Object<string, number>, totalLatencyMs: number,
 *   totalTokensIn: number, totalTokensOut: number}}
 */
function computeSummary(cases, scorerNames, threshold) {
  const sums = {};
  for (const scorer of scorerNames) {
    sums[scorer] = 0;
  }

  let totalLatencyMs = 0;
  let totalTokensIn = 0;
  let totalTokensOut = 0;
  let passCount = 0;
  for (const entry of cases) {
    totalLatencyMs += entry.latencyMs;
    totalTokensIn += entry.tokensIn;
    totalTokensOut += entry.tokensOut;

    let everyScorerPassed = true;
    for (const scorer of scorerNames) {
      const value = entry.scores[scorer] ?? 0;
      sums[scorer] += value;
      if (value < threshold) everyScorerPassed = false;
    }
    if (everyScorerPassed) passCount += 1;
  }

  const totalCases = cases.length;
  const failCount = totalCases - passCount;
  const meanScores = {};
  for (const scorer of scorerNames) {
    meanScores[scorer] = totalCases > 0 ? sums[scorer] / totalCases : 0;
  }

  return {
    totalCases,
    passCount,
    failCount,
    meanScores,
    totalLatencyMs,
    totalTokensIn,
    totalTokensOut
  };
}
|
|
1100
|
+
|
|
1101
|
+
// packages/evals/src/comparison/index.ts
|
|
1102
|
+
/**
 * Classify a score delta relative to a tolerance band.
 *
 * @param {number} delta - Candidate score minus baseline score.
 * @param {number} tolerance - Absolute change treated as noise (inclusive).
 * @returns {"unchanged"|"improved"|"regressed"}
 */
function categorize(delta, tolerance) {
  const withinTolerance = Math.abs(delta) <= tolerance;
  if (withinTolerance) {
    return "unchanged";
  }
  if (delta > 0) {
    return "improved";
  }
  return "regressed";
}
|
|
1106
|
+
/**
 * Index cases by their `idx`, flattening each case's score rows into a
 * `{scorerName: score}` object.
 *
 * @param {Iterable<{idx: number, scores: Iterable<{scorer_name: string,
 *   score: number}>}>} cases - Cases with their score rows.
 * @returns {Map<number, Object<string, number>>} Scores keyed by case index;
 *   a later case with the same `idx` replaces an earlier one.
 */
function buildScoreMap(cases) {
  const byIndex = /* @__PURE__ */ new Map();
  for (const entry of cases) {
    const flattened = {};
    for (const { scorer_name, score } of entry.scores) {
      flattened[scorer_name] = score;
    }
    byIndex.set(entry.idx, flattened);
  }
  return byIndex;
}
|
|
1117
|
+
/**
 * Fetch every case of a run together with its score rows.
 *
 * Scores are obtained by calling `store.getFailingCases` with an `Infinity`
 * threshold; any case that call does not return gets an empty `scores` array.
 *
 * @param {{getCases: function, getFailingCases: function}} store
 * @param {string} runId - Run to load.
 * @returns {Array<object>} Cases in `store.getCases` order, each with `scores`.
 */
function getAllCasesWithScores(store, runId) {
  const allCases = store.getCases(runId);
  const scoredCases = store.getFailingCases(runId, Infinity);
  const scoredById = new Map(scoredCases.map((scored) => [scored.id, scored]));
  return allCases.map((plain) => {
    const scored = scoredById.get(plain.id);
    return scored ?? { ...plain, scores: [] };
  });
}
|
|
1123
|
+
/**
 * Compare a candidate run against a baseline run, case by case.
 *
 * Only case indices present in both runs are compared; a warning is logged
 * when the runs have different case counts. A scorer missing on one side of
 * a compared case is treated as score 0.
 *
 * @param {object} store - Provides `getCases`, `getFailingCases`, `getRunSummary`.
 * @param {string} baselineRunId - Reference run.
 * @param {string} candidateRunId - Run being evaluated against the baseline.
 * @param {{tolerance?: number, regressionThreshold?: number}} [options]
 *   `tolerance` (default 0.01) is the per-case delta regarded as unchanged;
 *   `regressionThreshold` (default 0.05) is the mean drop that flags a scorer
 *   as regressed.
 * @returns {{caseDiffs: Array<object>, scorerSummaries: object, costDelta: object,
 *   totalCasesCompared: number, regression: {regressed: boolean, details: object}}}
 */
function compareRuns(store, baselineRunId, candidateRunId, options) {
  const tolerance = options?.tolerance ?? 0.01;
  const regressionThreshold = options?.regressionThreshold ?? 0.05;

  const [baselineCases, candidateCases] = [baselineRunId, candidateRunId].map(
    (id) => getAllCasesWithScores(store, id)
  );
  if (baselineCases.length !== candidateCases.length) {
    console.warn(
      `Run case count mismatch: baseline=${baselineCases.length}, candidate=${candidateCases.length}. Comparing intersection only.`
    );
  }
  const baselineMap = buildScoreMap(baselineCases);
  const candidateMap = buildScoreMap(candidateCases);

  // Union of scorer names, baseline first so its ordering takes precedence.
  const allScorerNames = /* @__PURE__ */ new Set();
  for (const scoreMap of [baselineMap, candidateMap]) {
    for (const scores of scoreMap.values()) {
      for (const scorer of Object.keys(scores)) allScorerNames.add(scorer);
    }
  }

  const commonIndices = [];
  for (const idx of baselineMap.keys()) {
    if (candidateMap.has(idx)) commonIndices.push(idx);
  }
  commonIndices.sort((a, b) => a - b);

  // Running per-scorer totals, accumulated while the case diffs are built.
  const tallies = /* @__PURE__ */ new Map();
  for (const scorer of allScorerNames) {
    tallies.set(scorer, { sum: 0, samples: 0, improved: 0, regressed: 0, unchanged: 0 });
  }

  const caseDiffs = commonIndices.map((idx) => {
    const baseScores = baselineMap.get(idx);
    const candScores = candidateMap.get(idx);
    const perScorer = {};
    for (const scorer of allScorerNames) {
      const baseline = baseScores[scorer] ?? 0;
      const candidate = candScores[scorer] ?? 0;
      const delta = candidate - baseline;
      const change = categorize(delta, tolerance);
      perScorer[scorer] = { baseline, candidate, delta, change };
      const tally = tallies.get(scorer);
      tally.sum += delta;
      tally.samples += 1;
      tally[change] += 1;
    }
    return { index: idx, scorerDeltas: perScorer };
  });

  const scorerSummaries = {};
  for (const scorer of allScorerNames) {
    const { sum, samples, improved, regressed, unchanged } = tallies.get(scorer);
    scorerSummaries[scorer] = {
      meanDelta: samples > 0 ? sum / samples : 0,
      improvedCount: improved,
      regressedCount: regressed,
      unchangedCount: unchanged
    };
  }

  const baselineSummary = store.getRunSummary(baselineRunId);
  const candidateSummary = store.getRunSummary(candidateRunId);
  const costDelta = {
    latencyDeltaMs: candidateSummary.totalLatencyMs - baselineSummary.totalLatencyMs,
    tokenInDelta: candidateSummary.totalTokensIn - baselineSummary.totalTokensIn,
    tokenOutDelta: candidateSummary.totalTokensOut - baselineSummary.totalTokensOut
  };

  const regressionDetails = {};
  let anyRegressed = false;
  for (const [scorer, { meanDelta }] of Object.entries(scorerSummaries)) {
    const exceeds = meanDelta < -regressionThreshold;
    regressionDetails[scorer] = { meanDelta, exceeds };
    anyRegressed = anyRegressed || exceeds;
  }

  return {
    caseDiffs,
    scorerSummaries,
    costDelta,
    totalCasesCompared: commonIndices.length,
    regression: { regressed: anyRegressed, details: regressionDetails }
  };
}
|
|
1203
|
+
|
|
1204
|
+
// packages/evals/src/reporters/console.ts
|
|
1205
|
+
import chalk from "chalk";
|
|
1206
|
+
|
|
1207
|
+
// packages/evals/src/reporters/format.ts
|
|
1208
|
+
/**
 * Format a duration for display.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} `"<ms>ms"` below one second, otherwise seconds with one
 *   decimal place (for example `"1.2s"`).
 */
function formatDuration(ms) {
  return ms < 1e3 ? `${ms}ms` : `${(ms / 1e3).toFixed(1)}s`;
}
|
|
1212
|
+
/**
 * Format a token count compactly.
 *
 * @param {number} n - Token count.
 * @returns {string} The plain number below 1000, whole thousands with a `k`
 *   suffix below one million, otherwise millions with one decimal and `M`.
 */
function formatTokens(n) {
  // 999500 and above rounds to 1000 thousands, which would print as "1000k";
  // switch to the millions form at that point instead.
  if (n >= 999500) return `${(n / 1e6).toFixed(1)}M`;
  if (n >= 1e3) return `${(n / 1e3).toFixed(0)}k`;
  return String(n);
}
|
|
1217
|
+
/**
 * Build a report filename of the form `<slug>-<runId prefix>.<ext>`.
 *
 * @param {string} name - Run name; every character other than ASCII letters,
 *   digits, `-` and `_` becomes `-`, and the result is lower-cased.
 * @param {string} runId - Run id; only its first 8 characters are used.
 * @param {string} ext - File extension without the leading dot.
 * @returns {string}
 */
function generateFilename(name, runId, ext) {
  const sanitized = name.replace(/[^a-zA-Z0-9-_]/g, "-");
  return `${sanitized.toLowerCase()}-${runId.slice(0, 8)}.${ext}`;
}
|
|
1222
|
+
/**
 * Convert an arbitrary value to text without throwing on unserializable input.
 *
 * @param {unknown} value - Strings are returned as-is; everything else goes
 *   through `JSON.stringify`.
 * @param {{space?: number, fallback?: string}} [options] - `space` is the JSON
 *   indentation (default 0); `fallback` is used when `JSON.stringify` yields
 *   `undefined` (default `"null"`).
 * @returns {string} The text; `String(value)` when serialization throws.
 */
function stringifyUnknown(value, options) {
  if (typeof value === "string") return value;
  let json;
  try {
    json = JSON.stringify(value, null, options?.space ?? 0);
  } catch {
    return String(value);
  }
  return json ?? options?.fallback ?? "null";
}
|
|
1232
|
+
/**
 * Render a case input as compact single-line text.
 *
 * @param {unknown} value - The case input.
 * @returns {string} The string itself, or compact JSON; an empty string when
 *   the value has no JSON representation.
 */
function formatInputValue(value) {
  const compact = { space: 0, fallback: "" };
  return stringifyUnknown(value, compact);
}
|
|
1235
|
+
/**
 * Render an error value for display in reports.
 *
 * @param {unknown} value - Error, string, serialized error or any other value.
 * @returns {string} An empty string for null/undefined, the string itself for
 *   strings, `"Name: message"` for Error instances, otherwise 2-space JSON.
 */
function formatErrorValue(value) {
  if (value == null) return "";
  if (typeof value === "string") return value;
  // Error fields are not enumerable, so JSON.stringify(new Error("x")) is "{}".
  if (value instanceof Error) return `${value.name}: ${value.message}`;
  return stringifyUnknown(value, { space: 2, fallback: "" });
}
|
|
1240
|
+
/**
 * Render a value as a single CSV field.
 *
 * Fields containing a comma, a double quote, or a line break (CR or LF) are
 * wrapped in double quotes with embedded quotes doubled.
 *
 * @param {unknown} value - Strings are used as-is; other values are
 *   JSON-stringified via `stringifyUnknown` (fallback `"null"`).
 * @returns {string} The CSV-safe field.
 */
function escapeCsv(value) {
  const str = typeof value === "string"
    ? value
    : stringifyUnknown(value, { space: 0, fallback: "null" });
  // A bare "\r" is a line break to CSV readers too, so it must be quoted.
  if (/[",\r\n]/.test(str)) {
    return `"${str.replace(/"/g, '""')}"`;
  }
  return str;
}
|
|
1247
|
+
|
|
1248
|
+
// packages/evals/src/reporters/shared.ts
|
|
1249
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
1250
|
+
import { join } from "node:path";
|
|
1251
|
+
// Directory reports are written to when the caller does not specify one.
var DEFAULT_OUTPUT_DIR = ".evals/reports";

/**
 * Pick the directory reports should be written to.
 *
 * @param {string|null|undefined} outputDir - Caller-supplied directory.
 * @returns {string} `outputDir`, or the default when it is null/undefined.
 */
function resolveOutputDir(outputDir) {
  if (outputDir === null || outputDir === undefined) {
    return DEFAULT_OUTPUT_DIR;
  }
  return outputDir;
}
|
|
1255
|
+
/**
 * Compute the path of a run's report file inside `outputDir`.
 *
 * @param {string} outputDir - Directory holding the reports.
 * @param {string} name - Run name (slugified by `generateFilename`).
 * @param {string} runId - Run id.
 * @param {string} ext - File extension without the leading dot.
 * @returns {string} The joined path.
 */
function getReportPath(outputDir, name, runId, ext) {
  const filename = generateFilename(name, runId, ext);
  return join(outputDir, filename);
}
|
|
1258
|
+
/**
 * Write a run report to disk, creating the output directory when needed.
 *
 * @param {string} outputDir - Target directory (created recursively).
 * @param {string} name - Run name used for the filename.
 * @param {string} runId - Run id used for the filename.
 * @param {string} ext - File extension without the leading dot.
 * @param {string} content - Report body, written as UTF-8.
 * @returns {Promise<void>}
 */
async function writeRunReportFile(outputDir, name, runId, ext, content) {
  await mkdir(outputDir, { recursive: true });
  const target = getReportPath(outputDir, name, runId, ext);
  await writeFile(target, content, "utf-8");
}
|
|
1262
|
+
/**
 * Classify a case result.
 *
 * @param {{error?: *, scores: Object<string, {score: number}>}} result
 * @param {number} threshold - Minimum score every scorer must reach.
 * @returns {"error"|"pass"|"fail"} `"error"` when `result.error` is truthy,
 *   `"pass"` when every score is >= `threshold` (including when there are no
 *   scores), otherwise `"fail"`.
 */
function getCaseStatus(result, threshold) {
  if (result.error) return "error";
  for (const { score } of Object.values(result.scores)) {
    if (!(score >= threshold)) return "fail";
  }
  return "pass";
}
|
|
1269
|
+
/**
 * Build a reporter that writes one file when a run ends.
 *
 * @param {{outputDir?: string, ext: string, render: function(object): (string|Promise<string>)}} options
 *   `render` turns the run-end data into the file content; `ext` is the file
 *   extension; `outputDir` falls back to the default reports directory.
 * @returns {{onRunEnd: function(object): Promise<void>}}
 */
function createRunEndFileReporter(options) {
  const targetDir = resolveOutputDir(options.outputDir);
  return {
    async onRunEnd(data) {
      const rendered = await options.render(data);
      const { name, runId } = data;
      await writeRunReportFile(targetDir, name, runId, options.ext, rendered);
    }
  };
}
|
|
1284
|
+
|
|
1285
|
+
// packages/evals/src/reporters/console.ts
|
|
1286
|
+
/**
 * Reporter that prints progress and results to the terminal.
 *
 * @param {{verbosity?: "quiet"|"normal"|"verbose"}} [options] - `"quiet"`
 *   prints only the summary table; `"normal"` (default) adds a progress
 *   counter and the failing cases; `"verbose"` prints every case.
 * @returns {{onRunStart: function, onCaseEnd: function, onRunEnd: function}}
 */
function consoleReporter(options) {
  const verbosity = options?.verbosity ?? "normal";
  const isQuiet = verbosity === "quiet";
  let totalCases = 0;
  let completed = 0;

  // Print each case with its input/output, truncated to `maxStringLength`.
  const printCases = (cases, threshold, maxStringLength) => {
    for (const c of cases) {
      renderCaseDetail(c, threshold, { includeIO: true, maxStringLength });
    }
  };

  return {
    onRunStart(data) {
      totalCases = data.totalCases;
      completed = 0;
    },
    onCaseEnd() {
      completed += 1;
      if (isQuiet) return;
      process.stdout.write(
        `\r ${chalk.dim(`[${completed}/${totalCases}]`)}`
      );
    },
    onRunEnd(data) {
      if (!isQuiet) {
        // Overwrite the progress counter before printing the summary.
        process.stdout.write("\r" + " ".repeat(30) + "\r");
      }
      renderSummaryTable(data);
      if (isQuiet) return;

      const sorted = [...data.cases].sort((a, b) => a.index - b.index);
      if (verbosity === "verbose") {
        printCases(sorted, data.threshold, 2e4);
        return;
      }
      const failing = sorted.filter(
        (c) => getCaseStatus(c, data.threshold) !== "pass"
      );
      if (failing.length === 0) return;
      console.log(chalk.dim(` Failing cases (${failing.length}):`));
      console.log("");
      printCases(failing, data.threshold, 4e3);
    }
  };
}
|
|
1335
|
+
/**
 * Indent every line of `text` by `spaces` spaces.
 *
 * @param {string} text - Text to indent; CRLF line endings become LF.
 * @param {number} spaces - Number of spaces to prepend to each line.
 * @returns {string}
 */
function indentBlock(text, spaces) {
  const margin = " ".repeat(spaces);
  const lines = text.replace(/\r\n/g, "\n").split("\n");
  return lines.map((line) => `${margin}${line}`).join("\n");
}
|
|
1339
|
+
/**
 * Shorten `text` to at most `maxLength` characters, appending an ellipsis
 * character when anything was cut.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum number of characters kept.
 * @returns {string}
 */
function truncateString(text, maxLength) {
  return text.length <= maxLength ? text : text.slice(0, maxLength) + "\u2026";
}
|
|
1343
|
+
/**
 * Print the run summary (name, model, case counts, mean scores, duration,
 * tokens) to the console.
 *
 * @param {{name: *, model: *, summary: object}} data - Run-end payload.
 * @returns {void}
 */
function renderSummaryTable(data) {
  const { summary } = data;
  const scoreStr = Object.entries(summary.meanScores)
    .map(([name, score]) => `${name}: ${score.toFixed(3)}`)
    .join(", ");
  const rule = chalk.dim(" " + "\u2500".repeat(60));
  const totalTokens = summary.totalTokensIn + summary.totalTokensOut;

  const lines = [
    "",
    chalk.bold(" Summary"),
    rule,
    ` ${chalk.dim("Eval:")} ${data.name}`,
    ` ${chalk.dim("Model:")} ${data.model}`,
    ` ${chalk.dim("Cases:")} ${summary.totalCases}`,
    ` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`,
    ` ${chalk.dim("Scores:")} ${scoreStr}`,
    ` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`,
    ` ${chalk.dim("Tokens:")} ${formatTokens(totalTokens)}`,
    rule,
    ""
  ];
  for (const line of lines) {
    console.log(line);
  }
}
|
|
1365
|
+
/**
 * Print one case to the console: status, input, optional output/expected,
 * per-scorer scores with reasons, and the error if any.
 *
 * @param {object} c - Case result (`index`, `input`, `output`, `expected`,
 *   `scores`, optional `error`).
 * @param {number} threshold - Scores below this mark the case as FAIL.
 * @param {{includeIO?: boolean, maxStringLength?: number}} [options] -
 *   `includeIO` (default false) also prints output and expected;
 *   `maxStringLength` (default 4000) truncates those blocks.
 * @returns {void}
 */
function renderCaseDetail(c, threshold, options) {
  const scoreEntries = Object.entries(c.scores);
  const anyBelowThreshold = scoreEntries.some(([, s]) => s.score < threshold);
  const badge = anyBelowThreshold ? chalk.red("FAIL") : chalk.green("PASS");
  const includeIO = options?.includeIO ?? false;
  const maxStringLength = options?.maxStringLength ?? 4e3;
  // Truncate a text block and indent it under its label.
  const asBlock = (text) => indentBlock(truncateString(text, maxStringLength), 6);

  console.log(` ${badge} ${chalk.dim(`Case #${c.index}`)}`);
  const inputStr = stringifyUnknown(c.input, {
    space: 2,
    fallback: String(c.input)
  });
  console.log(` ${chalk.dim("Input:")} ${inputStr}`);

  if (includeIO) {
    console.log(` ${chalk.dim("Output:")}`);
    console.log(asBlock(c.output));
    console.log(` ${chalk.dim("Expected:")}`);
    const expectedStrRaw = stringifyUnknown(c.expected, {
      space: 2,
      fallback: String(c.expected)
    });
    console.log(asBlock(expectedStrRaw));
  }

  for (const [scorer, s] of scoreEntries) {
    const paint = s.score >= threshold ? chalk.green : chalk.red;
    const reasonStr = s.reason ? ` \u2014 ${s.reason}` : "";
    console.log(
      ` ${chalk.dim(scorer + ":")} ${paint(s.score.toFixed(3))}${reasonStr}`
    );
  }

  if (c.error) {
    console.log(` ${chalk.dim("Error:")}`);
    console.log(` ${chalk.red(formatErrorValue(c.error))}`);
  }
  console.log("");
}
|
|
1403
|
+
|
|
1404
|
+
// packages/evals/src/reporters/json.ts
|
|
1405
|
+
import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
|
|
1406
|
+
/**
 * Reporter that streams each finished case to a `.jsonl` file and writes the
 * full run-end payload to a `.json` file.
 *
 * @param {{outputDir?: string, pretty?: boolean}} [options] - `outputDir`
 *   falls back to the default reports directory; `pretty` (default true)
 *   indents the final JSON file with 2 spaces.
 * @returns {{onRunStart: function, onCaseEnd: function, onRunEnd: function}}
 */
function jsonReporter(options) {
  const outputDir = resolveOutputDir(options?.outputDir);
  const indent = (options?.pretty ?? true) ? 2 : 0;
  // Path of the JSONL stream; assigned when the run starts.
  let streamFilename = "";

  const onRunStart = async (data) => {
    await mkdir2(outputDir, { recursive: true });
    streamFilename = getReportPath(outputDir, data.name, data.runId, "jsonl");
  };

  const onCaseEnd = async (data) => {
    const record = stringifyUnknown(data, { space: 0, fallback: "null" });
    await appendFile(streamFilename, record + "\n", "utf-8");
  };

  const onRunEnd = async (data) => {
    const content = stringifyUnknown(data, { space: indent, fallback: "null" });
    await writeRunReportFile(outputDir, data.name, data.runId, "json", content);
  };

  return { onRunStart, onCaseEnd, onRunEnd };
}
|
|
1434
|
+
|
|
1435
|
+
// packages/evals/src/reporters/csv.ts
|
|
1436
|
+
/**
 * Reporter that writes one CSV file per run, with one row per case.
 *
 * Columns: index, input, output, expected, error, latency_ms, tokens_in,
 * tokens_out, then `<scorer>_score` and `<scorer>_reason` for every scorer in
 * `summary.meanScores`.
 *
 * @param {{outputDir?: string}} [options] - `outputDir` falls back to the
 *   default reports directory.
 * @returns {{onRunEnd: function(object): Promise<void>}}
 */
function csvReporter(options) {
  return createRunEndFileReporter({
    outputDir: options?.outputDir,
    ext: "csv",
    render(data) {
      const scorerNames = Object.keys(data.summary.meanScores);
      const headerParts = [
        "index",
        "input",
        "output",
        "expected",
        "error",
        "latency_ms",
        "tokens_in",
        "tokens_out"
      ];
      for (const name of scorerNames) {
        // Scorer names are caller-defined; escape them so a comma or quote in
        // a name cannot shift the columns.
        headerParts.push(escapeCsv(`${name}_score`), escapeCsv(`${name}_reason`));
      }
      const rows = [headerParts.join(",")];
      for (const c of data.cases) {
        const parts = [
          String(c.index),
          escapeCsv(c.input),
          escapeCsv(c.output),
          escapeCsv(c.expected),
          escapeCsv(c.error ?? ""),
          String(c.latencyMs),
          String(c.tokensIn),
          String(c.tokensOut)
        ];
        for (const name of scorerNames) {
          const s = c.scores[name];
          parts.push(String(s?.score ?? ""), escapeCsv(s?.reason ?? ""));
        }
        rows.push(parts.join(","));
      }
      return rows.join("\n") + "\n";
    }
  });
}
|
|
1477
|
+
|
|
1478
|
+
// packages/evals/src/reporters/markdown.ts
|
|
1479
|
+
/**
 * Reporter that writes one Markdown file per run: a header with totals, a
 * mean-score table, and a table with one row per case.
 *
 * @param {{outputDir?: string}} [options] - `outputDir` falls back to the
 *   default reports directory.
 * @returns {{onRunEnd: function(object): Promise<void>}}
 */
function markdownReporter(options) {
  // Line breaks and pipes would end or split a table cell.
  const escapeCell = (text) => text.replace(/\r?\n/g, "<br>").replace(/\|/g, "\\|");
  return createRunEndFileReporter({
    outputDir: options?.outputDir,
    ext: "md",
    render(data) {
      const { summary } = data;
      const scorerNames = Object.keys(summary.meanScores);
      const lines = [];
      lines.push(`# ${data.name}`);
      lines.push("");
      lines.push(`**Model:** ${data.model}`);
      lines.push(
        `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
      );
      lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
      lines.push(
        `**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
      );
      lines.push("");
      lines.push("## Scores");
      lines.push("");
      lines.push("| Scorer | Mean |");
      lines.push("|--------|------|");
      for (const [name, score] of Object.entries(summary.meanScores)) {
        lines.push(`| ${name} | ${score.toFixed(3)} |`);
      }
      lines.push("");
      lines.push("## Cases");
      lines.push("");
      const caseHeader = [
        "#",
        "Status",
        "Input",
        ...scorerNames,
        "Latency",
        "Error"
      ];
      lines.push(`| ${caseHeader.join(" | ")} |`);
      lines.push(`| ${caseHeader.map(() => "---").join(" | ")} |`);
      for (const c of data.cases) {
        const statusValue = getCaseStatus(c, data.threshold);
        const status = statusValue === "error" ? "\u{1F534} Error" : statusValue === "pass" ? "\u2705 Pass" : "\u274C Fail";
        // Truncate first so the 60-character limit applies to the raw input,
        // then escape it the same way as the error cell.
        const input = escapeCell(formatInputValue(c.input).slice(0, 60));
        const scores = scorerNames.map(
          (name) => c.scores[name]?.score.toFixed(3) ?? "-"
        );
        const error = c.error ? escapeCell(formatErrorValue(c.error)) : "-";
        const row = [
          String(c.index),
          status,
          input,
          ...scores,
          `${c.latencyMs}ms`,
          error
        ];
        lines.push(`| ${row.join(" | ")} |`);
      }
      lines.push("");
      return lines.join("\n");
    }
  });
}
|
|
1541
|
+
|
|
1542
|
+
// packages/evals/src/reporters/html.ts
|
|
1543
|
+
/**
 * Reporter that writes one HTML file per run, rendered by `renderHtml`.
 *
 * @param {{outputDir?: string}} [options] - `outputDir` falls back to the
 *   default reports directory.
 * @returns {{onRunEnd: function(object): Promise<void>}}
 */
function htmlReporter(options) {
  const reporterOptions = {
    outputDir: options?.outputDir,
    ext: "html",
    render: renderHtml
  };
  return createRunEndFileReporter(reporterOptions);
}
|
|
1550
|
+
/**
 * Escapes a value for safe inclusion in HTML text content or inside a
 * double-quoted attribute value.
 *
 * `&`, `<`, `>` and `"` are replaced by their named entities in a single pass,
 * so an ampersand introduced by one replacement is never escaped a second time.
 * Non-string input is coerced with `String()`; `null`/`undefined` become "".
 *
 * @param {*} str - Value to escape (normally a string).
 * @returns {string} The escaped text.
 */
function esc(str) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
  return String(str ?? "").replace(/[&<>"]/g, (ch) => entities[ch]);
}
|
|
1553
|
+
/**
 * Renders a finished eval run as a standalone HTML document.
 *
 * The page contains a header with run metadata, a "Mean Scores" table built
 * from `summary.meanScores`, and a "Cases" table with one row per case.
 * Scorer columns follow the key order of `summary.meanScores`.
 *
 * A scorer with no entry for a case is rendered as 0.000 and coloured against
 * the threshold like any other score. Input and output cells are truncated to
 * 120 characters before escaping. A missing or non-string `output` is coerced
 * to a string (empty for null/undefined) rather than throwing, so one
 * output-less case cannot abort the whole report.
 *
 * @param {object} data - Run-end payload; reads `name`, `model`, `threshold`,
 *   `summary` and `cases`.
 * @returns {string} The complete HTML document.
 */
function renderHtml(data) {
  const { summary } = data;
  const scorerNames = Object.keys(summary.meanScores);

  const caseRows = data.cases
    .map((c) => {
      const status = getCaseStatus(c, data.threshold);
      const statusLabel = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";

      const scoresCells = scorerNames
        .map((name) => {
          const s = c.scores[name];
          const score = s?.score ?? 0;
          const cls = score >= data.threshold ? "pass" : "fail";
          // The scorer's reason, when present, is exposed as a hover tooltip.
          const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
          return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
        })
        .join("");

      // Guard against cases without a usable string output.
      const outputText = String(c.output ?? "");

      return [
        `<tr class="${status}">`,
        `<td>${c.index}</td>`,
        `<td class="${status}">${statusLabel}</td>`,
        `<td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>`,
        `<td class="text">${esc(outputText.slice(0, 120))}</td>`,
        `${scoresCells}`,
        `<td>${c.latencyMs}ms</td>`,
        `<td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>`,
        `</tr>`,
      ].join("\n");
    })
    .join("\n");

  const scorerHeaders = scorerNames.map((n) => `<th>${esc(n)}</th>`).join("");
  const meanScoreRows = Object.entries(summary.meanScores)
    .map(([name, score]) => `<tr><td>${esc(name)}</td><td>${score.toFixed(3)}</td></tr>`)
    .join("");

  // The literal below is kept flush-left on purpose: any leading whitespace
  // inside it would be emitted verbatim into the generated file.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>${esc(data.name)} \u2014 Eval Report</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: system-ui, -apple-system, sans-serif; background: #f8f9fa; color: #1a1a1a; padding: 2rem; }
h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
.meta { color: #666; margin-bottom: 1.5rem; font-size: 0.9rem; }
.meta span { margin-right: 1.5rem; }
.summary-table, .cases-table { width: 100%; border-collapse: collapse; margin-bottom: 2rem; }
.summary-table th, .summary-table td,
.cases-table th, .cases-table td { padding: 0.5rem 0.75rem; border: 1px solid #ddd; text-align: left; font-size: 0.85rem; }
.summary-table th, .cases-table th { background: #f1f3f5; font-weight: 600; }
.cases-table .text { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.cases-table .error-text { max-width: 480px; white-space: pre-wrap; word-break: break-word; }
.pass { color: #2b8a3e; }
.fail { color: #c92a2a; }
.error { color: #e67700; }
tr.pass:hover, tr.fail:hover, tr.error:hover { background: #f1f3f5; }
td.pass { background: #ebfbee; }
td.fail { background: #fff5f5; }
h2 { font-size: 1.2rem; margin: 1.5rem 0 0.75rem; }
</style>
</head>
<body>
<h1>${esc(data.name)}</h1>
<div class="meta">
<span><strong>Model:</strong> ${esc(data.model)}</span>
<span><strong>Cases:</strong> ${summary.totalCases}</span>
<span><strong>Pass:</strong> ${summary.passCount}</span>
<span><strong>Fail:</strong> ${summary.failCount}</span>
<span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
<span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
</div>

<h2>Mean Scores</h2>
<table class="summary-table">
<thead><tr><th>Scorer</th><th>Mean</th></tr></thead>
<tbody>${meanScoreRows}</tbody>
</table>

<h2>Cases</h2>
<table class="cases-table">
<thead>
<tr>
<th>#</th>
<th>Status</th>
<th>Input</th>
<th>Output</th>
${scorerHeaders}
<th>Latency</th>
<th>Error</th>
</tr>
</thead>
<tbody>
${caseRows}
</tbody>
</table>
</body>
</html>`;
}
|
|
1643
|
+
|
|
1644
|
+
// packages/evals/src/evaluate/index.ts
|
|
1645
|
+
/**
 * Entry point for running an evaluation.
 *
 * Options that carry a `models` property are dispatched to `evaluateEach`;
 * everything else goes to `evaluateSingle`.
 *
 * @param {object} options - Evaluation options.
 * @returns {Promise<*>} The result of the selected runner.
 */
async function evaluate(options) {
  const hasModelList = "models" in options;
  const runner = hasModelList ? evaluateEach : evaluateSingle;
  return runner(options);
}
|
|
1651
|
+
/**
 * Normalises the `store` option to a `RunStore` instance.
 *
 * An existing `RunStore` is returned unchanged; any other value is handed to
 * the `RunStore` constructor.
 *
 * @param {*} store - A `RunStore` or a value accepted by `new RunStore(...)`.
 * @returns {RunStore} The store to use for the run.
 */
function resolveStore(store) {
  if (store instanceof RunStore) {
    return store;
  }
  return new RunStore(store);
}
|
|
1654
|
+
/**
 * Creates an `EvalEmitter` and connects the given reporters to it.
 *
 * - On "run:start" the run id is remembered and each reporter's optional
 *   `onRunStart` hook is called with the event payload.
 * - On "case:scored" a case-result record is built from the payload, appended
 *   to the shared `cases` array, and passed to each reporter's optional
 *   `onCaseEnd` hook.
 *
 * @param {Array<object>} reporters - Reporters whose hooks should be invoked.
 * @returns {{ emitter: EvalEmitter, cases: Array<object>, getRunId: () => string }}
 *   The emitter, the live array of collected cases, and an accessor for the
 *   most recently seen run id ("" until a run has started).
 */
function wireReporters(reporters) {
  const emitter = new EvalEmitter();
  const cases = [];
  let runId = "";

  const handleRunStart = (event) => {
    runId = event.runId;
    for (const reporter of reporters) {
      reporter.onRunStart?.(event);
    }
  };

  const handleCaseScored = (event) => {
    const caseResult = {
      runId: event.runId,
      index: event.index,
      input: event.input,
      output: event.output,
      expected: event.expected,
      scores: event.scores,
      // A missing error is stored as an explicit null.
      error: event.error ?? null,
      latencyMs: event.latencyMs,
      tokensIn: event.tokensIn,
      tokensOut: event.tokensOut,
    };
    cases.push(caseResult);
    for (const reporter of reporters) {
      reporter.onCaseEnd?.(caseResult);
    }
  };

  emitter.on("run:start", handleRunStart);
  emitter.on("case:scored", handleCaseScored);

  return { emitter, cases, getRunId: () => runId };
}
|
|
1680
|
+
/**
 * Delivers the run-end payload to every reporter.
 *
 * `data.cases` is sorted in place by ascending `index` first, so reporters see
 * cases in dataset order regardless of completion order.
 *
 * Every reporter's optional `onRunEnd` hook is invoked and awaited to
 * completion, even if another reporter throws synchronously or rejects. This
 * keeps a failing reporter from leaving the others half-finished when the
 * caller resumes. After all have settled, the first failure in reporter order
 * is rethrown.
 *
 * @param {Array<object>} reporters - Reporters to notify.
 * @param {object} data - Run-end payload; must carry a `cases` array.
 * @returns {Promise<void>} Resolves once every reporter has finished.
 * @throws The rejection reason of the first reporter that failed.
 */
async function notifyRunEnd(reporters, data) {
  data.cases.sort((a, b) => a.index - b.index);

  // The async wrapper turns a synchronous throw into a rejection, so the
  // remaining reporters are still called.
  const outcomes = await Promise.allSettled(
    reporters.map(async (reporter) => reporter.onRunEnd?.(data))
  );

  const failure = outcomes.find((outcome) => outcome.status === "rejected");
  if (failure) {
    throw failure.reason;
  }
}
|
|
1684
|
+
/**
 * Runs one evaluation and reports it.
 *
 * Resolves the store, wires the reporters to a fresh emitter, runs `runEval`,
 * and then sends the collected cases and the summary to the reporters via
 * `notifyRunEnd`.
 *
 * `runEval` receives `options.threshold` exactly as given, whereas the
 * reporters receive the threshold with a 0.5 fallback applied.
 *
 * @param {object} options - Evaluation options for a single run.
 * @returns {Promise<*>} The summary returned by `runEval`.
 */
async function evaluateSingle(options) {
  const store = resolveStore(options.store);
  const threshold = options.threshold ?? 0.5;
  const wiring = wireReporters(options.reporters);

  const runConfig = {
    name: options.name,
    model: options.model,
    dataset: options.dataset,
    task: options.task,
    scorers: options.scorers,
    store,
    emitter: wiring.emitter,
    suiteId: options.suiteId,
    maxConcurrency: options.maxConcurrency,
    timeout: options.timeout,
    trials: options.trials,
    threshold: options.threshold,
  };
  const summary = await runEval(runConfig);

  const runEndPayload = {
    runId: wiring.getRunId(),
    name: options.name,
    model: options.model,
    summary,
    cases: wiring.cases,
    threshold,
  };
  await notifyRunEnd(options.reporters, runEndPayload);

  return summary;
}
|
|
1712
|
+
/**
 * Runs the same evaluation once per entry in `options.models`.
 *
 * The dataset is drained into an array up front and re-wrapped with
 * `dataset(items)` for each variant, so every variant is given the same
 * items. All runs share one store and one suite (created from
 * `options.name`) and are started together.
 *
 * Each run is named "<name> [<variant name>]", and the task is called with
 * the variant as its second argument.
 *
 * @param {object} options - Evaluation options including a `models` array.
 * @returns {Promise<Array<*>>} One `evaluateSingle` result per variant, in
 *   `options.models` order.
 */
async function evaluateEach(options) {
  const store = resolveStore(options.store);

  const items = [];
  for await (const entry of options.dataset) {
    items.push(entry);
  }

  const suite = store.createSuite(options.name);

  const runVariant = (variant) => {
    const variantOptions = {
      name: `${options.name} [${variant.name}]`,
      model: variant.name,
      dataset: dataset(items),
      task: (input) => options.task(input, variant),
      scorers: options.scorers,
      reporters: options.reporters,
      store,
      suiteId: suite.id,
      maxConcurrency: options.maxConcurrency,
      timeout: options.timeout,
      trials: options.trials,
      threshold: options.threshold,
    };
    return evaluateSingle(variantOptions);
  };

  const pendingRuns = options.models.map((variant) => runVariant(variant));
  return Promise.all(pendingRuns);
}
|
|
1738
|
+
export {
|
|
1739
|
+
Dataset,
|
|
1740
|
+
EvalEmitter,
|
|
1741
|
+
RunStore,
|
|
1742
|
+
all,
|
|
1743
|
+
any,
|
|
1744
|
+
compareRuns,
|
|
1745
|
+
consoleReporter,
|
|
1746
|
+
csvReporter,
|
|
1747
|
+
dataset,
|
|
1748
|
+
evaluate,
|
|
1749
|
+
exactMatch,
|
|
1750
|
+
factuality,
|
|
1751
|
+
hf,
|
|
1752
|
+
htmlReporter,
|
|
1753
|
+
includes,
|
|
1754
|
+
jsonMatch,
|
|
1755
|
+
jsonReporter,
|
|
1756
|
+
levenshtein,
|
|
1757
|
+
llmJudge,
|
|
1758
|
+
markdownReporter,
|
|
1759
|
+
regex,
|
|
1760
|
+
runEval,
|
|
1761
|
+
weighted
|
|
1762
|
+
};
|
|
1763
|
+
//# sourceMappingURL=index.js.map
|