@deepagents/evals 0.19.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/dataset/index.d.ts +3 -0
- package/dist/dataset/index.d.ts.map +1 -1
- package/dist/dataset/index.js +84 -1
- package/dist/dataset/index.js.map +3 -3
- package/dist/dataset/record-selection.d.ts +8 -0
- package/dist/dataset/record-selection.d.ts.map +1 -0
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js +6 -3
- package/dist/engine/index.js.map +2 -2
- package/dist/evaluate/index.d.ts +16 -3
- package/dist/evaluate/index.d.ts.map +1 -1
- package/dist/evaluate/index.js +225 -359
- package/dist/evaluate/index.js.map +3 -3
- package/dist/index.d.ts +5 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +429 -110
- package/dist/index.js.map +4 -4
- package/dist/reporters/console.d.ts.map +1 -1
- package/dist/reporters/csv.d.ts.map +1 -1
- package/dist/reporters/html.d.ts.map +1 -1
- package/dist/reporters/index.js +129 -36
- package/dist/reporters/index.js.map +3 -3
- package/dist/reporters/markdown.d.ts.map +1 -1
- package/dist/scorers/index.d.ts +2 -6
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/index.js +32 -54
- package/dist/scorers/index.js.map +2 -2
- package/dist/store/index.d.ts +2 -0
- package/dist/store/index.d.ts.map +1 -1
- package/dist/store/index.js +22 -0
- package/dist/store/index.js.map +2 -2
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -60,6 +60,70 @@ async function fetchPage(url) {
|
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
+
// packages/evals/src/dataset/record-selection.ts
|
|
64
|
+
function parsePositiveInt(token) {
|
|
65
|
+
if (!/^\d+$/.test(token)) {
|
|
66
|
+
throw new Error(`Invalid record token "${token}"`);
|
|
67
|
+
}
|
|
68
|
+
const value = Number(token);
|
|
69
|
+
if (!Number.isInteger(value) || value < 1) {
|
|
70
|
+
throw new Error(`Record numbers must be >= 1. Received "${token}"`);
|
|
71
|
+
}
|
|
72
|
+
return value;
|
|
73
|
+
}
|
|
74
|
+
function parseRecordSelection(spec) {
|
|
75
|
+
const trimmed = spec.trim();
|
|
76
|
+
if (!trimmed) {
|
|
77
|
+
return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
|
|
78
|
+
}
|
|
79
|
+
const indexes = /* @__PURE__ */ new Set();
|
|
80
|
+
const parts = trimmed.split(",").map((part) => part.trim()).filter(Boolean);
|
|
81
|
+
if (parts.length === 0) {
|
|
82
|
+
throw new Error("Record selection is empty.");
|
|
83
|
+
}
|
|
84
|
+
for (const part of parts) {
|
|
85
|
+
const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part);
|
|
86
|
+
if (rangeMatch) {
|
|
87
|
+
const start = parsePositiveInt(rangeMatch[1]);
|
|
88
|
+
const end = parsePositiveInt(rangeMatch[2]);
|
|
89
|
+
if (end < start) {
|
|
90
|
+
throw new Error(
|
|
91
|
+
`Invalid range "${part}". Range end must be >= range start.`
|
|
92
|
+
);
|
|
93
|
+
}
|
|
94
|
+
for (let i = start; i <= end; i++) {
|
|
95
|
+
indexes.add(i - 1);
|
|
96
|
+
}
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
99
|
+
const value = parsePositiveInt(part);
|
|
100
|
+
indexes.add(value - 1);
|
|
101
|
+
}
|
|
102
|
+
return {
|
|
103
|
+
indexes,
|
|
104
|
+
normalized: Array.from(indexes).sort((a, b) => a - b).map((i) => String(i + 1)).join(",")
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
function pickFromArray(items, indexes) {
|
|
108
|
+
if (indexes.size === 0) return items;
|
|
109
|
+
return items.filter((_, i) => indexes.has(i));
|
|
110
|
+
}
|
|
111
|
+
async function* filterRecordsByIndex(source, indexes) {
|
|
112
|
+
if (indexes.size === 0) {
|
|
113
|
+
for await (const item of source) {
|
|
114
|
+
yield item;
|
|
115
|
+
}
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
118
|
+
let idx = 0;
|
|
119
|
+
for await (const item of source) {
|
|
120
|
+
if (indexes.has(idx)) {
|
|
121
|
+
yield item;
|
|
122
|
+
}
|
|
123
|
+
idx++;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
63
127
|
// packages/evals/src/dataset/index.ts
|
|
64
128
|
var Dataset = class _Dataset {
|
|
65
129
|
#source;
|
|
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
|
|
|
128
192
|
}
|
|
129
193
|
});
|
|
130
194
|
}
|
|
195
|
+
pick(indexes) {
|
|
196
|
+
const source = this.#source;
|
|
197
|
+
return new _Dataset(async function* () {
|
|
198
|
+
if (indexes.size === 0) {
|
|
199
|
+
yield* source();
|
|
200
|
+
return;
|
|
201
|
+
}
|
|
202
|
+
let idx = 0;
|
|
203
|
+
for await (const item of source()) {
|
|
204
|
+
if (indexes.has(idx)) {
|
|
205
|
+
yield item;
|
|
206
|
+
}
|
|
207
|
+
idx++;
|
|
208
|
+
}
|
|
209
|
+
});
|
|
210
|
+
}
|
|
131
211
|
async toArray() {
|
|
132
212
|
const result = [];
|
|
133
213
|
for await (const item of this.#source()) {
|
|
@@ -250,8 +330,10 @@ function dataset(source) {
|
|
|
250
330
|
}
|
|
251
331
|
|
|
252
332
|
// packages/evals/src/scorers/index.ts
|
|
253
|
-
import {
|
|
254
|
-
|
|
333
|
+
import {
|
|
334
|
+
Factuality as AutoevalsFactuality,
|
|
335
|
+
Levenshtein as AutoevalsLevenshtein
|
|
336
|
+
} from "autoevals";
|
|
255
337
|
var exactMatch = async ({ output, expected }) => {
|
|
256
338
|
const exp = expected == null ? "" : String(expected);
|
|
257
339
|
if (output === exp) return { score: 1 };
|
|
@@ -273,32 +355,32 @@ function regex(pattern) {
|
|
|
273
355
|
return { score: pattern.test(output) ? 1 : 0 };
|
|
274
356
|
};
|
|
275
357
|
}
|
|
276
|
-
function
|
|
277
|
-
if (
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
358
|
+
function normalizeScore(score) {
|
|
359
|
+
if (typeof score !== "number" || !Number.isFinite(score)) return 0;
|
|
360
|
+
return Math.max(0, Math.min(1, score));
|
|
361
|
+
}
|
|
362
|
+
function reasonFromMetadata(metadata) {
|
|
363
|
+
if (!metadata) return void 0;
|
|
364
|
+
const candidates = [
|
|
365
|
+
metadata.reason,
|
|
366
|
+
metadata.rationale,
|
|
367
|
+
metadata.explanation
|
|
368
|
+
];
|
|
369
|
+
for (const candidate of candidates) {
|
|
370
|
+
if (typeof candidate === "string" && candidate.trim().length > 0) {
|
|
371
|
+
return candidate;
|
|
287
372
|
}
|
|
288
|
-
[prev, curr] = [curr, prev];
|
|
289
373
|
}
|
|
290
|
-
return
|
|
374
|
+
return void 0;
|
|
291
375
|
}
|
|
292
376
|
var levenshtein = async ({ output, expected }) => {
|
|
293
377
|
const exp = expected == null ? "" : String(expected);
|
|
294
|
-
|
|
295
|
-
const
|
|
296
|
-
const distance = levenshteinDistance(output, exp);
|
|
297
|
-
const score = Math.max(0, 1 - distance / maxLen);
|
|
298
|
-
if (score === 1) return { score };
|
|
378
|
+
const result = await AutoevalsLevenshtein({ output, expected: exp });
|
|
379
|
+
const score = normalizeScore(result.score);
|
|
299
380
|
return {
|
|
300
381
|
score,
|
|
301
|
-
reason:
|
|
382
|
+
reason: reasonFromMetadata(result.metadata),
|
|
383
|
+
metadata: result.metadata
|
|
302
384
|
};
|
|
303
385
|
};
|
|
304
386
|
function deepEqual(a, b) {
|
|
@@ -332,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
|
|
|
332
414
|
return { score: 0, reason: "Failed to parse JSON" };
|
|
333
415
|
}
|
|
334
416
|
};
|
|
335
|
-
var llmScorerSchema = z.object({
|
|
336
|
-
score: z.number().min(0).max(1),
|
|
337
|
-
reason: z.string()
|
|
338
|
-
});
|
|
339
|
-
function llmJudge(config) {
|
|
340
|
-
return async ({ input, output, expected }) => {
|
|
341
|
-
const { object } = await generateObject({
|
|
342
|
-
model: config.model,
|
|
343
|
-
schema: llmScorerSchema,
|
|
344
|
-
prompt: `You are an expert evaluator. Grade the output based on the following criteria:
|
|
345
|
-
${config.criteria}
|
|
346
|
-
|
|
347
|
-
Input: ${JSON.stringify(input)}
|
|
348
|
-
Output: ${output}
|
|
349
|
-
${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
|
|
350
|
-
|
|
351
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
352
|
-
});
|
|
353
|
-
return { score: object.score, reason: object.reason };
|
|
354
|
-
};
|
|
355
|
-
}
|
|
356
417
|
function factuality(config) {
|
|
357
418
|
return async ({ input, output, expected }) => {
|
|
358
|
-
const
|
|
419
|
+
const result = await AutoevalsFactuality({
|
|
359
420
|
model: config.model,
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
Input: ${JSON.stringify(input)}
|
|
364
|
-
Output: ${output}
|
|
365
|
-
Expected reference: ${JSON.stringify(expected)}
|
|
366
|
-
|
|
367
|
-
Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
|
|
368
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
421
|
+
input: typeof input === "string" ? input : JSON.stringify(input),
|
|
422
|
+
output,
|
|
423
|
+
expected: expected == null ? void 0 : String(expected)
|
|
369
424
|
});
|
|
370
|
-
return {
|
|
425
|
+
return {
|
|
426
|
+
score: normalizeScore(result.score),
|
|
427
|
+
reason: reasonFromMetadata(result.metadata),
|
|
428
|
+
metadata: result.metadata
|
|
429
|
+
};
|
|
371
430
|
};
|
|
372
431
|
}
|
|
373
432
|
function all(...scorers) {
|
|
@@ -720,6 +779,28 @@ var RunStore = class {
|
|
|
720
779
|
totalTokensOut: totals.totalTokensOut
|
|
721
780
|
};
|
|
722
781
|
}
|
|
782
|
+
findSuiteByName(name) {
|
|
783
|
+
const row = this.#stmt(
|
|
784
|
+
"SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
|
|
785
|
+
).get(name);
|
|
786
|
+
return row ?? void 0;
|
|
787
|
+
}
|
|
788
|
+
getLatestCompletedRun(suiteId, model) {
|
|
789
|
+
const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
|
|
790
|
+
const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
|
|
791
|
+
if (!row) return void 0;
|
|
792
|
+
return {
|
|
793
|
+
id: row.id,
|
|
794
|
+
suite_id: row.suite_id,
|
|
795
|
+
name: row.name,
|
|
796
|
+
model: row.model,
|
|
797
|
+
config: row.config ? JSON.parse(row.config) : null,
|
|
798
|
+
started_at: row.started_at,
|
|
799
|
+
finished_at: row.finished_at,
|
|
800
|
+
status: row.status,
|
|
801
|
+
summary: row.summary ? JSON.parse(row.summary) : null
|
|
802
|
+
};
|
|
803
|
+
}
|
|
723
804
|
listSuites() {
|
|
724
805
|
const rows = this.#stmt(
|
|
725
806
|
"SELECT * FROM suites ORDER BY created_at DESC"
|
|
@@ -940,7 +1021,8 @@ async function runEval(config) {
|
|
|
940
1021
|
});
|
|
941
1022
|
scores[sName] = {
|
|
942
1023
|
score: clampScore(sr.score, sName),
|
|
943
|
-
reason: sr.reason
|
|
1024
|
+
reason: sr.reason,
|
|
1025
|
+
metadata: sr.metadata
|
|
944
1026
|
};
|
|
945
1027
|
}
|
|
946
1028
|
trialResults.push({ result, scores });
|
|
@@ -966,7 +1048,8 @@ async function runEval(config) {
|
|
|
966
1048
|
const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
|
|
967
1049
|
finalScores[sName] = {
|
|
968
1050
|
score: meanScore,
|
|
969
|
-
reason: trialResults[trialResults.length - 1].scores[sName]?.reason
|
|
1051
|
+
reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
|
|
1052
|
+
metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
|
|
970
1053
|
};
|
|
971
1054
|
}
|
|
972
1055
|
} else {
|
|
@@ -983,7 +1066,8 @@ async function runEval(config) {
|
|
|
983
1066
|
});
|
|
984
1067
|
finalScores[sName] = {
|
|
985
1068
|
score: clampScore(sr.score, sName),
|
|
986
|
-
reason: sr.reason
|
|
1069
|
+
reason: sr.reason,
|
|
1070
|
+
metadata: sr.metadata
|
|
987
1071
|
};
|
|
988
1072
|
}
|
|
989
1073
|
}
|
|
@@ -1283,26 +1367,51 @@ function createRunEndFileReporter(options) {
|
|
|
1283
1367
|
}
|
|
1284
1368
|
|
|
1285
1369
|
// packages/evals/src/reporters/console.ts
|
|
1370
|
+
var BAR_WIDTH = 20;
|
|
1371
|
+
function renderProgressBar(completed, total, elapsedMs) {
|
|
1372
|
+
const pct = total > 0 ? completed / total : 0;
|
|
1373
|
+
const filled = Math.round(pct * BAR_WIDTH);
|
|
1374
|
+
const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
|
|
1375
|
+
const pctStr = `${(pct * 100).toFixed(0)}%`;
|
|
1376
|
+
return ` ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
|
|
1377
|
+
}
|
|
1378
|
+
function statusLabel(status) {
|
|
1379
|
+
if (status === "pass") return chalk.green("PASS");
|
|
1380
|
+
if (status === "error") return chalk.yellow("ERROR");
|
|
1381
|
+
return chalk.red("FAIL");
|
|
1382
|
+
}
|
|
1286
1383
|
function consoleReporter(options) {
|
|
1287
1384
|
const verbosity = options?.verbosity ?? "normal";
|
|
1288
1385
|
let totalCases = 0;
|
|
1289
1386
|
let completed = 0;
|
|
1387
|
+
let startTime = 0;
|
|
1290
1388
|
return {
|
|
1291
1389
|
onRunStart(data) {
|
|
1292
1390
|
totalCases = data.totalCases;
|
|
1293
1391
|
completed = 0;
|
|
1392
|
+
startTime = Date.now();
|
|
1393
|
+
if (verbosity !== "quiet") {
|
|
1394
|
+
const label = data.name;
|
|
1395
|
+
console.log("");
|
|
1396
|
+
console.log(
|
|
1397
|
+
` ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
|
|
1398
|
+
);
|
|
1399
|
+
console.log(` ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
|
|
1400
|
+
console.log("");
|
|
1401
|
+
}
|
|
1294
1402
|
},
|
|
1295
1403
|
onCaseEnd() {
|
|
1296
1404
|
completed++;
|
|
1297
1405
|
if (verbosity !== "quiet") {
|
|
1406
|
+
const elapsed = Date.now() - startTime;
|
|
1298
1407
|
process.stdout.write(
|
|
1299
|
-
`\r
|
|
1408
|
+
`\r${renderProgressBar(completed, totalCases, elapsed)}`
|
|
1300
1409
|
);
|
|
1301
1410
|
}
|
|
1302
1411
|
},
|
|
1303
1412
|
onRunEnd(data) {
|
|
1304
1413
|
if (verbosity !== "quiet") {
|
|
1305
|
-
process.stdout.write("\r" + " ".repeat(
|
|
1414
|
+
process.stdout.write("\r" + " ".repeat(70) + "\r");
|
|
1306
1415
|
}
|
|
1307
1416
|
renderSummaryTable(data);
|
|
1308
1417
|
if (verbosity === "quiet") return;
|
|
@@ -1315,19 +1424,7 @@ function consoleReporter(options) {
|
|
|
1315
1424
|
});
|
|
1316
1425
|
}
|
|
1317
1426
|
} else {
|
|
1318
|
-
|
|
1319
|
-
(c) => getCaseStatus(c, data.threshold) !== "pass"
|
|
1320
|
-
);
|
|
1321
|
-
if (failing.length > 0) {
|
|
1322
|
-
console.log(chalk.dim(` Failing cases (${failing.length}):`));
|
|
1323
|
-
console.log("");
|
|
1324
|
-
for (const c of failing) {
|
|
1325
|
-
renderCaseDetail(c, data.threshold, {
|
|
1326
|
-
includeIO: true,
|
|
1327
|
-
maxStringLength: 4e3
|
|
1328
|
-
});
|
|
1329
|
-
}
|
|
1330
|
-
}
|
|
1427
|
+
renderFailuresByScorer(sorted, data.threshold);
|
|
1331
1428
|
}
|
|
1332
1429
|
}
|
|
1333
1430
|
};
|
|
@@ -1340,40 +1437,69 @@ function truncateString(text, maxLength) {
|
|
|
1340
1437
|
if (text.length <= maxLength) return text;
|
|
1341
1438
|
return text.slice(0, maxLength) + "\u2026";
|
|
1342
1439
|
}
|
|
1440
|
+
function stringifyRationale(value) {
|
|
1441
|
+
if (typeof value === "string") {
|
|
1442
|
+
const trimmed = value.trim();
|
|
1443
|
+
return trimmed.length > 0 ? trimmed : void 0;
|
|
1444
|
+
}
|
|
1445
|
+
if (Array.isArray(value)) {
|
|
1446
|
+
const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
|
|
1447
|
+
if (parts.length > 0) return parts.join(" | ");
|
|
1448
|
+
}
|
|
1449
|
+
return void 0;
|
|
1450
|
+
}
|
|
1451
|
+
function scoreReasonWithMetadata(score) {
|
|
1452
|
+
const reason = score.reason?.trim();
|
|
1453
|
+
if (reason) return reason;
|
|
1454
|
+
return stringifyRationale(score.metadata?.["rationale"]);
|
|
1455
|
+
}
|
|
1343
1456
|
function renderSummaryTable(data) {
|
|
1344
1457
|
const { summary } = data;
|
|
1345
|
-
const
|
|
1458
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1346
1459
|
console.log("");
|
|
1347
1460
|
console.log(chalk.bold(" Summary"));
|
|
1348
1461
|
console.log(chalk.dim(" " + "\u2500".repeat(60)));
|
|
1349
|
-
console.log(` ${chalk.dim("Eval:")}
|
|
1350
|
-
console.log(` ${chalk.dim("Model:")}
|
|
1351
|
-
console.log(` ${chalk.dim("
|
|
1462
|
+
console.log(` ${chalk.dim("Eval:")} ${data.name}`);
|
|
1463
|
+
console.log(` ${chalk.dim("Model:")} ${data.model}`);
|
|
1464
|
+
console.log(` ${chalk.dim("Threshold:")} ${data.threshold}`);
|
|
1465
|
+
console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
|
|
1352
1466
|
console.log(
|
|
1353
|
-
` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
|
|
1467
|
+
` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
|
|
1354
1468
|
);
|
|
1355
|
-
console.log(` ${chalk.dim("Scores:")} ${scoreStr}`);
|
|
1356
1469
|
console.log(
|
|
1357
|
-
` ${chalk.dim("Duration:")}
|
|
1470
|
+
` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
|
|
1358
1471
|
);
|
|
1359
1472
|
console.log(
|
|
1360
|
-
` ${chalk.dim("Tokens:")}
|
|
1473
|
+
` ${chalk.dim("Tokens:")} ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)} ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)} ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1361
1474
|
);
|
|
1475
|
+
const scoreEntries = Object.entries(summary.meanScores);
|
|
1476
|
+
if (scoreEntries.length > 0) {
|
|
1477
|
+
console.log("");
|
|
1478
|
+
console.log(chalk.bold(" Scores"));
|
|
1479
|
+
for (const [name, score] of scoreEntries) {
|
|
1480
|
+
const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
|
|
1481
|
+
console.log(
|
|
1482
|
+
` ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
|
|
1483
|
+
);
|
|
1484
|
+
}
|
|
1485
|
+
}
|
|
1362
1486
|
console.log(chalk.dim(" " + "\u2500".repeat(60)));
|
|
1363
1487
|
console.log("");
|
|
1364
1488
|
}
|
|
1365
1489
|
function renderCaseDetail(c, threshold, options) {
|
|
1366
1490
|
const entries = Object.entries(c.scores);
|
|
1367
|
-
const
|
|
1368
|
-
const prefix =
|
|
1491
|
+
const status = getCaseStatus(c, threshold);
|
|
1492
|
+
const prefix = statusLabel(status);
|
|
1369
1493
|
const includeIO = options?.includeIO ?? false;
|
|
1370
1494
|
const maxStringLength = options?.maxStringLength ?? 4e3;
|
|
1371
|
-
|
|
1495
|
+
const meta = `${chalk.dim(formatDuration(c.latencyMs))} ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
|
|
1496
|
+
console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)} ${meta}`);
|
|
1372
1497
|
const inputStr = stringifyUnknown(c.input, {
|
|
1373
1498
|
space: 2,
|
|
1374
1499
|
fallback: String(c.input)
|
|
1375
1500
|
});
|
|
1376
|
-
console.log(` ${chalk.dim("Input:")}
|
|
1501
|
+
console.log(` ${chalk.dim("Input:")}`);
|
|
1502
|
+
console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
|
|
1377
1503
|
if (includeIO) {
|
|
1378
1504
|
console.log(` ${chalk.dim("Output:")}`);
|
|
1379
1505
|
console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
|
|
@@ -1388,7 +1514,8 @@ function renderCaseDetail(c, threshold, options) {
|
|
|
1388
1514
|
}
|
|
1389
1515
|
for (const [name, s] of entries) {
|
|
1390
1516
|
const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
|
|
1391
|
-
const
|
|
1517
|
+
const reason = scoreReasonWithMetadata(s);
|
|
1518
|
+
const reasonStr = reason ? ` \u2014 ${reason}` : "";
|
|
1392
1519
|
console.log(
|
|
1393
1520
|
` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
|
|
1394
1521
|
);
|
|
@@ -1400,6 +1527,37 @@ function renderCaseDetail(c, threshold, options) {
|
|
|
1400
1527
|
}
|
|
1401
1528
|
console.log("");
|
|
1402
1529
|
}
|
|
1530
|
+
function renderFailuresByScorer(cases, threshold) {
|
|
1531
|
+
const scorerNames = /* @__PURE__ */ new Set();
|
|
1532
|
+
for (const c of cases) {
|
|
1533
|
+
for (const name of Object.keys(c.scores)) {
|
|
1534
|
+
scorerNames.add(name);
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
let hasFailures = false;
|
|
1538
|
+
for (const scorer of scorerNames) {
|
|
1539
|
+
const failing = cases.filter((c) => {
|
|
1540
|
+
const s = c.scores[scorer];
|
|
1541
|
+
return s && s.score < threshold || getCaseStatus(c, threshold) === "error";
|
|
1542
|
+
});
|
|
1543
|
+
if (failing.length === 0) continue;
|
|
1544
|
+
if (!hasFailures) {
|
|
1545
|
+
console.log(chalk.dim(" Failing by scorer:"));
|
|
1546
|
+
console.log("");
|
|
1547
|
+
hasFailures = true;
|
|
1548
|
+
}
|
|
1549
|
+
console.log(
|
|
1550
|
+
` ${chalk.bold(scorer)} ${chalk.dim(`(${failing.length} failures)`)}`
|
|
1551
|
+
);
|
|
1552
|
+
console.log(chalk.dim(" " + "\u2500".repeat(40)));
|
|
1553
|
+
for (const c of failing) {
|
|
1554
|
+
renderCaseDetail(c, threshold, {
|
|
1555
|
+
includeIO: true,
|
|
1556
|
+
maxStringLength: 4e3
|
|
1557
|
+
});
|
|
1558
|
+
}
|
|
1559
|
+
}
|
|
1560
|
+
}
|
|
1403
1561
|
|
|
1404
1562
|
// packages/evals/src/reporters/json.ts
|
|
1405
1563
|
import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
|
|
@@ -1441,6 +1599,7 @@ function csvReporter(options) {
|
|
|
1441
1599
|
const scorerNames = Object.keys(data.summary.meanScores);
|
|
1442
1600
|
const headerParts = [
|
|
1443
1601
|
"index",
|
|
1602
|
+
"status",
|
|
1444
1603
|
"input",
|
|
1445
1604
|
"output",
|
|
1446
1605
|
"expected",
|
|
@@ -1454,8 +1613,10 @@ function csvReporter(options) {
|
|
|
1454
1613
|
}
|
|
1455
1614
|
const rows = [headerParts.join(",")];
|
|
1456
1615
|
for (const c of data.cases) {
|
|
1616
|
+
const status = getCaseStatus(c, data.threshold);
|
|
1457
1617
|
const parts = [
|
|
1458
1618
|
String(c.index),
|
|
1619
|
+
status,
|
|
1459
1620
|
escapeCsv(c.input),
|
|
1460
1621
|
escapeCsv(c.output),
|
|
1461
1622
|
escapeCsv(c.expected),
|
|
@@ -1484,15 +1645,17 @@ function markdownReporter(options) {
|
|
|
1484
1645
|
const { summary } = data;
|
|
1485
1646
|
const scorerNames = Object.keys(summary.meanScores);
|
|
1486
1647
|
const lines = [];
|
|
1648
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1487
1649
|
lines.push(`# ${data.name}`);
|
|
1488
1650
|
lines.push("");
|
|
1489
1651
|
lines.push(`**Model:** ${data.model}`);
|
|
1652
|
+
lines.push(`**Threshold:** ${data.threshold}`);
|
|
1490
1653
|
lines.push(
|
|
1491
|
-
`**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
|
|
1654
|
+
`**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
|
|
1492
1655
|
);
|
|
1493
1656
|
lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
|
|
1494
1657
|
lines.push(
|
|
1495
|
-
`**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1658
|
+
`**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1496
1659
|
);
|
|
1497
1660
|
lines.push("");
|
|
1498
1661
|
lines.push("## Scores");
|
|
@@ -1511,6 +1674,7 @@ function markdownReporter(options) {
|
|
|
1511
1674
|
"Input",
|
|
1512
1675
|
...scorerNames,
|
|
1513
1676
|
"Latency",
|
|
1677
|
+
"Tokens",
|
|
1514
1678
|
"Error"
|
|
1515
1679
|
];
|
|
1516
1680
|
lines.push(`| ${caseHeader.join(" | ")} |`);
|
|
@@ -1528,7 +1692,8 @@ function markdownReporter(options) {
|
|
|
1528
1692
|
status,
|
|
1529
1693
|
input,
|
|
1530
1694
|
...scores,
|
|
1531
|
-
|
|
1695
|
+
formatDuration(c.latencyMs),
|
|
1696
|
+
`${c.tokensIn}/${c.tokensOut}`,
|
|
1532
1697
|
error
|
|
1533
1698
|
];
|
|
1534
1699
|
lines.push(`| ${row.join(" | ")} |`);
|
|
@@ -1553,9 +1718,10 @@ function esc(str) {
|
|
|
1553
1718
|
function renderHtml(data) {
|
|
1554
1719
|
const { summary } = data;
|
|
1555
1720
|
const scorerNames = Object.keys(summary.meanScores);
|
|
1721
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1556
1722
|
const caseRows = data.cases.map((c) => {
|
|
1557
1723
|
const status = getCaseStatus(c, data.threshold);
|
|
1558
|
-
const
|
|
1724
|
+
const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
|
|
1559
1725
|
const scoresCells = scorerNames.map((name) => {
|
|
1560
1726
|
const s = c.scores[name];
|
|
1561
1727
|
const score = s?.score ?? 0;
|
|
@@ -1563,13 +1729,19 @@ function renderHtml(data) {
|
|
|
1563
1729
|
const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
|
|
1564
1730
|
return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
|
|
1565
1731
|
}).join("");
|
|
1732
|
+
const expectedStr = stringifyUnknown(c.expected, {
|
|
1733
|
+
space: 0,
|
|
1734
|
+
fallback: ""
|
|
1735
|
+
});
|
|
1566
1736
|
return `<tr class="${status}">
|
|
1567
1737
|
<td>${c.index}</td>
|
|
1568
|
-
<td class="${status}">${
|
|
1738
|
+
<td class="${status}">${statusLabel2}</td>
|
|
1569
1739
|
<td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
|
|
1570
1740
|
<td class="text">${esc(c.output.slice(0, 120))}</td>
|
|
1741
|
+
<td class="text">${esc(expectedStr.slice(0, 120))}</td>
|
|
1571
1742
|
${scoresCells}
|
|
1572
|
-
<td>${c.latencyMs}
|
|
1743
|
+
<td>${formatDuration(c.latencyMs)}</td>
|
|
1744
|
+
<td>${c.tokensIn}/${c.tokensOut}</td>
|
|
1573
1745
|
<td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
|
|
1574
1746
|
</tr>`;
|
|
1575
1747
|
}).join("\n");
|
|
@@ -1607,11 +1779,14 @@ function renderHtml(data) {
|
|
|
1607
1779
|
<h1>${esc(data.name)}</h1>
|
|
1608
1780
|
<div class="meta">
|
|
1609
1781
|
<span><strong>Model:</strong> ${esc(data.model)}</span>
|
|
1782
|
+
<span><strong>Threshold:</strong> ${data.threshold}</span>
|
|
1610
1783
|
<span><strong>Cases:</strong> ${summary.totalCases}</span>
|
|
1611
1784
|
<span><strong>Pass:</strong> ${summary.passCount}</span>
|
|
1612
|
-
<span><strong>Fail:</strong> ${summary.failCount}</span>
|
|
1785
|
+
<span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
|
|
1613
1786
|
<span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
|
|
1614
|
-
<span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn
|
|
1787
|
+
<span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
|
|
1788
|
+
<span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
|
|
1789
|
+
<span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
|
|
1615
1790
|
</div>
|
|
1616
1791
|
|
|
1617
1792
|
<h2>Mean Scores</h2>
|
|
@@ -1628,8 +1803,10 @@ function renderHtml(data) {
|
|
|
1628
1803
|
<th>Status</th>
|
|
1629
1804
|
<th>Input</th>
|
|
1630
1805
|
<th>Output</th>
|
|
1806
|
+
<th>Expected</th>
|
|
1631
1807
|
${scorerHeaders}
|
|
1632
1808
|
<th>Latency</th>
|
|
1809
|
+
<th>Tokens</th>
|
|
1633
1810
|
<th>Error</th>
|
|
1634
1811
|
</tr>
|
|
1635
1812
|
</thead>
|
|
@@ -1642,14 +1819,149 @@ function renderHtml(data) {
|
|
|
1642
1819
|
}
|
|
1643
1820
|
|
|
1644
1821
|
// packages/evals/src/evaluate/index.ts
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1822
|
+
var EvalAssertionError = class extends Error {
|
|
1823
|
+
summary;
|
|
1824
|
+
constructor(summary) {
|
|
1825
|
+
const msg = Array.isArray(summary) ? `Eval assertion failed: ${summary.filter((s) => s.failCount > 0).length} of ${summary.length} model runs have failures` : `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
|
|
1826
|
+
super(msg);
|
|
1827
|
+
this.name = "EvalAssertionError";
|
|
1828
|
+
this.summary = summary;
|
|
1829
|
+
}
|
|
1830
|
+
};
|
|
1831
|
+
function resolveFailedIndexes(store, suiteName, model, threshold) {
|
|
1832
|
+
const suite = store.findSuiteByName(suiteName);
|
|
1833
|
+
if (!suite) {
|
|
1834
|
+
console.warn(
|
|
1835
|
+
`No previous suite found for '${suiteName}'. Running all cases.`
|
|
1836
|
+
);
|
|
1837
|
+
return /* @__PURE__ */ new Set();
|
|
1838
|
+
}
|
|
1839
|
+
const run = store.getLatestCompletedRun(suite.id, model);
|
|
1840
|
+
if (!run) {
|
|
1841
|
+
console.warn(
|
|
1842
|
+
`No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
|
|
1843
|
+
);
|
|
1844
|
+
return /* @__PURE__ */ new Set();
|
|
1648
1845
|
}
|
|
1649
|
-
|
|
1846
|
+
const failingCases = store.getFailingCases(run.id, threshold);
|
|
1847
|
+
if (failingCases.length === 0) {
|
|
1848
|
+
console.warn(`No failed cases in previous run. Running all cases.`);
|
|
1849
|
+
return /* @__PURE__ */ new Set();
|
|
1850
|
+
}
|
|
1851
|
+
console.warn(
|
|
1852
|
+
`Retrying ${failingCases.length} failed cases from previous run`
|
|
1853
|
+
);
|
|
1854
|
+
return new Set(failingCases.map((c) => c.idx));
|
|
1650
1855
|
}
|
|
1651
|
-
|
|
1652
|
-
|
|
1856
|
+
var EvalBuilder = class {
|
|
1857
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
1858
|
+
#options;
|
|
1859
|
+
#selection = { type: "all" };
|
|
1860
|
+
#shouldAssert = false;
|
|
1861
|
+
constructor(options) {
|
|
1862
|
+
this.#options = options;
|
|
1863
|
+
}
|
|
1864
|
+
#setSelection(selection) {
|
|
1865
|
+
if (this.#selection.type !== "all") {
|
|
1866
|
+
throw new Error(
|
|
1867
|
+
`Cannot combine .${this.#selection.type}() with .${selection.type}()`
|
|
1868
|
+
);
|
|
1869
|
+
}
|
|
1870
|
+
this.#selection = selection;
|
|
1871
|
+
return this;
|
|
1872
|
+
}
|
|
1873
|
+
failed() {
|
|
1874
|
+
return this.#setSelection({ type: "failed" });
|
|
1875
|
+
}
|
|
1876
|
+
cases(spec) {
|
|
1877
|
+
const { indexes } = parseRecordSelection(spec);
|
|
1878
|
+
return this.#setSelection({ type: "cases", indexes });
|
|
1879
|
+
}
|
|
1880
|
+
sample(count) {
|
|
1881
|
+
if (count < 1) {
|
|
1882
|
+
throw new Error("Sample count must be >= 1");
|
|
1883
|
+
}
|
|
1884
|
+
return this.#setSelection({ type: "sample", count });
|
|
1885
|
+
}
|
|
1886
|
+
assert() {
|
|
1887
|
+
this.#shouldAssert = true;
|
|
1888
|
+
return this;
|
|
1889
|
+
}
|
|
1890
|
+
then(onfulfilled, onrejected) {
|
|
1891
|
+
return this.#execute().then(onfulfilled, onrejected);
|
|
1892
|
+
}
|
|
1893
|
+
async #execute() {
|
|
1894
|
+
if ("models" in this.#options) {
|
|
1895
|
+
return this.#executeMulti();
|
|
1896
|
+
}
|
|
1897
|
+
return this.#executeSingle();
|
|
1898
|
+
}
|
|
1899
|
+
#applyDatasetFilter(ds) {
|
|
1900
|
+
switch (this.#selection.type) {
|
|
1901
|
+
case "all":
|
|
1902
|
+
return ds;
|
|
1903
|
+
case "cases":
|
|
1904
|
+
return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
|
|
1905
|
+
case "sample":
|
|
1906
|
+
return dataset(ds).sample(this.#selection.count);
|
|
1907
|
+
case "failed":
|
|
1908
|
+
return ds;
|
|
1909
|
+
}
|
|
1910
|
+
}
|
|
1911
|
+
async #executeSingle() {
|
|
1912
|
+
const options = this.#options;
|
|
1913
|
+
let ds = options.dataset;
|
|
1914
|
+
if (this.#selection.type === "failed") {
|
|
1915
|
+
const indexes = resolveFailedIndexes(
|
|
1916
|
+
options.store,
|
|
1917
|
+
options.name,
|
|
1918
|
+
options.model,
|
|
1919
|
+
options.threshold
|
|
1920
|
+
);
|
|
1921
|
+
if (indexes.size > 0) {
|
|
1922
|
+
ds = filterRecordsByIndex(ds, indexes);
|
|
1923
|
+
}
|
|
1924
|
+
} else {
|
|
1925
|
+
ds = this.#applyDatasetFilter(ds);
|
|
1926
|
+
}
|
|
1927
|
+
const result = await evaluateSingle({ ...options, dataset: ds });
|
|
1928
|
+
if (this.#shouldAssert && result.failCount > 0) {
|
|
1929
|
+
throw new EvalAssertionError(result);
|
|
1930
|
+
}
|
|
1931
|
+
return result;
|
|
1932
|
+
}
|
|
1933
|
+
async #executeMulti() {
|
|
1934
|
+
const options = this.#options;
|
|
1935
|
+
let result;
|
|
1936
|
+
if (this.#selection.type === "failed") {
|
|
1937
|
+
const perModelIndexes = /* @__PURE__ */ new Map();
|
|
1938
|
+
for (const variant of options.models) {
|
|
1939
|
+
perModelIndexes.set(
|
|
1940
|
+
variant.name,
|
|
1941
|
+
resolveFailedIndexes(
|
|
1942
|
+
options.store,
|
|
1943
|
+
options.name,
|
|
1944
|
+
variant.name,
|
|
1945
|
+
options.threshold
|
|
1946
|
+
)
|
|
1947
|
+
);
|
|
1948
|
+
}
|
|
1949
|
+
result = await evaluateEach(options, perModelIndexes);
|
|
1950
|
+
} else {
|
|
1951
|
+
const filtered = this.#applyDatasetFilter(options.dataset);
|
|
1952
|
+
result = await evaluateEach({ ...options, dataset: filtered });
|
|
1953
|
+
}
|
|
1954
|
+
if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
|
|
1955
|
+
throw new EvalAssertionError(result);
|
|
1956
|
+
}
|
|
1957
|
+
return result;
|
|
1958
|
+
}
|
|
1959
|
+
};
|
|
1960
|
+
function evaluate(options) {
|
|
1961
|
+
if ("models" in options) {
|
|
1962
|
+
return new EvalBuilder(options);
|
|
1963
|
+
}
|
|
1964
|
+
return new EvalBuilder(options);
|
|
1653
1965
|
}
|
|
1654
1966
|
function wireReporters(reporters) {
|
|
1655
1967
|
const emitter = new EvalEmitter();
|
|
@@ -1682,7 +1994,6 @@ async function notifyRunEnd(reporters, data) {
|
|
|
1682
1994
|
await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
|
|
1683
1995
|
}
|
|
1684
1996
|
async function evaluateSingle(options) {
|
|
1685
|
-
const store = resolveStore(options.store);
|
|
1686
1997
|
const threshold = options.threshold ?? 0.5;
|
|
1687
1998
|
const { emitter, cases, getRunId } = wireReporters(options.reporters);
|
|
1688
1999
|
const summary = await runEval({
|
|
@@ -1691,7 +2002,7 @@ async function evaluateSingle(options) {
|
|
|
1691
2002
|
dataset: options.dataset,
|
|
1692
2003
|
task: options.task,
|
|
1693
2004
|
scorers: options.scorers,
|
|
1694
|
-
store,
|
|
2005
|
+
store: options.store,
|
|
1695
2006
|
emitter,
|
|
1696
2007
|
suiteId: options.suiteId,
|
|
1697
2008
|
maxConcurrency: options.maxConcurrency,
|
|
@@ -1709,34 +2020,40 @@ async function evaluateSingle(options) {
|
|
|
1709
2020
|
});
|
|
1710
2021
|
return summary;
|
|
1711
2022
|
}
|
|
1712
|
-
async function evaluateEach(options) {
|
|
1713
|
-
const store = resolveStore(options.store);
|
|
2023
|
+
async function evaluateEach(options, perModelFailedIndexes) {
|
|
1714
2024
|
const items = [];
|
|
1715
2025
|
for await (const item of options.dataset) {
|
|
1716
2026
|
items.push(item);
|
|
1717
2027
|
}
|
|
1718
|
-
const suite = store.createSuite(options.name);
|
|
2028
|
+
const suite = options.store.createSuite(options.name);
|
|
1719
2029
|
return Promise.all(
|
|
1720
|
-
options.models.map(
|
|
1721
|
-
|
|
2030
|
+
options.models.map((variant) => {
|
|
2031
|
+
let ds = dataset(items);
|
|
2032
|
+
const failedIndexes = perModelFailedIndexes?.get(variant.name);
|
|
2033
|
+
if (failedIndexes && failedIndexes.size > 0) {
|
|
2034
|
+
ds = filterRecordsByIndex(ds, failedIndexes);
|
|
2035
|
+
}
|
|
2036
|
+
return evaluateSingle({
|
|
1722
2037
|
name: `${options.name} [${variant.name}]`,
|
|
1723
2038
|
model: variant.name,
|
|
1724
|
-
dataset:
|
|
2039
|
+
dataset: ds,
|
|
1725
2040
|
task: (input) => options.task(input, variant),
|
|
1726
2041
|
scorers: options.scorers,
|
|
1727
2042
|
reporters: options.reporters,
|
|
1728
|
-
store,
|
|
2043
|
+
store: options.store,
|
|
1729
2044
|
suiteId: suite.id,
|
|
1730
2045
|
maxConcurrency: options.maxConcurrency,
|
|
1731
2046
|
timeout: options.timeout,
|
|
1732
2047
|
trials: options.trials,
|
|
1733
2048
|
threshold: options.threshold
|
|
1734
|
-
})
|
|
1735
|
-
)
|
|
2049
|
+
});
|
|
2050
|
+
})
|
|
1736
2051
|
);
|
|
1737
2052
|
}
|
|
1738
2053
|
export {
|
|
1739
2054
|
Dataset,
|
|
2055
|
+
EvalAssertionError,
|
|
2056
|
+
EvalBuilder,
|
|
1740
2057
|
EvalEmitter,
|
|
1741
2058
|
RunStore,
|
|
1742
2059
|
all,
|
|
@@ -1748,14 +2065,16 @@ export {
|
|
|
1748
2065
|
evaluate,
|
|
1749
2066
|
exactMatch,
|
|
1750
2067
|
factuality,
|
|
2068
|
+
filterRecordsByIndex,
|
|
1751
2069
|
hf,
|
|
1752
2070
|
htmlReporter,
|
|
1753
2071
|
includes,
|
|
1754
2072
|
jsonMatch,
|
|
1755
2073
|
jsonReporter,
|
|
1756
2074
|
levenshtein,
|
|
1757
|
-
llmJudge,
|
|
1758
2075
|
markdownReporter,
|
|
2076
|
+
parseRecordSelection,
|
|
2077
|
+
pickFromArray,
|
|
1759
2078
|
regex,
|
|
1760
2079
|
runEval,
|
|
1761
2080
|
weighted
|