@deepagents/evals 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dataset/index.d.ts +3 -0
- package/dist/dataset/index.d.ts.map +1 -1
- package/dist/dataset/index.js +84 -1
- package/dist/dataset/index.js.map +3 -3
- package/dist/dataset/record-selection.d.ts +8 -0
- package/dist/dataset/record-selection.d.ts.map +1 -0
- package/dist/evaluate/index.d.ts +16 -3
- package/dist/evaluate/index.d.ts.map +1 -1
- package/dist/evaluate/index.js +219 -356
- package/dist/evaluate/index.js.map +3 -3
- package/dist/index.d.ts +4 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +373 -52
- package/dist/index.js.map +4 -4
- package/dist/reporters/console.d.ts.map +1 -1
- package/dist/reporters/csv.d.ts.map +1 -1
- package/dist/reporters/html.d.ts.map +1 -1
- package/dist/reporters/index.js +111 -35
- package/dist/reporters/index.js.map +3 -3
- package/dist/reporters/markdown.d.ts.map +1 -1
- package/dist/store/index.d.ts +2 -0
- package/dist/store/index.d.ts.map +1 -1
- package/dist/store/index.js +22 -0
- package/dist/store/index.js.map +2 -2
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -60,6 +60,70 @@ async function fetchPage(url) {
|
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
+
// packages/evals/src/dataset/record-selection.ts
|
|
64
|
+
/**
 * Parse a single 1-based record number from a selection token.
 *
 * @param {string} token - Raw token text, e.g. "12".
 * @returns {number} The parsed integer, always >= 1.
 * @throws {Error} If the token is not made of digits only, is less than 1,
 *   or is too large to be represented exactly as a JavaScript integer.
 */
function parsePositiveInt(token) {
  if (!/^\d+$/.test(token)) {
    throw new Error(`Invalid record token "${token}"`);
  }
  const value = Number(token);
  // Number() silently rounds digit strings above Number.MAX_SAFE_INTEGER
  // (and returns Infinity for very long ones). Accepting those would select
  // a different record than the one the user typed, so reject them outright.
  if (!Number.isSafeInteger(value)) {
    throw new Error(`Record number is too large. Received "${token}"`);
  }
  if (value < 1) {
    throw new Error(`Record numbers must be >= 1. Received "${token}"`);
  }
  return value;
}
|
|
74
|
+
/**
 * Turn a comma-separated record selection such as "1,3-5" into a set of
 * 0-based indexes plus a canonical string form.
 *
 * Each comma-separated entry is either a single 1-based record number or an
 * inclusive range "start-end" (whitespace around the dash is allowed). Empty
 * entries are ignored.
 *
 * @param {string} spec - The raw selection text.
 * @returns {{ indexes: Set<number>, normalized: string }} `indexes` holds the
 *   selected 0-based positions; `normalized` is the ascending, comma-joined
 *   list of 1-based record numbers. A blank `spec` yields an empty set and "".
 * @throws {Error} If the spec contains only separators, an entry is not a
 *   valid record number, or a range ends before it starts.
 */
function parseRecordSelection(spec) {
  const text = spec.trim();
  if (text === "") {
    return { indexes: new Set(), normalized: "" };
  }

  const tokens = [];
  for (const raw of text.split(",")) {
    const token = raw.trim();
    if (token) {
      tokens.push(token);
    }
  }
  if (tokens.length === 0) {
    throw new Error("Record selection is empty.");
  }

  const rangePattern = /^(\d+)\s*-\s*(\d+)$/;
  const selected = new Set();
  for (const token of tokens) {
    const range = rangePattern.exec(token);
    // A lone number is handled as the one-element range [n, n].
    const first = parsePositiveInt(range ? range[1] : token);
    const last = range ? parsePositiveInt(range[2]) : first;
    if (last < first) {
      throw new Error(
        `Invalid range "${token}". Range end must be >= range start.`
      );
    }
    for (let recordNumber = first; recordNumber <= last; recordNumber += 1) {
      selected.add(recordNumber - 1);
    }
  }

  const ascending = [...selected].sort((left, right) => left - right);
  return {
    indexes: selected,
    normalized: ascending.map((index) => String(index + 1)).join(",")
  };
}
|
|
107
|
+
/**
 * Select the elements of `items` whose 0-based position is in `indexes`.
 *
 * @param {Array} items - The full list.
 * @param {Set<number>} indexes - 0-based positions to keep.
 * @returns {Array} The same `items` array (not a copy) when `indexes` is
 *   empty; otherwise a new array with the selected elements in original order.
 */
function pickFromArray(items, indexes) {
  const selectEverything = indexes.size === 0;
  return selectEverything
    ? items
    : items.filter((_item, position) => indexes.has(position));
}
|
|
111
|
+
/**
 * Lazily filter an (async) iterable down to the records at the given
 * 0-based positions.
 *
 * @param {AsyncIterable|Iterable} source - Records to read, in order.
 * @param {Set<number>} indexes - 0-based positions to keep. An empty set
 *   means "no filter": every record is yielded.
 * @yields Each selected record, in source order.
 */
async function* filterRecordsByIndex(source, indexes) {
  // Decided once, when iteration starts, exactly like the two-branch form.
  const passThrough = indexes.size === 0;
  let position = 0;
  for await (const record of source) {
    if (passThrough || indexes.has(position)) {
      yield record;
    }
    position += 1;
  }
}
|
|
126
|
+
|
|
63
127
|
// packages/evals/src/dataset/index.ts
|
|
64
128
|
var Dataset = class _Dataset {
|
|
65
129
|
#source;
|
|
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
|
|
|
128
192
|
}
|
|
129
193
|
});
|
|
130
194
|
}
|
|
195
|
+
pick(indexes) {
|
|
196
|
+
const source = this.#source;
|
|
197
|
+
return new _Dataset(async function* () {
|
|
198
|
+
if (indexes.size === 0) {
|
|
199
|
+
yield* source();
|
|
200
|
+
return;
|
|
201
|
+
}
|
|
202
|
+
let idx = 0;
|
|
203
|
+
for await (const item of source()) {
|
|
204
|
+
if (indexes.has(idx)) {
|
|
205
|
+
yield item;
|
|
206
|
+
}
|
|
207
|
+
idx++;
|
|
208
|
+
}
|
|
209
|
+
});
|
|
210
|
+
}
|
|
131
211
|
async toArray() {
|
|
132
212
|
const result = [];
|
|
133
213
|
for await (const item of this.#source()) {
|
|
@@ -720,6 +800,28 @@ var RunStore = class {
|
|
|
720
800
|
totalTokensOut: totals.totalTokensOut
|
|
721
801
|
};
|
|
722
802
|
}
|
|
803
|
+
findSuiteByName(name) {
|
|
804
|
+
const row = this.#stmt(
|
|
805
|
+
"SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
|
|
806
|
+
).get(name);
|
|
807
|
+
return row ?? void 0;
|
|
808
|
+
}
|
|
809
|
+
getLatestCompletedRun(suiteId, model) {
|
|
810
|
+
const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
|
|
811
|
+
const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
|
|
812
|
+
if (!row) return void 0;
|
|
813
|
+
return {
|
|
814
|
+
id: row.id,
|
|
815
|
+
suite_id: row.suite_id,
|
|
816
|
+
name: row.name,
|
|
817
|
+
model: row.model,
|
|
818
|
+
config: row.config ? JSON.parse(row.config) : null,
|
|
819
|
+
started_at: row.started_at,
|
|
820
|
+
finished_at: row.finished_at,
|
|
821
|
+
status: row.status,
|
|
822
|
+
summary: row.summary ? JSON.parse(row.summary) : null
|
|
823
|
+
};
|
|
824
|
+
}
|
|
723
825
|
listSuites() {
|
|
724
826
|
const rows = this.#stmt(
|
|
725
827
|
"SELECT * FROM suites ORDER BY created_at DESC"
|
|
@@ -1283,26 +1385,51 @@ function createRunEndFileReporter(options) {
|
|
|
1283
1385
|
}
|
|
1284
1386
|
|
|
1285
1387
|
// packages/evals/src/reporters/console.ts
|
|
1388
|
+
// Number of character cells in the progress bar.
var BAR_WIDTH = 20;

/**
 * Render a one-line textual progress bar.
 *
 * @param {number} completed - Number of cases finished so far.
 * @param {number} total - Total number of cases; 0 or less renders as 0%.
 * @param {number} elapsedMs - Elapsed time, passed to `formatDuration`.
 * @returns {string} Bar, percentage, "(completed/total)" and elapsed time.
 */
function renderProgressBar(completed, total, elapsedMs) {
  const ratio = total > 0 ? completed / total : 0;
  // Clamp to [0, 1]: if more completions are reported than `total` (or the
  // ratio is otherwise out of range), an unclamped value would pass a
  // negative count to String.prototype.repeat, which throws a RangeError.
  const pct = Number.isFinite(ratio) ? Math.min(1, Math.max(0, ratio)) : 0;
  const filled = Math.round(pct * BAR_WIDTH);
  const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
  const pctStr = `${(pct * 100).toFixed(0)}%`;
  return ` ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
}
|
|
1396
|
+
/**
 * Map a case status to its colored console label.
 *
 * @param {string} status - "pass", "error", or anything else.
 * @returns {string} Green "PASS", yellow "ERROR", or red "FAIL" for every
 *   other status.
 */
function statusLabel(status) {
  switch (status) {
    case "pass":
      return chalk.green("PASS");
    case "error":
      return chalk.yellow("ERROR");
    default:
      return chalk.red("FAIL");
  }
}
|
|
1286
1401
|
function consoleReporter(options) {
|
|
1287
1402
|
const verbosity = options?.verbosity ?? "normal";
|
|
1288
1403
|
let totalCases = 0;
|
|
1289
1404
|
let completed = 0;
|
|
1405
|
+
let startTime = 0;
|
|
1290
1406
|
return {
|
|
1291
1407
|
onRunStart(data) {
|
|
1292
1408
|
totalCases = data.totalCases;
|
|
1293
1409
|
completed = 0;
|
|
1410
|
+
startTime = Date.now();
|
|
1411
|
+
if (verbosity !== "quiet") {
|
|
1412
|
+
const label = data.name;
|
|
1413
|
+
console.log("");
|
|
1414
|
+
console.log(
|
|
1415
|
+
` ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
|
|
1416
|
+
);
|
|
1417
|
+
console.log(` ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
|
|
1418
|
+
console.log("");
|
|
1419
|
+
}
|
|
1294
1420
|
},
|
|
1295
1421
|
onCaseEnd() {
|
|
1296
1422
|
completed++;
|
|
1297
1423
|
if (verbosity !== "quiet") {
|
|
1424
|
+
const elapsed = Date.now() - startTime;
|
|
1298
1425
|
process.stdout.write(
|
|
1299
|
-
`\r
|
|
1426
|
+
`\r${renderProgressBar(completed, totalCases, elapsed)}`
|
|
1300
1427
|
);
|
|
1301
1428
|
}
|
|
1302
1429
|
},
|
|
1303
1430
|
onRunEnd(data) {
|
|
1304
1431
|
if (verbosity !== "quiet") {
|
|
1305
|
-
process.stdout.write("\r" + " ".repeat(
|
|
1432
|
+
process.stdout.write("\r" + " ".repeat(70) + "\r");
|
|
1306
1433
|
}
|
|
1307
1434
|
renderSummaryTable(data);
|
|
1308
1435
|
if (verbosity === "quiet") return;
|
|
@@ -1315,19 +1442,7 @@ function consoleReporter(options) {
|
|
|
1315
1442
|
});
|
|
1316
1443
|
}
|
|
1317
1444
|
} else {
|
|
1318
|
-
|
|
1319
|
-
(c) => getCaseStatus(c, data.threshold) !== "pass"
|
|
1320
|
-
);
|
|
1321
|
-
if (failing.length > 0) {
|
|
1322
|
-
console.log(chalk.dim(` Failing cases (${failing.length}):`));
|
|
1323
|
-
console.log("");
|
|
1324
|
-
for (const c of failing) {
|
|
1325
|
-
renderCaseDetail(c, data.threshold, {
|
|
1326
|
-
includeIO: true,
|
|
1327
|
-
maxStringLength: 4e3
|
|
1328
|
-
});
|
|
1329
|
-
}
|
|
1330
|
-
}
|
|
1445
|
+
renderFailuresByScorer(sorted, data.threshold);
|
|
1331
1446
|
}
|
|
1332
1447
|
}
|
|
1333
1448
|
};
|
|
@@ -1342,38 +1457,51 @@ function truncateString(text, maxLength) {
|
|
|
1342
1457
|
}
|
|
1343
1458
|
function renderSummaryTable(data) {
|
|
1344
1459
|
const { summary } = data;
|
|
1345
|
-
const
|
|
1460
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1346
1461
|
console.log("");
|
|
1347
1462
|
console.log(chalk.bold(" Summary"));
|
|
1348
1463
|
console.log(chalk.dim(" " + "\u2500".repeat(60)));
|
|
1349
|
-
console.log(` ${chalk.dim("Eval:")}
|
|
1350
|
-
console.log(` ${chalk.dim("Model:")}
|
|
1351
|
-
console.log(` ${chalk.dim("
|
|
1464
|
+
console.log(` ${chalk.dim("Eval:")} ${data.name}`);
|
|
1465
|
+
console.log(` ${chalk.dim("Model:")} ${data.model}`);
|
|
1466
|
+
console.log(` ${chalk.dim("Threshold:")} ${data.threshold}`);
|
|
1467
|
+
console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
|
|
1352
1468
|
console.log(
|
|
1353
|
-
` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
|
|
1469
|
+
` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
|
|
1354
1470
|
);
|
|
1355
|
-
console.log(` ${chalk.dim("Scores:")} ${scoreStr}`);
|
|
1356
1471
|
console.log(
|
|
1357
|
-
` ${chalk.dim("Duration:")}
|
|
1472
|
+
` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
|
|
1358
1473
|
);
|
|
1359
1474
|
console.log(
|
|
1360
|
-
` ${chalk.dim("Tokens:")}
|
|
1475
|
+
` ${chalk.dim("Tokens:")} ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)} ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)} ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1361
1476
|
);
|
|
1477
|
+
const scoreEntries = Object.entries(summary.meanScores);
|
|
1478
|
+
if (scoreEntries.length > 0) {
|
|
1479
|
+
console.log("");
|
|
1480
|
+
console.log(chalk.bold(" Scores"));
|
|
1481
|
+
for (const [name, score] of scoreEntries) {
|
|
1482
|
+
const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
|
|
1483
|
+
console.log(
|
|
1484
|
+
` ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
|
|
1485
|
+
);
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1362
1488
|
console.log(chalk.dim(" " + "\u2500".repeat(60)));
|
|
1363
1489
|
console.log("");
|
|
1364
1490
|
}
|
|
1365
1491
|
function renderCaseDetail(c, threshold, options) {
|
|
1366
1492
|
const entries = Object.entries(c.scores);
|
|
1367
|
-
const
|
|
1368
|
-
const prefix =
|
|
1493
|
+
const status = getCaseStatus(c, threshold);
|
|
1494
|
+
const prefix = statusLabel(status);
|
|
1369
1495
|
const includeIO = options?.includeIO ?? false;
|
|
1370
1496
|
const maxStringLength = options?.maxStringLength ?? 4e3;
|
|
1371
|
-
|
|
1497
|
+
const meta = `${chalk.dim(formatDuration(c.latencyMs))} ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
|
|
1498
|
+
console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)} ${meta}`);
|
|
1372
1499
|
const inputStr = stringifyUnknown(c.input, {
|
|
1373
1500
|
space: 2,
|
|
1374
1501
|
fallback: String(c.input)
|
|
1375
1502
|
});
|
|
1376
|
-
console.log(` ${chalk.dim("Input:")}
|
|
1503
|
+
console.log(` ${chalk.dim("Input:")}`);
|
|
1504
|
+
console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
|
|
1377
1505
|
if (includeIO) {
|
|
1378
1506
|
console.log(` ${chalk.dim("Output:")}`);
|
|
1379
1507
|
console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
|
|
@@ -1400,6 +1528,37 @@ function renderCaseDetail(c, threshold, options) {
|
|
|
1400
1528
|
}
|
|
1401
1529
|
console.log("");
|
|
1402
1530
|
}
|
|
1531
|
+
/**
 * Print failing cases to the console, grouped by scorer.
 *
 * For every scorer name found in any case's `scores`, lists the cases whose
 * score for that scorer is below `threshold`. Cases whose overall status is
 * "error" are listed under every scorer. Nothing is printed when no scorer
 * has failures.
 *
 * @param {Array<object>} cases - Case results, each with a `scores` map.
 * @param {number} threshold - Scores strictly below this count as failures.
 * @returns {void}
 */
function renderFailuresByScorer(cases, threshold) {
  // Collect scorer names in first-seen order across all cases.
  const scorerNames = new Set(cases.flatMap((c) => Object.keys(c.scores)));

  let headerPrinted = false;
  for (const scorer of scorerNames) {
    const failing = cases.filter((c) => {
      const entry = c.scores[scorer];
      if (entry && entry.score < threshold) {
        return true;
      }
      return getCaseStatus(c, threshold) === "error";
    });
    if (failing.length === 0) {
      continue;
    }

    // The section header is emitted once, before the first failing group.
    if (!headerPrinted) {
      headerPrinted = true;
      console.log(chalk.dim(" Failing by scorer:"));
      console.log("");
    }

    const countNote = chalk.dim(`(${failing.length} failures)`);
    console.log(` ${chalk.bold(scorer)} ${countNote}`);
    console.log(chalk.dim(" " + "\u2500".repeat(40)));
    failing.forEach((c) => {
      renderCaseDetail(c, threshold, {
        includeIO: true,
        maxStringLength: 4e3
      });
    });
  }
}
|
|
1403
1562
|
|
|
1404
1563
|
// packages/evals/src/reporters/json.ts
|
|
1405
1564
|
import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
|
|
@@ -1441,6 +1600,7 @@ function csvReporter(options) {
|
|
|
1441
1600
|
const scorerNames = Object.keys(data.summary.meanScores);
|
|
1442
1601
|
const headerParts = [
|
|
1443
1602
|
"index",
|
|
1603
|
+
"status",
|
|
1444
1604
|
"input",
|
|
1445
1605
|
"output",
|
|
1446
1606
|
"expected",
|
|
@@ -1454,8 +1614,10 @@ function csvReporter(options) {
|
|
|
1454
1614
|
}
|
|
1455
1615
|
const rows = [headerParts.join(",")];
|
|
1456
1616
|
for (const c of data.cases) {
|
|
1617
|
+
const status = getCaseStatus(c, data.threshold);
|
|
1457
1618
|
const parts = [
|
|
1458
1619
|
String(c.index),
|
|
1620
|
+
status,
|
|
1459
1621
|
escapeCsv(c.input),
|
|
1460
1622
|
escapeCsv(c.output),
|
|
1461
1623
|
escapeCsv(c.expected),
|
|
@@ -1484,15 +1646,17 @@ function markdownReporter(options) {
|
|
|
1484
1646
|
const { summary } = data;
|
|
1485
1647
|
const scorerNames = Object.keys(summary.meanScores);
|
|
1486
1648
|
const lines = [];
|
|
1649
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1487
1650
|
lines.push(`# ${data.name}`);
|
|
1488
1651
|
lines.push("");
|
|
1489
1652
|
lines.push(`**Model:** ${data.model}`);
|
|
1653
|
+
lines.push(`**Threshold:** ${data.threshold}`);
|
|
1490
1654
|
lines.push(
|
|
1491
|
-
`**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
|
|
1655
|
+
`**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
|
|
1492
1656
|
);
|
|
1493
1657
|
lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
|
|
1494
1658
|
lines.push(
|
|
1495
|
-
`**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1659
|
+
`**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
|
|
1496
1660
|
);
|
|
1497
1661
|
lines.push("");
|
|
1498
1662
|
lines.push("## Scores");
|
|
@@ -1511,6 +1675,7 @@ function markdownReporter(options) {
|
|
|
1511
1675
|
"Input",
|
|
1512
1676
|
...scorerNames,
|
|
1513
1677
|
"Latency",
|
|
1678
|
+
"Tokens",
|
|
1514
1679
|
"Error"
|
|
1515
1680
|
];
|
|
1516
1681
|
lines.push(`| ${caseHeader.join(" | ")} |`);
|
|
@@ -1528,7 +1693,8 @@ function markdownReporter(options) {
|
|
|
1528
1693
|
status,
|
|
1529
1694
|
input,
|
|
1530
1695
|
...scores,
|
|
1531
|
-
|
|
1696
|
+
formatDuration(c.latencyMs),
|
|
1697
|
+
`${c.tokensIn}/${c.tokensOut}`,
|
|
1532
1698
|
error
|
|
1533
1699
|
];
|
|
1534
1700
|
lines.push(`| ${row.join(" | ")} |`);
|
|
@@ -1553,9 +1719,10 @@ function esc(str) {
|
|
|
1553
1719
|
function renderHtml(data) {
|
|
1554
1720
|
const { summary } = data;
|
|
1555
1721
|
const scorerNames = Object.keys(summary.meanScores);
|
|
1722
|
+
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
1556
1723
|
const caseRows = data.cases.map((c) => {
|
|
1557
1724
|
const status = getCaseStatus(c, data.threshold);
|
|
1558
|
-
const
|
|
1725
|
+
const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
|
|
1559
1726
|
const scoresCells = scorerNames.map((name) => {
|
|
1560
1727
|
const s = c.scores[name];
|
|
1561
1728
|
const score = s?.score ?? 0;
|
|
@@ -1563,13 +1730,19 @@ function renderHtml(data) {
|
|
|
1563
1730
|
const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
|
|
1564
1731
|
return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
|
|
1565
1732
|
}).join("");
|
|
1733
|
+
const expectedStr = stringifyUnknown(c.expected, {
|
|
1734
|
+
space: 0,
|
|
1735
|
+
fallback: ""
|
|
1736
|
+
});
|
|
1566
1737
|
return `<tr class="${status}">
|
|
1567
1738
|
<td>${c.index}</td>
|
|
1568
|
-
<td class="${status}">${
|
|
1739
|
+
<td class="${status}">${statusLabel2}</td>
|
|
1569
1740
|
<td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
|
|
1570
1741
|
<td class="text">${esc(c.output.slice(0, 120))}</td>
|
|
1742
|
+
<td class="text">${esc(expectedStr.slice(0, 120))}</td>
|
|
1571
1743
|
${scoresCells}
|
|
1572
|
-
<td>${c.latencyMs}
|
|
1744
|
+
<td>${formatDuration(c.latencyMs)}</td>
|
|
1745
|
+
<td>${c.tokensIn}/${c.tokensOut}</td>
|
|
1573
1746
|
<td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
|
|
1574
1747
|
</tr>`;
|
|
1575
1748
|
}).join("\n");
|
|
@@ -1607,11 +1780,14 @@ function renderHtml(data) {
|
|
|
1607
1780
|
<h1>${esc(data.name)}</h1>
|
|
1608
1781
|
<div class="meta">
|
|
1609
1782
|
<span><strong>Model:</strong> ${esc(data.model)}</span>
|
|
1783
|
+
<span><strong>Threshold:</strong> ${data.threshold}</span>
|
|
1610
1784
|
<span><strong>Cases:</strong> ${summary.totalCases}</span>
|
|
1611
1785
|
<span><strong>Pass:</strong> ${summary.passCount}</span>
|
|
1612
|
-
<span><strong>Fail:</strong> ${summary.failCount}</span>
|
|
1786
|
+
<span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
|
|
1613
1787
|
<span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
|
|
1614
|
-
<span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn
|
|
1788
|
+
<span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
|
|
1789
|
+
<span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
|
|
1790
|
+
<span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
|
|
1615
1791
|
</div>
|
|
1616
1792
|
|
|
1617
1793
|
<h2>Mean Scores</h2>
|
|
@@ -1628,8 +1804,10 @@ function renderHtml(data) {
|
|
|
1628
1804
|
<th>Status</th>
|
|
1629
1805
|
<th>Input</th>
|
|
1630
1806
|
<th>Output</th>
|
|
1807
|
+
<th>Expected</th>
|
|
1631
1808
|
${scorerHeaders}
|
|
1632
1809
|
<th>Latency</th>
|
|
1810
|
+
<th>Tokens</th>
|
|
1633
1811
|
<th>Error</th>
|
|
1634
1812
|
</tr>
|
|
1635
1813
|
</thead>
|
|
@@ -1642,14 +1820,149 @@ function renderHtml(data) {
|
|
|
1642
1820
|
}
|
|
1643
1821
|
|
|
1644
1822
|
// packages/evals/src/evaluate/index.ts
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1823
|
+
/**
 * Error raised when an asserted evaluation has failing cases.
 *
 * `summary` is kept on the instance: either a single run summary
 * (`failCount`, `totalCases`) or an array of per-model summaries.
 */
var EvalAssertionError = class extends Error {
  summary;

  /**
   * @param {object|Array<object>} summary - One run summary, or an array of
   *   summaries (one per model run). The message reports failed cases for a
   *   single summary, or the number of runs with failures for an array.
   */
  constructor(summary) {
    let message;
    if (Array.isArray(summary)) {
      let runsWithFailures = 0;
      for (const run of summary) {
        if (run.failCount > 0) {
          runsWithFailures += 1;
        }
      }
      message = `Eval assertion failed: ${runsWithFailures} of ${summary.length} model runs have failures`;
    } else {
      message = `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
    }
    super(message);
    this.name = "EvalAssertionError";
    this.summary = summary;
  }
};
|
|
1832
|
+
/**
 * Work out which dataset indexes failed in the latest completed run of a
 * suite, so that only those cases can be retried.
 *
 * Whenever there is nothing to retry (no suite, no completed run, or no
 * failing cases) a warning is logged and an empty set is returned, which
 * callers treat as "run all cases".
 *
 * @param {object} store - Run store providing `findSuiteByName`,
 *   `getLatestCompletedRun` and `getFailingCases`.
 * @param {string} suiteName - Name of the suite to look up.
 * @param {string} [model] - Restrict the lookup to runs of this model.
 * @param {number} [threshold] - Pass/fail threshold; defaults to 0.5.
 * @returns {Set<number>} The `idx` values of the failing cases, or an empty
 *   set when there is nothing to retry.
 */
function resolveFailedIndexes(store, suiteName, model, threshold) {
  // Callers forward `options.threshold` unmodified, which may be undefined.
  // Apply the same 0.5 default that evaluateSingle uses so "failed" means
  // the same thing when selecting cases as it did when the run was scored.
  const effectiveThreshold = threshold ?? 0.5;

  const suite = store.findSuiteByName(suiteName);
  if (!suite) {
    console.warn(
      `No previous suite found for '${suiteName}'. Running all cases.`
    );
    return new Set();
  }

  const run = store.getLatestCompletedRun(suite.id, model);
  if (!run) {
    console.warn(
      `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
    );
    return new Set();
  }

  const failingCases = store.getFailingCases(run.id, effectiveThreshold);
  if (failingCases.length === 0) {
    console.warn(`No failed cases in previous run. Running all cases.`);
    return new Set();
  }

  console.warn(
    `Retrying ${failingCases.length} failed cases from previous run`
  );
  return new Set(failingCases.map((c) => c.idx));
}
|
|
1651
|
-
|
|
1652
|
-
|
|
1857
|
+
/**
 * Chainable, thenable wrapper around an evaluation.
 *
 * A builder holds the evaluation options plus at most one case selection
 * (`failed()`, `cases()` or `sample()`) and an optional `assert()` flag.
 * Nothing runs until the builder is awaited or `then` is called; each call
 * to `then` starts a new execution.
 */
var EvalBuilder = class {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  #options;
  // Which records to run; "all" until one selection method is called.
  #selection = { type: "all" };
  // When true, execution rejects with EvalAssertionError if any case failed.
  #shouldAssert = false;

  /**
   * @param {object} options - Evaluation options. The presence of a
   *   `models` key selects multi-model execution.
   */
  constructor(options) {
    this.#options = options;
  }

  /**
   * Record the selection, refusing to combine two selection methods.
   * @throws {Error} If a selection other than "all" is already set.
   */
  #setSelection(selection) {
    if (this.#selection.type !== "all") {
      throw new Error(
        `Cannot combine .${this.#selection.type}() with .${selection.type}()`
      );
    }
    this.#selection = selection;
    return this;
  }

  /** Run only the cases that failed in the latest completed run. */
  failed() {
    return this.#setSelection({ type: "failed" });
  }

  /**
   * Run only the records named by a selection spec such as "1,3-5".
   * @param {string} spec - Parsed with `parseRecordSelection`.
   */
  cases(spec) {
    const { indexes } = parseRecordSelection(spec);
    return this.#setSelection({ type: "cases", indexes });
  }

  /**
   * Run a sample of the dataset.
   * @param {number} count - Sample size; must be >= 1.
   * @throws {Error} If `count` is not a value >= 1.
   */
  sample(count) {
    // `!(count >= 1)` rather than `count < 1`: NaN and undefined compare
    // false both ways, so the latter would silently let them through.
    if (!(count >= 1)) {
      throw new Error("Sample count must be >= 1");
    }
    return this.#setSelection({ type: "sample", count });
  }

  /** Make execution reject with EvalAssertionError when any case fails. */
  assert() {
    this.#shouldAssert = true;
    return this;
  }

  /**
   * Thenable hook: starts an execution and chains the handlers onto it.
   * This is what makes `await builder` work.
   */
  then(onfulfilled, onrejected) {
    return this.#execute().then(onfulfilled, onrejected);
  }

  /** Dispatch to multi-model or single-model execution. */
  async #execute() {
    if ("models" in this.#options) {
      return this.#executeMulti();
    }
    return this.#executeSingle();
  }

  /**
   * Apply the "cases" or "sample" selection to a dataset. "all" and
   * "failed" return the dataset unchanged; the execute methods handle
   * "failed" themselves because it needs a store lookup.
   */
  #applyDatasetFilter(ds) {
    switch (this.#selection.type) {
      case "all":
        return ds;
      case "cases":
        return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
      case "sample":
        return dataset(ds).sample(this.#selection.count);
      case "failed":
        return ds;
    }
  }

  /**
   * Run a single-model evaluation with the selection applied.
   * @throws {EvalAssertionError} If asserting and any case failed.
   */
  async #executeSingle() {
    const options = this.#options;
    let ds = options.dataset;
    if (this.#selection.type === "failed") {
      const indexes = resolveFailedIndexes(
        options.store,
        options.name,
        options.model,
        options.threshold
      );
      // An empty set means nothing to retry: keep the full dataset.
      if (indexes.size > 0) {
        ds = filterRecordsByIndex(ds, indexes);
      }
    } else {
      ds = this.#applyDatasetFilter(ds);
    }
    const result = await evaluateSingle({ ...options, dataset: ds });
    if (this.#shouldAssert && result.failCount > 0) {
      throw new EvalAssertionError(result);
    }
    return result;
  }

  /**
   * Run a multi-model evaluation with the selection applied. For
   * "failed", the failing indexes are resolved separately per model.
   * @throws {EvalAssertionError} If asserting and any model run has failures.
   */
  async #executeMulti() {
    const options = this.#options;
    let result;
    if (this.#selection.type === "failed") {
      const perModelIndexes = new Map();
      for (const variant of options.models) {
        perModelIndexes.set(
          variant.name,
          resolveFailedIndexes(
            options.store,
            options.name,
            variant.name,
            options.threshold
          )
        );
      }
      result = await evaluateEach(options, perModelIndexes);
    } else {
      const filtered = this.#applyDatasetFilter(options.dataset);
      result = await evaluateEach({ ...options, dataset: filtered });
    }
    if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
      throw new EvalAssertionError(result);
    }
    return result;
  }
};
|
|
1961
|
+
/**
 * Create a lazy, chainable evaluation for the given options.
 *
 * Single-model and multi-model options produce the same builder; the
 * builder itself checks for a `models` key when it executes.
 *
 * @param {object} options - Evaluation options.
 * @returns {EvalBuilder} A builder that runs when awaited.
 * @throws {TypeError} If `options` is not an object.
 */
function evaluate(options) {
  // Fail eagerly on non-object input, as the former `"models" in options`
  // check did, instead of deferring the error until the builder is awaited.
  const isObjectLike =
    options !== null &&
    (typeof options === "object" || typeof options === "function");
  if (!isObjectLike) {
    throw new TypeError("evaluate() requires an options object");
  }
  return new EvalBuilder(options);
}
|
|
1654
1967
|
function wireReporters(reporters) {
|
|
1655
1968
|
const emitter = new EvalEmitter();
|
|
@@ -1682,7 +1995,6 @@ async function notifyRunEnd(reporters, data) {
|
|
|
1682
1995
|
await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
|
|
1683
1996
|
}
|
|
1684
1997
|
async function evaluateSingle(options) {
|
|
1685
|
-
const store = resolveStore(options.store);
|
|
1686
1998
|
const threshold = options.threshold ?? 0.5;
|
|
1687
1999
|
const { emitter, cases, getRunId } = wireReporters(options.reporters);
|
|
1688
2000
|
const summary = await runEval({
|
|
@@ -1691,7 +2003,7 @@ async function evaluateSingle(options) {
|
|
|
1691
2003
|
dataset: options.dataset,
|
|
1692
2004
|
task: options.task,
|
|
1693
2005
|
scorers: options.scorers,
|
|
1694
|
-
store,
|
|
2006
|
+
store: options.store,
|
|
1695
2007
|
emitter,
|
|
1696
2008
|
suiteId: options.suiteId,
|
|
1697
2009
|
maxConcurrency: options.maxConcurrency,
|
|
@@ -1709,34 +2021,40 @@ async function evaluateSingle(options) {
|
|
|
1709
2021
|
});
|
|
1710
2022
|
return summary;
|
|
1711
2023
|
}
|
|
1712
|
-
async function evaluateEach(options) {
|
|
1713
|
-
const store = resolveStore(options.store);
|
|
2024
|
+
async function evaluateEach(options, perModelFailedIndexes) {
|
|
1714
2025
|
const items = [];
|
|
1715
2026
|
for await (const item of options.dataset) {
|
|
1716
2027
|
items.push(item);
|
|
1717
2028
|
}
|
|
1718
|
-
const suite = store.createSuite(options.name);
|
|
2029
|
+
const suite = options.store.createSuite(options.name);
|
|
1719
2030
|
return Promise.all(
|
|
1720
|
-
options.models.map(
|
|
1721
|
-
|
|
2031
|
+
options.models.map((variant) => {
|
|
2032
|
+
let ds = dataset(items);
|
|
2033
|
+
const failedIndexes = perModelFailedIndexes?.get(variant.name);
|
|
2034
|
+
if (failedIndexes && failedIndexes.size > 0) {
|
|
2035
|
+
ds = filterRecordsByIndex(ds, failedIndexes);
|
|
2036
|
+
}
|
|
2037
|
+
return evaluateSingle({
|
|
1722
2038
|
name: `${options.name} [${variant.name}]`,
|
|
1723
2039
|
model: variant.name,
|
|
1724
|
-
dataset:
|
|
2040
|
+
dataset: ds,
|
|
1725
2041
|
task: (input) => options.task(input, variant),
|
|
1726
2042
|
scorers: options.scorers,
|
|
1727
2043
|
reporters: options.reporters,
|
|
1728
|
-
store,
|
|
2044
|
+
store: options.store,
|
|
1729
2045
|
suiteId: suite.id,
|
|
1730
2046
|
maxConcurrency: options.maxConcurrency,
|
|
1731
2047
|
timeout: options.timeout,
|
|
1732
2048
|
trials: options.trials,
|
|
1733
2049
|
threshold: options.threshold
|
|
1734
|
-
})
|
|
1735
|
-
)
|
|
2050
|
+
});
|
|
2051
|
+
})
|
|
1736
2052
|
);
|
|
1737
2053
|
}
|
|
1738
2054
|
export {
|
|
1739
2055
|
Dataset,
|
|
2056
|
+
EvalAssertionError,
|
|
2057
|
+
EvalBuilder,
|
|
1740
2058
|
EvalEmitter,
|
|
1741
2059
|
RunStore,
|
|
1742
2060
|
all,
|
|
@@ -1748,6 +2066,7 @@ export {
|
|
|
1748
2066
|
evaluate,
|
|
1749
2067
|
exactMatch,
|
|
1750
2068
|
factuality,
|
|
2069
|
+
filterRecordsByIndex,
|
|
1751
2070
|
hf,
|
|
1752
2071
|
htmlReporter,
|
|
1753
2072
|
includes,
|
|
@@ -1756,6 +2075,8 @@ export {
|
|
|
1756
2075
|
levenshtein,
|
|
1757
2076
|
llmJudge,
|
|
1758
2077
|
markdownReporter,
|
|
2078
|
+
parseRecordSelection,
|
|
2079
|
+
pickFromArray,
|
|
1759
2080
|
regex,
|
|
1760
2081
|
runEval,
|
|
1761
2082
|
weighted
|