@deepagents/evals 0.20.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js +6 -3
- package/dist/engine/index.js.map +2 -2
- package/dist/evaluate/index.js +6 -3
- package/dist/evaluate/index.js.map +2 -2
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +66 -58
- package/dist/index.js.map +2 -2
- package/dist/reporters/index.js +18 -1
- package/dist/reporters/index.js.map +2 -2
- package/dist/scorers/index.d.ts +2 -6
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/index.js +32 -54
- package/dist/scorers/index.js.map +2 -2
- package/dist/store/index.d.ts +3 -0
- package/dist/store/index.d.ts.map +1 -1
- package/dist/store/index.js +10 -0
- package/dist/store/index.js.map +2 -2
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -330,8 +330,10 @@ function dataset(source) {
|
|
|
330
330
|
}
|
|
331
331
|
|
|
332
332
|
// packages/evals/src/scorers/index.ts
|
|
333
|
-
import {
|
|
334
|
-
|
|
333
|
+
import {
|
|
334
|
+
Factuality as AutoevalsFactuality,
|
|
335
|
+
Levenshtein as AutoevalsLevenshtein
|
|
336
|
+
} from "autoevals";
|
|
335
337
|
var exactMatch = async ({ output, expected }) => {
|
|
336
338
|
const exp = expected == null ? "" : String(expected);
|
|
337
339
|
if (output === exp) return { score: 1 };
|
|
@@ -353,32 +355,32 @@ function regex(pattern) {
|
|
|
353
355
|
return { score: pattern.test(output) ? 1 : 0 };
|
|
354
356
|
};
|
|
355
357
|
}
|
|
356
|
-
function
|
|
357
|
-
if (
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
358
|
+
function normalizeScore(score) {
|
|
359
|
+
if (typeof score !== "number" || !Number.isFinite(score)) return 0;
|
|
360
|
+
return Math.max(0, Math.min(1, score));
|
|
361
|
+
}
|
|
362
|
+
function reasonFromMetadata(metadata) {
|
|
363
|
+
if (!metadata) return void 0;
|
|
364
|
+
const candidates = [
|
|
365
|
+
metadata.reason,
|
|
366
|
+
metadata.rationale,
|
|
367
|
+
metadata.explanation
|
|
368
|
+
];
|
|
369
|
+
for (const candidate of candidates) {
|
|
370
|
+
if (typeof candidate === "string" && candidate.trim().length > 0) {
|
|
371
|
+
return candidate;
|
|
367
372
|
}
|
|
368
|
-
[prev, curr] = [curr, prev];
|
|
369
373
|
}
|
|
370
|
-
return
|
|
374
|
+
return void 0;
|
|
371
375
|
}
|
|
372
376
|
var levenshtein = async ({ output, expected }) => {
|
|
373
377
|
const exp = expected == null ? "" : String(expected);
|
|
374
|
-
|
|
375
|
-
const
|
|
376
|
-
const distance = levenshteinDistance(output, exp);
|
|
377
|
-
const score = Math.max(0, 1 - distance / maxLen);
|
|
378
|
-
if (score === 1) return { score };
|
|
378
|
+
const result = await AutoevalsLevenshtein({ output, expected: exp });
|
|
379
|
+
const score = normalizeScore(result.score);
|
|
379
380
|
return {
|
|
380
381
|
score,
|
|
381
|
-
reason:
|
|
382
|
+
reason: reasonFromMetadata(result.metadata),
|
|
383
|
+
metadata: result.metadata
|
|
382
384
|
};
|
|
383
385
|
};
|
|
384
386
|
function deepEqual(a, b) {
|
|
@@ -412,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
|
|
|
412
414
|
return { score: 0, reason: "Failed to parse JSON" };
|
|
413
415
|
}
|
|
414
416
|
};
|
|
415
|
-
var llmScorerSchema = z.object({
|
|
416
|
-
score: z.number().min(0).max(1),
|
|
417
|
-
reason: z.string()
|
|
418
|
-
});
|
|
419
|
-
function llmJudge(config) {
|
|
420
|
-
return async ({ input, output, expected }) => {
|
|
421
|
-
const { object } = await generateObject({
|
|
422
|
-
model: config.model,
|
|
423
|
-
schema: llmScorerSchema,
|
|
424
|
-
prompt: `You are an expert evaluator. Grade the output based on the following criteria:
|
|
425
|
-
${config.criteria}
|
|
426
|
-
|
|
427
|
-
Input: ${JSON.stringify(input)}
|
|
428
|
-
Output: ${output}
|
|
429
|
-
${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
|
|
430
|
-
|
|
431
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
432
|
-
});
|
|
433
|
-
return { score: object.score, reason: object.reason };
|
|
434
|
-
};
|
|
435
|
-
}
|
|
436
417
|
function factuality(config) {
|
|
437
418
|
return async ({ input, output, expected }) => {
|
|
438
|
-
const
|
|
419
|
+
const result = await AutoevalsFactuality({
|
|
439
420
|
model: config.model,
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
Input: ${JSON.stringify(input)}
|
|
444
|
-
Output: ${output}
|
|
445
|
-
Expected reference: ${JSON.stringify(expected)}
|
|
446
|
-
|
|
447
|
-
Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
|
|
448
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
421
|
+
input: typeof input === "string" ? input : JSON.stringify(input),
|
|
422
|
+
output,
|
|
423
|
+
expected: expected == null ? void 0 : String(expected)
|
|
449
424
|
});
|
|
450
|
-
return {
|
|
425
|
+
return {
|
|
426
|
+
score: normalizeScore(result.score),
|
|
427
|
+
reason: reasonFromMetadata(result.metadata),
|
|
428
|
+
metadata: result.metadata
|
|
429
|
+
};
|
|
451
430
|
};
|
|
452
431
|
}
|
|
453
432
|
function all(...scorers) {
|
|
@@ -623,6 +602,16 @@ var RunStore = class {
|
|
|
623
602
|
).run(id, name, now);
|
|
624
603
|
return { id, name, created_at: now };
|
|
625
604
|
}
|
|
605
|
+
getSuite(id) {
|
|
606
|
+
const row = this.#stmt("SELECT * FROM suites WHERE id = ?").get(id);
|
|
607
|
+
return row ?? void 0;
|
|
608
|
+
}
|
|
609
|
+
renameSuite(id, name) {
|
|
610
|
+
this.#stmt("UPDATE suites SET name = ? WHERE id = ?").run(name, id);
|
|
611
|
+
}
|
|
612
|
+
renameRun(id, name) {
|
|
613
|
+
this.#stmt("UPDATE runs SET name = ? WHERE id = ?").run(name, id);
|
|
614
|
+
}
|
|
626
615
|
createRun(run) {
|
|
627
616
|
const id = crypto.randomUUID();
|
|
628
617
|
const now = Date.now();
|
|
@@ -1042,7 +1031,8 @@ async function runEval(config) {
|
|
|
1042
1031
|
});
|
|
1043
1032
|
scores[sName] = {
|
|
1044
1033
|
score: clampScore(sr.score, sName),
|
|
1045
|
-
reason: sr.reason
|
|
1034
|
+
reason: sr.reason,
|
|
1035
|
+
metadata: sr.metadata
|
|
1046
1036
|
};
|
|
1047
1037
|
}
|
|
1048
1038
|
trialResults.push({ result, scores });
|
|
@@ -1068,7 +1058,8 @@ async function runEval(config) {
|
|
|
1068
1058
|
const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
|
|
1069
1059
|
finalScores[sName] = {
|
|
1070
1060
|
score: meanScore,
|
|
1071
|
-
reason: trialResults[trialResults.length - 1].scores[sName]?.reason
|
|
1061
|
+
reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
|
|
1062
|
+
metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
|
|
1072
1063
|
};
|
|
1073
1064
|
}
|
|
1074
1065
|
} else {
|
|
@@ -1085,7 +1076,8 @@ async function runEval(config) {
|
|
|
1085
1076
|
});
|
|
1086
1077
|
finalScores[sName] = {
|
|
1087
1078
|
score: clampScore(sr.score, sName),
|
|
1088
|
-
reason: sr.reason
|
|
1079
|
+
reason: sr.reason,
|
|
1080
|
+
metadata: sr.metadata
|
|
1089
1081
|
};
|
|
1090
1082
|
}
|
|
1091
1083
|
}
|
|
@@ -1455,6 +1447,22 @@ function truncateString(text, maxLength) {
|
|
|
1455
1447
|
if (text.length <= maxLength) return text;
|
|
1456
1448
|
return text.slice(0, maxLength) + "\u2026";
|
|
1457
1449
|
}
|
|
1450
|
+
function stringifyRationale(value) {
|
|
1451
|
+
if (typeof value === "string") {
|
|
1452
|
+
const trimmed = value.trim();
|
|
1453
|
+
return trimmed.length > 0 ? trimmed : void 0;
|
|
1454
|
+
}
|
|
1455
|
+
if (Array.isArray(value)) {
|
|
1456
|
+
const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
|
|
1457
|
+
if (parts.length > 0) return parts.join(" | ");
|
|
1458
|
+
}
|
|
1459
|
+
return void 0;
|
|
1460
|
+
}
|
|
1461
|
+
function scoreReasonWithMetadata(score) {
|
|
1462
|
+
const reason = score.reason?.trim();
|
|
1463
|
+
if (reason) return reason;
|
|
1464
|
+
return stringifyRationale(score.metadata?.["rationale"]);
|
|
1465
|
+
}
|
|
1458
1466
|
function renderSummaryTable(data) {
|
|
1459
1467
|
const { summary } = data;
|
|
1460
1468
|
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
@@ -1516,7 +1524,8 @@ function renderCaseDetail(c, threshold, options) {
|
|
|
1516
1524
|
}
|
|
1517
1525
|
for (const [name, s] of entries) {
|
|
1518
1526
|
const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
|
|
1519
|
-
const
|
|
1527
|
+
const reason = scoreReasonWithMetadata(s);
|
|
1528
|
+
const reasonStr = reason ? ` \u2014 ${reason}` : "";
|
|
1520
1529
|
console.log(
|
|
1521
1530
|
` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
|
|
1522
1531
|
);
|
|
@@ -2073,7 +2082,6 @@ export {
|
|
|
2073
2082
|
jsonMatch,
|
|
2074
2083
|
jsonReporter,
|
|
2075
2084
|
levenshtein,
|
|
2076
|
-
llmJudge,
|
|
2077
2085
|
markdownReporter,
|
|
2078
2086
|
parseRecordSelection,
|
|
2079
2087
|
pickFromArray,
|