@deepagents/evals 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js +6 -3
- package/dist/engine/index.js.map +2 -2
- package/dist/evaluate/index.js +6 -3
- package/dist/evaluate/index.js.map +2 -2
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +56 -58
- package/dist/index.js.map +2 -2
- package/dist/reporters/index.js +18 -1
- package/dist/reporters/index.js.map +2 -2
- package/dist/scorers/index.d.ts +2 -6
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/index.js +32 -54
- package/dist/scorers/index.js.map +2 -2
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -330,8 +330,10 @@ function dataset(source) {
|
|
|
330
330
|
}
|
|
331
331
|
|
|
332
332
|
// packages/evals/src/scorers/index.ts
|
|
333
|
-
import {
|
|
334
|
-
|
|
333
|
+
import {
|
|
334
|
+
Factuality as AutoevalsFactuality,
|
|
335
|
+
Levenshtein as AutoevalsLevenshtein
|
|
336
|
+
} from "autoevals";
|
|
335
337
|
var exactMatch = async ({ output, expected }) => {
|
|
336
338
|
const exp = expected == null ? "" : String(expected);
|
|
337
339
|
if (output === exp) return { score: 1 };
|
|
@@ -353,32 +355,32 @@ function regex(pattern) {
|
|
|
353
355
|
return { score: pattern.test(output) ? 1 : 0 };
|
|
354
356
|
};
|
|
355
357
|
}
|
|
356
|
-
function
|
|
357
|
-
if (
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
358
|
+
function normalizeScore(score) {
|
|
359
|
+
if (typeof score !== "number" || !Number.isFinite(score)) return 0;
|
|
360
|
+
return Math.max(0, Math.min(1, score));
|
|
361
|
+
}
|
|
362
|
+
function reasonFromMetadata(metadata) {
|
|
363
|
+
if (!metadata) return void 0;
|
|
364
|
+
const candidates = [
|
|
365
|
+
metadata.reason,
|
|
366
|
+
metadata.rationale,
|
|
367
|
+
metadata.explanation
|
|
368
|
+
];
|
|
369
|
+
for (const candidate of candidates) {
|
|
370
|
+
if (typeof candidate === "string" && candidate.trim().length > 0) {
|
|
371
|
+
return candidate;
|
|
367
372
|
}
|
|
368
|
-
[prev, curr] = [curr, prev];
|
|
369
373
|
}
|
|
370
|
-
return
|
|
374
|
+
return void 0;
|
|
371
375
|
}
|
|
372
376
|
var levenshtein = async ({ output, expected }) => {
|
|
373
377
|
const exp = expected == null ? "" : String(expected);
|
|
374
|
-
|
|
375
|
-
const
|
|
376
|
-
const distance = levenshteinDistance(output, exp);
|
|
377
|
-
const score = Math.max(0, 1 - distance / maxLen);
|
|
378
|
-
if (score === 1) return { score };
|
|
378
|
+
const result = await AutoevalsLevenshtein({ output, expected: exp });
|
|
379
|
+
const score = normalizeScore(result.score);
|
|
379
380
|
return {
|
|
380
381
|
score,
|
|
381
|
-
reason:
|
|
382
|
+
reason: reasonFromMetadata(result.metadata),
|
|
383
|
+
metadata: result.metadata
|
|
382
384
|
};
|
|
383
385
|
};
|
|
384
386
|
function deepEqual(a, b) {
|
|
@@ -412,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
|
|
|
412
414
|
return { score: 0, reason: "Failed to parse JSON" };
|
|
413
415
|
}
|
|
414
416
|
};
|
|
415
|
-
var llmScorerSchema = z.object({
|
|
416
|
-
score: z.number().min(0).max(1),
|
|
417
|
-
reason: z.string()
|
|
418
|
-
});
|
|
419
|
-
function llmJudge(config) {
|
|
420
|
-
return async ({ input, output, expected }) => {
|
|
421
|
-
const { object } = await generateObject({
|
|
422
|
-
model: config.model,
|
|
423
|
-
schema: llmScorerSchema,
|
|
424
|
-
prompt: `You are an expert evaluator. Grade the output based on the following criteria:
|
|
425
|
-
${config.criteria}
|
|
426
|
-
|
|
427
|
-
Input: ${JSON.stringify(input)}
|
|
428
|
-
Output: ${output}
|
|
429
|
-
${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
|
|
430
|
-
|
|
431
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
432
|
-
});
|
|
433
|
-
return { score: object.score, reason: object.reason };
|
|
434
|
-
};
|
|
435
|
-
}
|
|
436
417
|
function factuality(config) {
|
|
437
418
|
return async ({ input, output, expected }) => {
|
|
438
|
-
const
|
|
419
|
+
const result = await AutoevalsFactuality({
|
|
439
420
|
model: config.model,
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
Input: ${JSON.stringify(input)}
|
|
444
|
-
Output: ${output}
|
|
445
|
-
Expected reference: ${JSON.stringify(expected)}
|
|
446
|
-
|
|
447
|
-
Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
|
|
448
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
421
|
+
input: typeof input === "string" ? input : JSON.stringify(input),
|
|
422
|
+
output,
|
|
423
|
+
expected: expected == null ? void 0 : String(expected)
|
|
449
424
|
});
|
|
450
|
-
return {
|
|
425
|
+
return {
|
|
426
|
+
score: normalizeScore(result.score),
|
|
427
|
+
reason: reasonFromMetadata(result.metadata),
|
|
428
|
+
metadata: result.metadata
|
|
429
|
+
};
|
|
451
430
|
};
|
|
452
431
|
}
|
|
453
432
|
function all(...scorers) {
|
|
@@ -1042,7 +1021,8 @@ async function runEval(config) {
|
|
|
1042
1021
|
});
|
|
1043
1022
|
scores[sName] = {
|
|
1044
1023
|
score: clampScore(sr.score, sName),
|
|
1045
|
-
reason: sr.reason
|
|
1024
|
+
reason: sr.reason,
|
|
1025
|
+
metadata: sr.metadata
|
|
1046
1026
|
};
|
|
1047
1027
|
}
|
|
1048
1028
|
trialResults.push({ result, scores });
|
|
@@ -1068,7 +1048,8 @@ async function runEval(config) {
|
|
|
1068
1048
|
const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
|
|
1069
1049
|
finalScores[sName] = {
|
|
1070
1050
|
score: meanScore,
|
|
1071
|
-
reason: trialResults[trialResults.length - 1].scores[sName]?.reason
|
|
1051
|
+
reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
|
|
1052
|
+
metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
|
|
1072
1053
|
};
|
|
1073
1054
|
}
|
|
1074
1055
|
} else {
|
|
@@ -1085,7 +1066,8 @@ async function runEval(config) {
|
|
|
1085
1066
|
});
|
|
1086
1067
|
finalScores[sName] = {
|
|
1087
1068
|
score: clampScore(sr.score, sName),
|
|
1088
|
-
reason: sr.reason
|
|
1069
|
+
reason: sr.reason,
|
|
1070
|
+
metadata: sr.metadata
|
|
1089
1071
|
};
|
|
1090
1072
|
}
|
|
1091
1073
|
}
|
|
@@ -1455,6 +1437,22 @@ function truncateString(text, maxLength) {
|
|
|
1455
1437
|
if (text.length <= maxLength) return text;
|
|
1456
1438
|
return text.slice(0, maxLength) + "\u2026";
|
|
1457
1439
|
}
|
|
1440
|
+
function stringifyRationale(value) {
|
|
1441
|
+
if (typeof value === "string") {
|
|
1442
|
+
const trimmed = value.trim();
|
|
1443
|
+
return trimmed.length > 0 ? trimmed : void 0;
|
|
1444
|
+
}
|
|
1445
|
+
if (Array.isArray(value)) {
|
|
1446
|
+
const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
|
|
1447
|
+
if (parts.length > 0) return parts.join(" | ");
|
|
1448
|
+
}
|
|
1449
|
+
return void 0;
|
|
1450
|
+
}
|
|
1451
|
+
function scoreReasonWithMetadata(score) {
|
|
1452
|
+
const reason = score.reason?.trim();
|
|
1453
|
+
if (reason) return reason;
|
|
1454
|
+
return stringifyRationale(score.metadata?.["rationale"]);
|
|
1455
|
+
}
|
|
1458
1456
|
function renderSummaryTable(data) {
|
|
1459
1457
|
const { summary } = data;
|
|
1460
1458
|
const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
|
|
@@ -1516,7 +1514,8 @@ function renderCaseDetail(c, threshold, options) {
|
|
|
1516
1514
|
}
|
|
1517
1515
|
for (const [name, s] of entries) {
|
|
1518
1516
|
const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
|
|
1519
|
-
const
|
|
1517
|
+
const reason = scoreReasonWithMetadata(s);
|
|
1518
|
+
const reasonStr = reason ? ` \u2014 ${reason}` : "";
|
|
1520
1519
|
console.log(
|
|
1521
1520
|
` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
|
|
1522
1521
|
);
|
|
@@ -2073,7 +2072,6 @@ export {
|
|
|
2073
2072
|
jsonMatch,
|
|
2074
2073
|
jsonReporter,
|
|
2075
2074
|
levenshtein,
|
|
2076
|
-
llmJudge,
|
|
2077
2075
|
markdownReporter,
|
|
2078
2076
|
parseRecordSelection,
|
|
2079
2077
|
pickFromArray,
|