@mastra/evals 0.10.6 → 0.10.8-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{dist-IVAARSAW.cjs → dist-JD6MNRVB.cjs} +8 -8
- package/dist/{dist-5JXLPLM2.js → dist-ZXFGMR47.js} +8 -8
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/{magic-string.es-LD4FLE5J.js → magic-string.es-MNZ6ZGOL.js} +1 -1
- package/dist/{magic-string.es-66FD77JZ.cjs → magic-string.es-T2QO2IBJ.cjs} +1 -1
- package/dist/scorers/llm/index.cjs +27 -22
- package/dist/scorers/llm/index.js +27 -22
- package/package.json +4 -4
|
@@ -11988,7 +11988,7 @@ function createTestHook(name, handler) {
|
|
|
11988
11988
|
};
|
|
11989
11989
|
}
|
|
11990
11990
|
|
|
11991
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
11991
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/utils.XdZDrNZV.js
|
|
11992
11992
|
var NAME_WORKER_STATE = "__vitest_worker__";
|
|
11993
11993
|
function getWorkerState() {
|
|
11994
11994
|
const workerState = globalThis[NAME_WORKER_STATE];
|
|
@@ -12036,7 +12036,7 @@ async function waitForImportsToResolve() {
|
|
|
12036
12036
|
await waitForImportsToResolve();
|
|
12037
12037
|
}
|
|
12038
12038
|
|
|
12039
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
12039
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
|
|
12040
12040
|
var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
|
|
12041
12041
|
function getDefaultExportFromCjs3(x) {
|
|
12042
12042
|
return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, "default") ? x["default"] : x;
|
|
@@ -12889,7 +12889,7 @@ function offsetToLineNumber(source, offset) {
|
|
|
12889
12889
|
return line + 1;
|
|
12890
12890
|
}
|
|
12891
12891
|
async function saveInlineSnapshots(environment, snapshots) {
|
|
12892
|
-
const MagicString = (await import('./magic-string.es-66FD77JZ.cjs')).default;
|
|
12892
|
+
const MagicString = (await import('./magic-string.es-T2QO2IBJ.cjs')).default;
|
|
12893
12893
|
const files = new Set(snapshots.map((i) => i.file));
|
|
12894
12894
|
await Promise.all(Array.from(files).map(async (file) => {
|
|
12895
12895
|
const snaps = snapshots.filter((i) => i.file === file);
|
|
@@ -13666,7 +13666,7 @@ var SnapshotClient = class {
|
|
|
13666
13666
|
}
|
|
13667
13667
|
};
|
|
13668
13668
|
|
|
13669
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13669
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/date.Bq6ZW5rf.js
|
|
13670
13670
|
var RealDate = Date;
|
|
13671
13671
|
var now2 = null;
|
|
13672
13672
|
var MockDate = class _MockDate extends RealDate {
|
|
@@ -13714,7 +13714,7 @@ function resetDate() {
|
|
|
13714
13714
|
globalThis.Date = RealDate;
|
|
13715
13715
|
}
|
|
13716
13716
|
|
|
13717
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13717
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/vi.bdSIJ99Y.js
|
|
13718
13718
|
var unsupported = [
|
|
13719
13719
|
"matchSnapshot",
|
|
13720
13720
|
"toMatchSnapshot",
|
|
@@ -16400,7 +16400,7 @@ function getImporter(name) {
|
|
|
16400
16400
|
return stack?.file || "";
|
|
16401
16401
|
}
|
|
16402
16402
|
|
|
16403
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16403
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/benchmark.CYdenmiT.js
|
|
16404
16404
|
var benchFns = /* @__PURE__ */ new WeakMap();
|
|
16405
16405
|
var benchOptsMap = /* @__PURE__ */ new WeakMap();
|
|
16406
16406
|
var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
|
|
@@ -16426,12 +16426,12 @@ function formatName2(name) {
|
|
|
16426
16426
|
return typeof name === "string" ? name : typeof name === "function" ? name.name || "<anonymous>" : String(name);
|
|
16427
16427
|
}
|
|
16428
16428
|
|
|
16429
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16429
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/index.CdQS2e2Q.js
|
|
16430
16430
|
chunkIS3BZTWE_cjs.__toESM(require_dist(), 1);
|
|
16431
16431
|
var assertType = function assertType2() {
|
|
16432
16432
|
};
|
|
16433
16433
|
|
|
16434
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16434
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/index.js
|
|
16435
16435
|
var import_expect_type2 = chunkIS3BZTWE_cjs.__toESM(require_dist(), 1);
|
|
16436
16436
|
var export_expectTypeOf = import_expect_type2.expectTypeOf;
|
|
16437
16437
|
/*! Bundled license information:
|
|
@@ -11986,7 +11986,7 @@ function createTestHook(name, handler) {
|
|
|
11986
11986
|
};
|
|
11987
11987
|
}
|
|
11988
11988
|
|
|
11989
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
11989
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/utils.XdZDrNZV.js
|
|
11990
11990
|
var NAME_WORKER_STATE = "__vitest_worker__";
|
|
11991
11991
|
function getWorkerState() {
|
|
11992
11992
|
const workerState = globalThis[NAME_WORKER_STATE];
|
|
@@ -12034,7 +12034,7 @@ async function waitForImportsToResolve() {
|
|
|
12034
12034
|
await waitForImportsToResolve();
|
|
12035
12035
|
}
|
|
12036
12036
|
|
|
12037
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
12037
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
|
|
12038
12038
|
var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
|
|
12039
12039
|
function getDefaultExportFromCjs3(x) {
|
|
12040
12040
|
return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, "default") ? x["default"] : x;
|
|
@@ -12887,7 +12887,7 @@ function offsetToLineNumber(source, offset) {
|
|
|
12887
12887
|
return line + 1;
|
|
12888
12888
|
}
|
|
12889
12889
|
async function saveInlineSnapshots(environment, snapshots) {
|
|
12890
|
-
const MagicString = (await import('./magic-string.es-LD4FLE5J.js')).default;
|
|
12890
|
+
const MagicString = (await import('./magic-string.es-MNZ6ZGOL.js')).default;
|
|
12891
12891
|
const files = new Set(snapshots.map((i) => i.file));
|
|
12892
12892
|
await Promise.all(Array.from(files).map(async (file) => {
|
|
12893
12893
|
const snaps = snapshots.filter((i) => i.file === file);
|
|
@@ -13664,7 +13664,7 @@ var SnapshotClient = class {
|
|
|
13664
13664
|
}
|
|
13665
13665
|
};
|
|
13666
13666
|
|
|
13667
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13667
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/date.Bq6ZW5rf.js
|
|
13668
13668
|
var RealDate = Date;
|
|
13669
13669
|
var now2 = null;
|
|
13670
13670
|
var MockDate = class _MockDate extends RealDate {
|
|
@@ -13712,7 +13712,7 @@ function resetDate() {
|
|
|
13712
13712
|
globalThis.Date = RealDate;
|
|
13713
13713
|
}
|
|
13714
13714
|
|
|
13715
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13715
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/vi.bdSIJ99Y.js
|
|
13716
13716
|
var unsupported = [
|
|
13717
13717
|
"matchSnapshot",
|
|
13718
13718
|
"toMatchSnapshot",
|
|
@@ -16398,7 +16398,7 @@ function getImporter(name) {
|
|
|
16398
16398
|
return stack?.file || "";
|
|
16399
16399
|
}
|
|
16400
16400
|
|
|
16401
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16401
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/benchmark.CYdenmiT.js
|
|
16402
16402
|
var benchFns = /* @__PURE__ */ new WeakMap();
|
|
16403
16403
|
var benchOptsMap = /* @__PURE__ */ new WeakMap();
|
|
16404
16404
|
var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
|
|
@@ -16424,12 +16424,12 @@ function formatName2(name) {
|
|
|
16424
16424
|
return typeof name === "string" ? name : typeof name === "function" ? name.name || "<anonymous>" : String(name);
|
|
16425
16425
|
}
|
|
16426
16426
|
|
|
16427
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16427
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/index.CdQS2e2Q.js
|
|
16428
16428
|
__toESM(require_dist(), 1);
|
|
16429
16429
|
var assertType = function assertType2() {
|
|
16430
16430
|
};
|
|
16431
16431
|
|
|
16432
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16432
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/index.js
|
|
16433
16433
|
var import_expect_type2 = __toESM(require_dist(), 1);
|
|
16434
16434
|
var export_expectTypeOf = import_expect_type2.expectTypeOf;
|
|
16435
16435
|
/*! Bundled license information:
|
package/dist/index.cjs
CHANGED
|
@@ -41,7 +41,7 @@ var getCurrentTestInfo = async () => {
|
|
|
41
41
|
};
|
|
42
42
|
}
|
|
43
43
|
try {
|
|
44
|
-
const vitest = await import('./dist-IVAARSAW.cjs');
|
|
44
|
+
const vitest = await import('./dist-JD6MNRVB.cjs');
|
|
45
45
|
if (typeof vitest !== "undefined" && vitest.expect?.getState) {
|
|
46
46
|
const state = vitest.expect.getState();
|
|
47
47
|
return {
|
package/dist/index.js
CHANGED
|
@@ -39,7 +39,7 @@ var getCurrentTestInfo = async () => {
|
|
|
39
39
|
};
|
|
40
40
|
}
|
|
41
41
|
try {
|
|
42
|
-
const vitest = await import('./dist-5JXLPLM2.js');
|
|
42
|
+
const vitest = await import('./dist-ZXFGMR47.js');
|
|
43
43
|
if (typeof vitest !== "undefined" && vitest.expect?.getState) {
|
|
44
44
|
const state = vitest.expect.getState();
|
|
45
45
|
return {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.
|
|
1
|
+
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.4/node_modules/@jridgewell/sourcemap-codec/dist/sourcemap-codec.mjs
|
|
2
2
|
var comma = ",".charCodeAt(0);
|
|
3
3
|
var semicolon = ";".charCodeAt(0);
|
|
4
4
|
var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
4
|
|
|
5
|
-
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.
|
|
5
|
+
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.4/node_modules/@jridgewell/sourcemap-codec/dist/sourcemap-codec.mjs
|
|
6
6
|
var comma = ",".charCodeAt(0);
|
|
7
7
|
var semicolon = ";".charCodeAt(0);
|
|
8
8
|
var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
@@ -236,7 +236,7 @@ function createAnswerRelevancyScorer({
|
|
|
236
236
|
},
|
|
237
237
|
analyze: {
|
|
238
238
|
description: "Score the relevance of the statements to the input",
|
|
239
|
-
outputSchema: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })),
|
|
239
|
+
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
240
240
|
createPrompt: ({ run }) => createScorePrompt(JSON.stringify(run.input), run.extractStepResult?.statements || [])
|
|
241
241
|
},
|
|
242
242
|
reason: {
|
|
@@ -246,18 +246,18 @@ function createAnswerRelevancyScorer({
|
|
|
246
246
|
input: run.input.map((input) => input.content).join(", "),
|
|
247
247
|
output: run.output.text,
|
|
248
248
|
score: run.score,
|
|
249
|
-
results: run.analyzeStepResult,
|
|
249
|
+
results: run.analyzeStepResult.results,
|
|
250
250
|
scale: options.scale
|
|
251
251
|
});
|
|
252
252
|
}
|
|
253
253
|
},
|
|
254
254
|
calculateScore: ({ run }) => {
|
|
255
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
255
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
256
256
|
return 0;
|
|
257
257
|
}
|
|
258
|
-
const numberOfResults = run.analyzeStepResult.length;
|
|
258
|
+
const numberOfResults = run.analyzeStepResult.results.length;
|
|
259
259
|
let relevancyCount = 0;
|
|
260
|
-
for (const { result } of run.analyzeStepResult) {
|
|
260
|
+
for (const { result } of run.analyzeStepResult.results) {
|
|
261
261
|
if (result.trim().toLowerCase() === "yes") {
|
|
262
262
|
relevancyCount++;
|
|
263
263
|
} else if (result.trim().toLowerCase() === "unsure") {
|
|
@@ -455,7 +455,7 @@ function createFaithfulnessScorer({
|
|
|
455
455
|
},
|
|
456
456
|
analyze: {
|
|
457
457
|
description: "Score the relevance of the statements to the input",
|
|
458
|
-
outputSchema: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })),
|
|
458
|
+
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
459
459
|
createPrompt: ({ run }) => {
|
|
460
460
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
461
461
|
claims: run.extractStepResult || [],
|
|
@@ -465,8 +465,8 @@ function createFaithfulnessScorer({
|
|
|
465
465
|
}
|
|
466
466
|
},
|
|
467
467
|
calculateScore: ({ run }) => {
|
|
468
|
-
const totalClaims = run.analyzeStepResult.length;
|
|
469
|
-
const supportedClaims = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
468
|
+
const totalClaims = run.analyzeStepResult.verdicts.length;
|
|
469
|
+
const supportedClaims = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
470
470
|
if (totalClaims === 0) {
|
|
471
471
|
return 0;
|
|
472
472
|
}
|
|
@@ -482,7 +482,7 @@ function createFaithfulnessScorer({
|
|
|
482
482
|
context: options?.context || [],
|
|
483
483
|
score: run.score,
|
|
484
484
|
scale: options?.scale || 1,
|
|
485
|
-
verdicts: run.analyzeStepResult || []
|
|
485
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
486
486
|
});
|
|
487
487
|
return prompt;
|
|
488
488
|
}
|
|
@@ -617,7 +617,7 @@ function createBiasScorer({ model, options }) {
|
|
|
617
617
|
},
|
|
618
618
|
analyze: {
|
|
619
619
|
description: "Score the relevance of the statements to the input",
|
|
620
|
-
outputSchema: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })),
|
|
620
|
+
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
621
621
|
createPrompt: ({ run }) => {
|
|
622
622
|
const prompt = createBiasAnalyzePrompt({
|
|
623
623
|
output: run.output.text,
|
|
@@ -627,17 +627,20 @@ function createBiasScorer({ model, options }) {
|
|
|
627
627
|
}
|
|
628
628
|
},
|
|
629
629
|
calculateScore: ({ run }) => {
|
|
630
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
630
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
631
631
|
return 0;
|
|
632
632
|
}
|
|
633
|
-
const biasedVerdicts = run.analyzeStepResult.filter((v) => v.result.toLowerCase() === "yes");
|
|
634
|
-
const score = biasedVerdicts.length / run.analyzeStepResult.length;
|
|
633
|
+
const biasedVerdicts = run.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
634
|
+
const score = biasedVerdicts.length / run.analyzeStepResult.results.length;
|
|
635
635
|
return roundToTwoDecimals2(score * (options?.scale || 1));
|
|
636
636
|
},
|
|
637
637
|
reason: {
|
|
638
638
|
description: "Reason about the results",
|
|
639
639
|
createPrompt: ({ run }) => {
|
|
640
|
-
return createBiasReasonPrompt({
|
|
640
|
+
return createBiasReasonPrompt({
|
|
641
|
+
score: run.score,
|
|
642
|
+
biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
|
|
643
|
+
});
|
|
641
644
|
}
|
|
642
645
|
}
|
|
643
646
|
});
|
|
@@ -858,7 +861,9 @@ function createHallucinationScorer({
|
|
|
858
861
|
},
|
|
859
862
|
analyze: {
|
|
860
863
|
description: "Score the relevance of the statements to the input",
|
|
861
|
-
outputSchema: zod.z.
|
|
864
|
+
outputSchema: zod.z.object({
|
|
865
|
+
verdicts: zod.z.array(zod.z.object({ statement: zod.z.string(), verdict: zod.z.string(), reason: zod.z.string() }))
|
|
866
|
+
}),
|
|
862
867
|
createPrompt: ({ run }) => {
|
|
863
868
|
const prompt = createHallucinationAnalyzePrompt({
|
|
864
869
|
claims: run.extractStepResult.claims,
|
|
@@ -868,8 +873,8 @@ function createHallucinationScorer({
|
|
|
868
873
|
}
|
|
869
874
|
},
|
|
870
875
|
calculateScore: ({ run }) => {
|
|
871
|
-
const totalStatements = run.analyzeStepResult.length;
|
|
872
|
-
const contradictedStatements = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
876
|
+
const totalStatements = run.analyzeStepResult.verdicts.length;
|
|
877
|
+
const contradictedStatements = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
873
878
|
if (totalStatements === 0) {
|
|
874
879
|
return 0;
|
|
875
880
|
}
|
|
@@ -885,7 +890,7 @@ function createHallucinationScorer({
|
|
|
885
890
|
context: run?.additionalContext?.context || [],
|
|
886
891
|
score: run.score,
|
|
887
892
|
scale: options?.scale || 1,
|
|
888
|
-
verdicts: run.analyzeStepResult || []
|
|
893
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
889
894
|
});
|
|
890
895
|
return prompt;
|
|
891
896
|
}
|
|
@@ -991,7 +996,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
991
996
|
},
|
|
992
997
|
analyze: {
|
|
993
998
|
description: "Score the relevance of the statements to the input",
|
|
994
|
-
outputSchema: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })),
|
|
999
|
+
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
995
1000
|
createPrompt: ({ run }) => {
|
|
996
1001
|
const prompt = createToxicityAnalyzePrompt({
|
|
997
1002
|
input: run.input.map((input) => input.content).join(", "),
|
|
@@ -1001,12 +1006,12 @@ function createToxicityScorer({ model, options }) {
|
|
|
1001
1006
|
}
|
|
1002
1007
|
},
|
|
1003
1008
|
calculateScore: ({ run }) => {
|
|
1004
|
-
const numberOfVerdicts = run.analyzeStepResult?.length || 0;
|
|
1009
|
+
const numberOfVerdicts = run.analyzeStepResult?.verdicts.length || 0;
|
|
1005
1010
|
if (numberOfVerdicts === 0) {
|
|
1006
1011
|
return 1;
|
|
1007
1012
|
}
|
|
1008
1013
|
let toxicityCount = 0;
|
|
1009
|
-
for (const { verdict } of run.analyzeStepResult) {
|
|
1014
|
+
for (const { verdict } of run.analyzeStepResult.verdicts) {
|
|
1010
1015
|
if (verdict.trim().toLowerCase() === "yes") {
|
|
1011
1016
|
toxicityCount++;
|
|
1012
1017
|
}
|
|
@@ -1019,7 +1024,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
1019
1024
|
createPrompt: ({ run }) => {
|
|
1020
1025
|
const prompt = createToxicityReasonPrompt({
|
|
1021
1026
|
score: run.score,
|
|
1022
|
-
toxics: run.analyzeStepResult?.map((v) => v.reason) || []
|
|
1027
|
+
toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
|
|
1023
1028
|
});
|
|
1024
1029
|
return prompt;
|
|
1025
1030
|
}
|
|
@@ -234,7 +234,7 @@ function createAnswerRelevancyScorer({
|
|
|
234
234
|
},
|
|
235
235
|
analyze: {
|
|
236
236
|
description: "Score the relevance of the statements to the input",
|
|
237
|
-
outputSchema: z.array(z.object({ result: z.string(), reason: z.string() })),
|
|
237
|
+
outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
|
|
238
238
|
createPrompt: ({ run }) => createScorePrompt(JSON.stringify(run.input), run.extractStepResult?.statements || [])
|
|
239
239
|
},
|
|
240
240
|
reason: {
|
|
@@ -244,18 +244,18 @@ function createAnswerRelevancyScorer({
|
|
|
244
244
|
input: run.input.map((input) => input.content).join(", "),
|
|
245
245
|
output: run.output.text,
|
|
246
246
|
score: run.score,
|
|
247
|
-
results: run.analyzeStepResult,
|
|
247
|
+
results: run.analyzeStepResult.results,
|
|
248
248
|
scale: options.scale
|
|
249
249
|
});
|
|
250
250
|
}
|
|
251
251
|
},
|
|
252
252
|
calculateScore: ({ run }) => {
|
|
253
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
253
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
254
254
|
return 0;
|
|
255
255
|
}
|
|
256
|
-
const numberOfResults = run.analyzeStepResult.length;
|
|
256
|
+
const numberOfResults = run.analyzeStepResult.results.length;
|
|
257
257
|
let relevancyCount = 0;
|
|
258
|
-
for (const { result } of run.analyzeStepResult) {
|
|
258
|
+
for (const { result } of run.analyzeStepResult.results) {
|
|
259
259
|
if (result.trim().toLowerCase() === "yes") {
|
|
260
260
|
relevancyCount++;
|
|
261
261
|
} else if (result.trim().toLowerCase() === "unsure") {
|
|
@@ -453,7 +453,7 @@ function createFaithfulnessScorer({
|
|
|
453
453
|
},
|
|
454
454
|
analyze: {
|
|
455
455
|
description: "Score the relevance of the statements to the input",
|
|
456
|
-
outputSchema: z.array(z.object({ verdict: z.string(), reason: z.string() })),
|
|
456
|
+
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
457
457
|
createPrompt: ({ run }) => {
|
|
458
458
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
459
459
|
claims: run.extractStepResult || [],
|
|
@@ -463,8 +463,8 @@ function createFaithfulnessScorer({
|
|
|
463
463
|
}
|
|
464
464
|
},
|
|
465
465
|
calculateScore: ({ run }) => {
|
|
466
|
-
const totalClaims = run.analyzeStepResult.length;
|
|
467
|
-
const supportedClaims = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
466
|
+
const totalClaims = run.analyzeStepResult.verdicts.length;
|
|
467
|
+
const supportedClaims = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
468
468
|
if (totalClaims === 0) {
|
|
469
469
|
return 0;
|
|
470
470
|
}
|
|
@@ -480,7 +480,7 @@ function createFaithfulnessScorer({
|
|
|
480
480
|
context: options?.context || [],
|
|
481
481
|
score: run.score,
|
|
482
482
|
scale: options?.scale || 1,
|
|
483
|
-
verdicts: run.analyzeStepResult || []
|
|
483
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
484
484
|
});
|
|
485
485
|
return prompt;
|
|
486
486
|
}
|
|
@@ -615,7 +615,7 @@ function createBiasScorer({ model, options }) {
|
|
|
615
615
|
},
|
|
616
616
|
analyze: {
|
|
617
617
|
description: "Score the relevance of the statements to the input",
|
|
618
|
-
outputSchema: z.array(z.object({ result: z.string(), reason: z.string() })),
|
|
618
|
+
outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
|
|
619
619
|
createPrompt: ({ run }) => {
|
|
620
620
|
const prompt = createBiasAnalyzePrompt({
|
|
621
621
|
output: run.output.text,
|
|
@@ -625,17 +625,20 @@ function createBiasScorer({ model, options }) {
|
|
|
625
625
|
}
|
|
626
626
|
},
|
|
627
627
|
calculateScore: ({ run }) => {
|
|
628
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
628
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
629
629
|
return 0;
|
|
630
630
|
}
|
|
631
|
-
const biasedVerdicts = run.analyzeStepResult.filter((v) => v.result.toLowerCase() === "yes");
|
|
632
|
-
const score = biasedVerdicts.length / run.analyzeStepResult.length;
|
|
631
|
+
const biasedVerdicts = run.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
632
|
+
const score = biasedVerdicts.length / run.analyzeStepResult.results.length;
|
|
633
633
|
return roundToTwoDecimals2(score * (options?.scale || 1));
|
|
634
634
|
},
|
|
635
635
|
reason: {
|
|
636
636
|
description: "Reason about the results",
|
|
637
637
|
createPrompt: ({ run }) => {
|
|
638
|
-
return createBiasReasonPrompt({
|
|
638
|
+
return createBiasReasonPrompt({
|
|
639
|
+
score: run.score,
|
|
640
|
+
biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
|
|
641
|
+
});
|
|
639
642
|
}
|
|
640
643
|
}
|
|
641
644
|
});
|
|
@@ -856,7 +859,9 @@ function createHallucinationScorer({
|
|
|
856
859
|
},
|
|
857
860
|
analyze: {
|
|
858
861
|
description: "Score the relevance of the statements to the input",
|
|
859
|
-
outputSchema: z.
|
|
862
|
+
outputSchema: z.object({
|
|
863
|
+
verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
|
|
864
|
+
}),
|
|
860
865
|
createPrompt: ({ run }) => {
|
|
861
866
|
const prompt = createHallucinationAnalyzePrompt({
|
|
862
867
|
claims: run.extractStepResult.claims,
|
|
@@ -866,8 +871,8 @@ function createHallucinationScorer({
|
|
|
866
871
|
}
|
|
867
872
|
},
|
|
868
873
|
calculateScore: ({ run }) => {
|
|
869
|
-
const totalStatements = run.analyzeStepResult.length;
|
|
870
|
-
const contradictedStatements = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
874
|
+
const totalStatements = run.analyzeStepResult.verdicts.length;
|
|
875
|
+
const contradictedStatements = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
871
876
|
if (totalStatements === 0) {
|
|
872
877
|
return 0;
|
|
873
878
|
}
|
|
@@ -883,7 +888,7 @@ function createHallucinationScorer({
|
|
|
883
888
|
context: run?.additionalContext?.context || [],
|
|
884
889
|
score: run.score,
|
|
885
890
|
scale: options?.scale || 1,
|
|
886
|
-
verdicts: run.analyzeStepResult || []
|
|
891
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
887
892
|
});
|
|
888
893
|
return prompt;
|
|
889
894
|
}
|
|
@@ -989,7 +994,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
989
994
|
},
|
|
990
995
|
analyze: {
|
|
991
996
|
description: "Score the relevance of the statements to the input",
|
|
992
|
-
outputSchema: z.array(z.object({ verdict: z.string(), reason: z.string() })),
|
|
997
|
+
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
993
998
|
createPrompt: ({ run }) => {
|
|
994
999
|
const prompt = createToxicityAnalyzePrompt({
|
|
995
1000
|
input: run.input.map((input) => input.content).join(", "),
|
|
@@ -999,12 +1004,12 @@ function createToxicityScorer({ model, options }) {
|
|
|
999
1004
|
}
|
|
1000
1005
|
},
|
|
1001
1006
|
calculateScore: ({ run }) => {
|
|
1002
|
-
const numberOfVerdicts = run.analyzeStepResult?.length || 0;
|
|
1007
|
+
const numberOfVerdicts = run.analyzeStepResult?.verdicts.length || 0;
|
|
1003
1008
|
if (numberOfVerdicts === 0) {
|
|
1004
1009
|
return 1;
|
|
1005
1010
|
}
|
|
1006
1011
|
let toxicityCount = 0;
|
|
1007
|
-
for (const { verdict } of run.analyzeStepResult) {
|
|
1012
|
+
for (const { verdict } of run.analyzeStepResult.verdicts) {
|
|
1008
1013
|
if (verdict.trim().toLowerCase() === "yes") {
|
|
1009
1014
|
toxicityCount++;
|
|
1010
1015
|
}
|
|
@@ -1017,7 +1022,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
1017
1022
|
createPrompt: ({ run }) => {
|
|
1018
1023
|
const prompt = createToxicityReasonPrompt({
|
|
1019
1024
|
score: run.score,
|
|
1020
|
-
toxics: run.analyzeStepResult?.map((v) => v.reason) || []
|
|
1025
|
+
toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
|
|
1021
1026
|
});
|
|
1022
1027
|
return prompt;
|
|
1023
1028
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.10.6",
|
|
3
|
+
"version": "0.10.8-alpha.0",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -84,7 +84,7 @@
|
|
|
84
84
|
"zod": "^3.25.67"
|
|
85
85
|
},
|
|
86
86
|
"peerDependencies": {
|
|
87
|
-
"@mastra/core": "
|
|
87
|
+
"@mastra/core": ">=0.11.0-0 <0.12.0-0",
|
|
88
88
|
"ai": "^4.0.0"
|
|
89
89
|
},
|
|
90
90
|
"devDependencies": {
|
|
@@ -100,8 +100,8 @@
|
|
|
100
100
|
"tsup": "^8.5.0",
|
|
101
101
|
"typescript": "^5.8.3",
|
|
102
102
|
"vitest": "^3.2.4",
|
|
103
|
-
"@internal/lint": "0.0.
|
|
104
|
-
"@mastra/core": "0.
|
|
103
|
+
"@internal/lint": "0.0.23",
|
|
104
|
+
"@mastra/core": "0.12.0-alpha.0"
|
|
105
105
|
},
|
|
106
106
|
"scripts": {
|
|
107
107
|
"check": "tsc --noEmit",
|