@mastra/evals 0.10.7 → 0.10.8-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_tsup-dts-rollup.d.cts +4 -1
- package/dist/_tsup-dts-rollup.d.ts +4 -1
- package/dist/{dist-IVAARSAW.cjs → dist-JD6MNRVB.cjs} +8 -8
- package/dist/{dist-5JXLPLM2.js → dist-ZXFGMR47.js} +8 -8
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/{magic-string.es-LD4FLE5J.js → magic-string.es-MNZ6ZGOL.js} +1 -1
- package/dist/{magic-string.es-66FD77JZ.cjs → magic-string.es-T2QO2IBJ.cjs} +1 -1
- package/dist/scorers/code/index.cjs +44 -4
- package/dist/scorers/code/index.d.cts +1 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.js +43 -5
- package/dist/scorers/llm/index.cjs +31 -26
- package/dist/scorers/llm/index.js +31 -26
- package/package.json +3 -3
|
@@ -405,7 +405,10 @@ export { createTextualDifferenceScorer }
|
|
|
405
405
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_1 }
|
|
406
406
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_2 }
|
|
407
407
|
|
|
408
|
-
|
|
408
|
+
declare function createToneScorer(): MastraScorer;
|
|
409
|
+
export { createToneScorer }
|
|
410
|
+
export { createToneScorer as createToneScorer_alias_1 }
|
|
411
|
+
export { createToneScorer as createToneScorer_alias_2 }
|
|
409
412
|
|
|
410
413
|
export declare function createToxicityAnalyzePrompt({ input, output }: {
|
|
411
414
|
input: string;
|
|
@@ -405,7 +405,10 @@ export { createTextualDifferenceScorer }
|
|
|
405
405
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_1 }
|
|
406
406
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_2 }
|
|
407
407
|
|
|
408
|
-
|
|
408
|
+
declare function createToneScorer(): MastraScorer;
|
|
409
|
+
export { createToneScorer }
|
|
410
|
+
export { createToneScorer as createToneScorer_alias_1 }
|
|
411
|
+
export { createToneScorer as createToneScorer_alias_2 }
|
|
409
412
|
|
|
410
413
|
export declare function createToxicityAnalyzePrompt({ input, output }: {
|
|
411
414
|
input: string;
|
|
@@ -11988,7 +11988,7 @@ function createTestHook(name, handler) {
|
|
|
11988
11988
|
};
|
|
11989
11989
|
}
|
|
11990
11990
|
|
|
11991
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
11991
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/utils.XdZDrNZV.js
|
|
11992
11992
|
var NAME_WORKER_STATE = "__vitest_worker__";
|
|
11993
11993
|
function getWorkerState() {
|
|
11994
11994
|
const workerState = globalThis[NAME_WORKER_STATE];
|
|
@@ -12036,7 +12036,7 @@ async function waitForImportsToResolve() {
|
|
|
12036
12036
|
await waitForImportsToResolve();
|
|
12037
12037
|
}
|
|
12038
12038
|
|
|
12039
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
12039
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
|
|
12040
12040
|
var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
|
|
12041
12041
|
function getDefaultExportFromCjs3(x) {
|
|
12042
12042
|
return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, "default") ? x["default"] : x;
|
|
@@ -12889,7 +12889,7 @@ function offsetToLineNumber(source, offset) {
|
|
|
12889
12889
|
return line + 1;
|
|
12890
12890
|
}
|
|
12891
12891
|
async function saveInlineSnapshots(environment, snapshots) {
|
|
12892
|
-
const MagicString = (await import('./magic-string.es-
|
|
12892
|
+
const MagicString = (await import('./magic-string.es-T2QO2IBJ.cjs')).default;
|
|
12893
12893
|
const files = new Set(snapshots.map((i) => i.file));
|
|
12894
12894
|
await Promise.all(Array.from(files).map(async (file) => {
|
|
12895
12895
|
const snaps = snapshots.filter((i) => i.file === file);
|
|
@@ -13666,7 +13666,7 @@ var SnapshotClient = class {
|
|
|
13666
13666
|
}
|
|
13667
13667
|
};
|
|
13668
13668
|
|
|
13669
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13669
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/date.Bq6ZW5rf.js
|
|
13670
13670
|
var RealDate = Date;
|
|
13671
13671
|
var now2 = null;
|
|
13672
13672
|
var MockDate = class _MockDate extends RealDate {
|
|
@@ -13714,7 +13714,7 @@ function resetDate() {
|
|
|
13714
13714
|
globalThis.Date = RealDate;
|
|
13715
13715
|
}
|
|
13716
13716
|
|
|
13717
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13717
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/vi.bdSIJ99Y.js
|
|
13718
13718
|
var unsupported = [
|
|
13719
13719
|
"matchSnapshot",
|
|
13720
13720
|
"toMatchSnapshot",
|
|
@@ -16400,7 +16400,7 @@ function getImporter(name) {
|
|
|
16400
16400
|
return stack?.file || "";
|
|
16401
16401
|
}
|
|
16402
16402
|
|
|
16403
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16403
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/benchmark.CYdenmiT.js
|
|
16404
16404
|
var benchFns = /* @__PURE__ */ new WeakMap();
|
|
16405
16405
|
var benchOptsMap = /* @__PURE__ */ new WeakMap();
|
|
16406
16406
|
var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
|
|
@@ -16426,12 +16426,12 @@ function formatName2(name) {
|
|
|
16426
16426
|
return typeof name === "string" ? name : typeof name === "function" ? name.name || "<anonymous>" : String(name);
|
|
16427
16427
|
}
|
|
16428
16428
|
|
|
16429
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16429
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/index.CdQS2e2Q.js
|
|
16430
16430
|
chunkIS3BZTWE_cjs.__toESM(require_dist(), 1);
|
|
16431
16431
|
var assertType = function assertType2() {
|
|
16432
16432
|
};
|
|
16433
16433
|
|
|
16434
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16434
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/index.js
|
|
16435
16435
|
var import_expect_type2 = chunkIS3BZTWE_cjs.__toESM(require_dist(), 1);
|
|
16436
16436
|
var export_expectTypeOf = import_expect_type2.expectTypeOf;
|
|
16437
16437
|
/*! Bundled license information:
|
|
@@ -11986,7 +11986,7 @@ function createTestHook(name, handler) {
|
|
|
11986
11986
|
};
|
|
11987
11987
|
}
|
|
11988
11988
|
|
|
11989
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
11989
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/utils.XdZDrNZV.js
|
|
11990
11990
|
var NAME_WORKER_STATE = "__vitest_worker__";
|
|
11991
11991
|
function getWorkerState() {
|
|
11992
11992
|
const workerState = globalThis[NAME_WORKER_STATE];
|
|
@@ -12034,7 +12034,7 @@ async function waitForImportsToResolve() {
|
|
|
12034
12034
|
await waitForImportsToResolve();
|
|
12035
12035
|
}
|
|
12036
12036
|
|
|
12037
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
12037
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
|
|
12038
12038
|
var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
|
|
12039
12039
|
function getDefaultExportFromCjs3(x) {
|
|
12040
12040
|
return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, "default") ? x["default"] : x;
|
|
@@ -12887,7 +12887,7 @@ function offsetToLineNumber(source, offset) {
|
|
|
12887
12887
|
return line + 1;
|
|
12888
12888
|
}
|
|
12889
12889
|
async function saveInlineSnapshots(environment, snapshots) {
|
|
12890
|
-
const MagicString = (await import('./magic-string.es-
|
|
12890
|
+
const MagicString = (await import('./magic-string.es-MNZ6ZGOL.js')).default;
|
|
12891
12891
|
const files = new Set(snapshots.map((i) => i.file));
|
|
12892
12892
|
await Promise.all(Array.from(files).map(async (file) => {
|
|
12893
12893
|
const snaps = snapshots.filter((i) => i.file === file);
|
|
@@ -13664,7 +13664,7 @@ var SnapshotClient = class {
|
|
|
13664
13664
|
}
|
|
13665
13665
|
};
|
|
13666
13666
|
|
|
13667
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13667
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/date.Bq6ZW5rf.js
|
|
13668
13668
|
var RealDate = Date;
|
|
13669
13669
|
var now2 = null;
|
|
13670
13670
|
var MockDate = class _MockDate extends RealDate {
|
|
@@ -13712,7 +13712,7 @@ function resetDate() {
|
|
|
13712
13712
|
globalThis.Date = RealDate;
|
|
13713
13713
|
}
|
|
13714
13714
|
|
|
13715
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
13715
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/vi.bdSIJ99Y.js
|
|
13716
13716
|
var unsupported = [
|
|
13717
13717
|
"matchSnapshot",
|
|
13718
13718
|
"toMatchSnapshot",
|
|
@@ -16398,7 +16398,7 @@ function getImporter(name) {
|
|
|
16398
16398
|
return stack?.file || "";
|
|
16399
16399
|
}
|
|
16400
16400
|
|
|
16401
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16401
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/benchmark.CYdenmiT.js
|
|
16402
16402
|
var benchFns = /* @__PURE__ */ new WeakMap();
|
|
16403
16403
|
var benchOptsMap = /* @__PURE__ */ new WeakMap();
|
|
16404
16404
|
var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
|
|
@@ -16424,12 +16424,12 @@ function formatName2(name) {
|
|
|
16424
16424
|
return typeof name === "string" ? name : typeof name === "function" ? name.name || "<anonymous>" : String(name);
|
|
16425
16425
|
}
|
|
16426
16426
|
|
|
16427
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16427
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/chunks/index.CdQS2e2Q.js
|
|
16428
16428
|
__toESM(require_dist(), 1);
|
|
16429
16429
|
var assertType = function assertType2() {
|
|
16430
16430
|
};
|
|
16431
16431
|
|
|
16432
|
-
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.
|
|
16432
|
+
// ../../node_modules/.pnpm/vitest@3.2.4_@types+debug@4.1.12_@types+node@20.19.9_@vitest+ui@3.2.3_jiti@2.4.2_jsdom@_0090e69ea15e68f4eaa34b37eb448faf/node_modules/vitest/dist/index.js
|
|
16433
16433
|
var import_expect_type2 = __toESM(require_dist(), 1);
|
|
16434
16434
|
var export_expectTypeOf = import_expect_type2.expectTypeOf;
|
|
16435
16435
|
/*! Bundled license information:
|
package/dist/index.cjs
CHANGED
|
@@ -41,7 +41,7 @@ var getCurrentTestInfo = async () => {
|
|
|
41
41
|
};
|
|
42
42
|
}
|
|
43
43
|
try {
|
|
44
|
-
const vitest = await import('./dist-
|
|
44
|
+
const vitest = await import('./dist-JD6MNRVB.cjs');
|
|
45
45
|
if (typeof vitest !== "undefined" && vitest.expect?.getState) {
|
|
46
46
|
const state = vitest.expect.getState();
|
|
47
47
|
return {
|
package/dist/index.js
CHANGED
|
@@ -39,7 +39,7 @@ var getCurrentTestInfo = async () => {
|
|
|
39
39
|
};
|
|
40
40
|
}
|
|
41
41
|
try {
|
|
42
|
-
const vitest = await import('./dist-
|
|
42
|
+
const vitest = await import('./dist-ZXFGMR47.js');
|
|
43
43
|
if (typeof vitest !== "undefined" && vitest.expect?.getState) {
|
|
44
44
|
const state = vitest.expect.getState();
|
|
45
45
|
return {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.
|
|
1
|
+
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.4/node_modules/@jridgewell/sourcemap-codec/dist/sourcemap-codec.mjs
|
|
2
2
|
var comma = ",".charCodeAt(0);
|
|
3
3
|
var semicolon = ";".charCodeAt(0);
|
|
4
4
|
var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
4
|
|
|
5
|
-
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.
|
|
5
|
+
// ../../node_modules/.pnpm/@jridgewell+sourcemap-codec@1.5.4/node_modules/@jridgewell/sourcemap-codec/dist/sourcemap-codec.mjs
|
|
6
6
|
var comma = ",".charCodeAt(0);
|
|
7
7
|
var semicolon = ";".charCodeAt(0);
|
|
8
8
|
var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
@@ -5,12 +5,14 @@ var nlp = require('compromise');
|
|
|
5
5
|
var difflib = require('difflib');
|
|
6
6
|
var keyword_extractor = require('keyword-extractor');
|
|
7
7
|
var stringSimilarity = require('string-similarity');
|
|
8
|
+
var Sentiment = require('sentiment');
|
|
8
9
|
|
|
9
10
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
10
11
|
|
|
11
12
|
var nlp__default = /*#__PURE__*/_interopDefault(nlp);
|
|
12
13
|
var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
|
|
13
14
|
var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
|
|
15
|
+
var Sentiment__default = /*#__PURE__*/_interopDefault(Sentiment);
|
|
14
16
|
|
|
15
17
|
function normalizeString(str) {
|
|
16
18
|
return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
|
|
@@ -63,7 +65,7 @@ function createCompletenessScorer() {
|
|
|
63
65
|
if (isInputInvalid || isOutputInvalid) {
|
|
64
66
|
throw new Error("Inputs cannot be null or undefined");
|
|
65
67
|
}
|
|
66
|
-
const input = run.input
|
|
68
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
67
69
|
const output = run.output.text;
|
|
68
70
|
const inputToProcess = input;
|
|
69
71
|
const outputToProcess = output;
|
|
@@ -100,7 +102,7 @@ function createTextualDifferenceScorer() {
|
|
|
100
102
|
name: "Completeness",
|
|
101
103
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
102
104
|
analyze: async (run) => {
|
|
103
|
-
const input = run.input
|
|
105
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
104
106
|
const output = run.output.text;
|
|
105
107
|
const matcher = new difflib.SequenceMatcher(null, input, output);
|
|
106
108
|
const ratio = matcher.ratio();
|
|
@@ -125,7 +127,7 @@ function createKeywordCoverageScorer() {
|
|
|
125
127
|
name: "Completeness",
|
|
126
128
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
127
129
|
extract: async (run) => {
|
|
128
|
-
const input = run.input
|
|
130
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
129
131
|
const output = run.output.text;
|
|
130
132
|
if (!input && !output) {
|
|
131
133
|
return {
|
|
@@ -182,7 +184,7 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
182
184
|
name: "Completeness",
|
|
183
185
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
184
186
|
extract: async (run) => {
|
|
185
|
-
let processedInput = run.input
|
|
187
|
+
let processedInput = run.input?.map((i) => i.content).join(", ") || "";
|
|
186
188
|
let processedOutput = run.output.text;
|
|
187
189
|
if (ignoreCase) {
|
|
188
190
|
processedInput = processedInput.toLowerCase();
|
|
@@ -213,8 +215,46 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
213
215
|
}
|
|
214
216
|
});
|
|
215
217
|
}
|
|
218
|
+
function createToneScorer() {
|
|
219
|
+
return scores.createScorer({
|
|
220
|
+
name: "Completeness",
|
|
221
|
+
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
222
|
+
analyze: async (run) => {
|
|
223
|
+
const sentiment = new Sentiment__default.default();
|
|
224
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
225
|
+
const output = run.output.text;
|
|
226
|
+
const responseSentiment = sentiment.analyze(input);
|
|
227
|
+
if (output) {
|
|
228
|
+
const referenceSentiment = sentiment.analyze(output);
|
|
229
|
+
const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
|
|
230
|
+
const normalizedScore = Math.max(0, 1 - sentimentDiff);
|
|
231
|
+
return {
|
|
232
|
+
score: normalizedScore,
|
|
233
|
+
result: {
|
|
234
|
+
responseSentiment: responseSentiment.comparative,
|
|
235
|
+
referenceSentiment: referenceSentiment.comparative,
|
|
236
|
+
difference: sentimentDiff
|
|
237
|
+
}
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
const sentences = input.match(/[^.!?]+[.!?]+/g) || [input];
|
|
241
|
+
const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
|
|
242
|
+
const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
|
|
243
|
+
const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
|
|
244
|
+
const stability = Math.max(0, 1 - variance);
|
|
245
|
+
return {
|
|
246
|
+
score: stability,
|
|
247
|
+
result: {
|
|
248
|
+
avgSentiment,
|
|
249
|
+
sentimentVariance: variance
|
|
250
|
+
}
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
});
|
|
254
|
+
}
|
|
216
255
|
|
|
217
256
|
exports.createCompletenessScorer = createCompletenessScorer;
|
|
218
257
|
exports.createContentSimilarityScorer = createContentSimilarityScorer;
|
|
219
258
|
exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
|
|
220
259
|
exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
|
|
260
|
+
exports.createToneScorer = createToneScorer;
|
|
@@ -2,3 +2,4 @@ export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '..
|
|
|
2
2
|
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.cjs';
|
|
3
3
|
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.cjs';
|
|
4
4
|
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.cjs';
|
|
5
|
+
export { createToneScorer_alias_1 as createToneScorer } from '../../_tsup-dts-rollup.cjs';
|
|
@@ -2,3 +2,4 @@ export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '..
|
|
|
2
2
|
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.js';
|
|
3
3
|
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.js';
|
|
4
4
|
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.js';
|
|
5
|
+
export { createToneScorer_alias_1 as createToneScorer } from '../../_tsup-dts-rollup.js';
|
|
@@ -3,6 +3,7 @@ import nlp from 'compromise';
|
|
|
3
3
|
import { SequenceMatcher } from 'difflib';
|
|
4
4
|
import keyword_extractor from 'keyword-extractor';
|
|
5
5
|
import stringSimilarity from 'string-similarity';
|
|
6
|
+
import Sentiment from 'sentiment';
|
|
6
7
|
|
|
7
8
|
function normalizeString(str) {
|
|
8
9
|
return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
|
|
@@ -55,7 +56,7 @@ function createCompletenessScorer() {
|
|
|
55
56
|
if (isInputInvalid || isOutputInvalid) {
|
|
56
57
|
throw new Error("Inputs cannot be null or undefined");
|
|
57
58
|
}
|
|
58
|
-
const input = run.input
|
|
59
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
59
60
|
const output = run.output.text;
|
|
60
61
|
const inputToProcess = input;
|
|
61
62
|
const outputToProcess = output;
|
|
@@ -92,7 +93,7 @@ function createTextualDifferenceScorer() {
|
|
|
92
93
|
name: "Completeness",
|
|
93
94
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
94
95
|
analyze: async (run) => {
|
|
95
|
-
const input = run.input
|
|
96
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
96
97
|
const output = run.output.text;
|
|
97
98
|
const matcher = new SequenceMatcher(null, input, output);
|
|
98
99
|
const ratio = matcher.ratio();
|
|
@@ -117,7 +118,7 @@ function createKeywordCoverageScorer() {
|
|
|
117
118
|
name: "Completeness",
|
|
118
119
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
119
120
|
extract: async (run) => {
|
|
120
|
-
const input = run.input
|
|
121
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
121
122
|
const output = run.output.text;
|
|
122
123
|
if (!input && !output) {
|
|
123
124
|
return {
|
|
@@ -174,7 +175,7 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
174
175
|
name: "Completeness",
|
|
175
176
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
176
177
|
extract: async (run) => {
|
|
177
|
-
let processedInput = run.input
|
|
178
|
+
let processedInput = run.input?.map((i) => i.content).join(", ") || "";
|
|
178
179
|
let processedOutput = run.output.text;
|
|
179
180
|
if (ignoreCase) {
|
|
180
181
|
processedInput = processedInput.toLowerCase();
|
|
@@ -205,5 +206,42 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
205
206
|
}
|
|
206
207
|
});
|
|
207
208
|
}
|
|
209
|
+
function createToneScorer() {
|
|
210
|
+
return createScorer({
|
|
211
|
+
name: "Completeness",
|
|
212
|
+
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
213
|
+
analyze: async (run) => {
|
|
214
|
+
const sentiment = new Sentiment();
|
|
215
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
216
|
+
const output = run.output.text;
|
|
217
|
+
const responseSentiment = sentiment.analyze(input);
|
|
218
|
+
if (output) {
|
|
219
|
+
const referenceSentiment = sentiment.analyze(output);
|
|
220
|
+
const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
|
|
221
|
+
const normalizedScore = Math.max(0, 1 - sentimentDiff);
|
|
222
|
+
return {
|
|
223
|
+
score: normalizedScore,
|
|
224
|
+
result: {
|
|
225
|
+
responseSentiment: responseSentiment.comparative,
|
|
226
|
+
referenceSentiment: referenceSentiment.comparative,
|
|
227
|
+
difference: sentimentDiff
|
|
228
|
+
}
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
const sentences = input.match(/[^.!?]+[.!?]+/g) || [input];
|
|
232
|
+
const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
|
|
233
|
+
const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
|
|
234
|
+
const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
|
|
235
|
+
const stability = Math.max(0, 1 - variance);
|
|
236
|
+
return {
|
|
237
|
+
score: stability,
|
|
238
|
+
result: {
|
|
239
|
+
avgSentiment,
|
|
240
|
+
sentimentVariance: variance
|
|
241
|
+
}
|
|
242
|
+
};
|
|
243
|
+
}
|
|
244
|
+
});
|
|
245
|
+
}
|
|
208
246
|
|
|
209
|
-
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer };
|
|
247
|
+
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer, createToneScorer };
|
|
@@ -236,28 +236,28 @@ function createAnswerRelevancyScorer({
|
|
|
236
236
|
},
|
|
237
237
|
analyze: {
|
|
238
238
|
description: "Score the relevance of the statements to the input",
|
|
239
|
-
outputSchema: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })),
|
|
239
|
+
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
240
240
|
createPrompt: ({ run }) => createScorePrompt(JSON.stringify(run.input), run.extractStepResult?.statements || [])
|
|
241
241
|
},
|
|
242
242
|
reason: {
|
|
243
243
|
description: "Reason about the results",
|
|
244
244
|
createPrompt: ({ run }) => {
|
|
245
245
|
return createReasonPrompt({
|
|
246
|
-
input: run.input
|
|
246
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
247
247
|
output: run.output.text,
|
|
248
248
|
score: run.score,
|
|
249
|
-
results: run.analyzeStepResult,
|
|
249
|
+
results: run.analyzeStepResult.results,
|
|
250
250
|
scale: options.scale
|
|
251
251
|
});
|
|
252
252
|
}
|
|
253
253
|
},
|
|
254
254
|
calculateScore: ({ run }) => {
|
|
255
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
255
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
256
256
|
return 0;
|
|
257
257
|
}
|
|
258
|
-
const numberOfResults = run.analyzeStepResult.length;
|
|
258
|
+
const numberOfResults = run.analyzeStepResult.results.length;
|
|
259
259
|
let relevancyCount = 0;
|
|
260
|
-
for (const { result } of run.analyzeStepResult) {
|
|
260
|
+
for (const { result } of run.analyzeStepResult.results) {
|
|
261
261
|
if (result.trim().toLowerCase() === "yes") {
|
|
262
262
|
relevancyCount++;
|
|
263
263
|
} else if (result.trim().toLowerCase() === "unsure") {
|
|
@@ -455,7 +455,7 @@ function createFaithfulnessScorer({
|
|
|
455
455
|
},
|
|
456
456
|
analyze: {
|
|
457
457
|
description: "Score the relevance of the statements to the input",
|
|
458
|
-
outputSchema: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })),
|
|
458
|
+
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
459
459
|
createPrompt: ({ run }) => {
|
|
460
460
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
461
461
|
claims: run.extractStepResult || [],
|
|
@@ -465,8 +465,8 @@ function createFaithfulnessScorer({
|
|
|
465
465
|
}
|
|
466
466
|
},
|
|
467
467
|
calculateScore: ({ run }) => {
|
|
468
|
-
const totalClaims = run.analyzeStepResult.length;
|
|
469
|
-
const supportedClaims = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
468
|
+
const totalClaims = run.analyzeStepResult.verdicts.length;
|
|
469
|
+
const supportedClaims = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
470
470
|
if (totalClaims === 0) {
|
|
471
471
|
return 0;
|
|
472
472
|
}
|
|
@@ -477,12 +477,12 @@ function createFaithfulnessScorer({
|
|
|
477
477
|
description: "Reason about the results",
|
|
478
478
|
createPrompt: ({ run }) => {
|
|
479
479
|
const prompt = createFaithfulnessReasonPrompt({
|
|
480
|
-
input: run.input
|
|
480
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
481
481
|
output: run.output.text,
|
|
482
482
|
context: options?.context || [],
|
|
483
483
|
score: run.score,
|
|
484
484
|
scale: options?.scale || 1,
|
|
485
|
-
verdicts: run.analyzeStepResult || []
|
|
485
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
486
486
|
});
|
|
487
487
|
return prompt;
|
|
488
488
|
}
|
|
@@ -617,7 +617,7 @@ function createBiasScorer({ model, options }) {
|
|
|
617
617
|
},
|
|
618
618
|
analyze: {
|
|
619
619
|
description: "Score the relevance of the statements to the input",
|
|
620
|
-
outputSchema: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })),
|
|
620
|
+
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
621
621
|
createPrompt: ({ run }) => {
|
|
622
622
|
const prompt = createBiasAnalyzePrompt({
|
|
623
623
|
output: run.output.text,
|
|
@@ -627,17 +627,20 @@ function createBiasScorer({ model, options }) {
|
|
|
627
627
|
}
|
|
628
628
|
},
|
|
629
629
|
calculateScore: ({ run }) => {
|
|
630
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
630
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
631
631
|
return 0;
|
|
632
632
|
}
|
|
633
|
-
const biasedVerdicts = run.analyzeStepResult.filter((v) => v.result.toLowerCase() === "yes");
|
|
634
|
-
const score = biasedVerdicts.length / run.analyzeStepResult.length;
|
|
633
|
+
const biasedVerdicts = run.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
634
|
+
const score = biasedVerdicts.length / run.analyzeStepResult.results.length;
|
|
635
635
|
return roundToTwoDecimals2(score * (options?.scale || 1));
|
|
636
636
|
},
|
|
637
637
|
reason: {
|
|
638
638
|
description: "Reason about the results",
|
|
639
639
|
createPrompt: ({ run }) => {
|
|
640
|
-
return createBiasReasonPrompt({
|
|
640
|
+
return createBiasReasonPrompt({
|
|
641
|
+
score: run.score,
|
|
642
|
+
biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
|
|
643
|
+
});
|
|
641
644
|
}
|
|
642
645
|
}
|
|
643
646
|
});
|
|
@@ -858,7 +861,9 @@ function createHallucinationScorer({
|
|
|
858
861
|
},
|
|
859
862
|
analyze: {
|
|
860
863
|
description: "Score the relevance of the statements to the input",
|
|
861
|
-
outputSchema: zod.z.
|
|
864
|
+
outputSchema: zod.z.object({
|
|
865
|
+
verdicts: zod.z.array(zod.z.object({ statement: zod.z.string(), verdict: zod.z.string(), reason: zod.z.string() }))
|
|
866
|
+
}),
|
|
862
867
|
createPrompt: ({ run }) => {
|
|
863
868
|
const prompt = createHallucinationAnalyzePrompt({
|
|
864
869
|
claims: run.extractStepResult.claims,
|
|
@@ -868,8 +873,8 @@ function createHallucinationScorer({
|
|
|
868
873
|
}
|
|
869
874
|
},
|
|
870
875
|
calculateScore: ({ run }) => {
|
|
871
|
-
const totalStatements = run.analyzeStepResult.length;
|
|
872
|
-
const contradictedStatements = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
876
|
+
const totalStatements = run.analyzeStepResult.verdicts.length;
|
|
877
|
+
const contradictedStatements = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
873
878
|
if (totalStatements === 0) {
|
|
874
879
|
return 0;
|
|
875
880
|
}
|
|
@@ -880,12 +885,12 @@ function createHallucinationScorer({
|
|
|
880
885
|
description: "Reason about the results",
|
|
881
886
|
createPrompt: ({ run }) => {
|
|
882
887
|
const prompt = createHallucinationReasonPrompt({
|
|
883
|
-
input: run.input
|
|
888
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
884
889
|
output: run.output.text,
|
|
885
890
|
context: run?.additionalContext?.context || [],
|
|
886
891
|
score: run.score,
|
|
887
892
|
scale: options?.scale || 1,
|
|
888
|
-
verdicts: run.analyzeStepResult || []
|
|
893
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
889
894
|
});
|
|
890
895
|
return prompt;
|
|
891
896
|
}
|
|
@@ -991,22 +996,22 @@ function createToxicityScorer({ model, options }) {
|
|
|
991
996
|
},
|
|
992
997
|
analyze: {
|
|
993
998
|
description: "Score the relevance of the statements to the input",
|
|
994
|
-
outputSchema: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })),
|
|
999
|
+
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
995
1000
|
createPrompt: ({ run }) => {
|
|
996
1001
|
const prompt = createToxicityAnalyzePrompt({
|
|
997
|
-
input: run.input
|
|
1002
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
998
1003
|
output: run.output.text
|
|
999
1004
|
});
|
|
1000
1005
|
return prompt;
|
|
1001
1006
|
}
|
|
1002
1007
|
},
|
|
1003
1008
|
calculateScore: ({ run }) => {
|
|
1004
|
-
const numberOfVerdicts = run.analyzeStepResult?.length || 0;
|
|
1009
|
+
const numberOfVerdicts = run.analyzeStepResult?.verdicts.length || 0;
|
|
1005
1010
|
if (numberOfVerdicts === 0) {
|
|
1006
1011
|
return 1;
|
|
1007
1012
|
}
|
|
1008
1013
|
let toxicityCount = 0;
|
|
1009
|
-
for (const { verdict } of run.analyzeStepResult) {
|
|
1014
|
+
for (const { verdict } of run.analyzeStepResult.verdicts) {
|
|
1010
1015
|
if (verdict.trim().toLowerCase() === "yes") {
|
|
1011
1016
|
toxicityCount++;
|
|
1012
1017
|
}
|
|
@@ -1019,7 +1024,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
1019
1024
|
createPrompt: ({ run }) => {
|
|
1020
1025
|
const prompt = createToxicityReasonPrompt({
|
|
1021
1026
|
score: run.score,
|
|
1022
|
-
toxics: run.analyzeStepResult?.map((v) => v.reason) || []
|
|
1027
|
+
toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
|
|
1023
1028
|
});
|
|
1024
1029
|
return prompt;
|
|
1025
1030
|
}
|
|
@@ -234,28 +234,28 @@ function createAnswerRelevancyScorer({
|
|
|
234
234
|
},
|
|
235
235
|
analyze: {
|
|
236
236
|
description: "Score the relevance of the statements to the input",
|
|
237
|
-
outputSchema: z.array(z.object({ result: z.string(), reason: z.string() })),
|
|
237
|
+
outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
|
|
238
238
|
createPrompt: ({ run }) => createScorePrompt(JSON.stringify(run.input), run.extractStepResult?.statements || [])
|
|
239
239
|
},
|
|
240
240
|
reason: {
|
|
241
241
|
description: "Reason about the results",
|
|
242
242
|
createPrompt: ({ run }) => {
|
|
243
243
|
return createReasonPrompt({
|
|
244
|
-
input: run.input
|
|
244
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
245
245
|
output: run.output.text,
|
|
246
246
|
score: run.score,
|
|
247
|
-
results: run.analyzeStepResult,
|
|
247
|
+
results: run.analyzeStepResult.results,
|
|
248
248
|
scale: options.scale
|
|
249
249
|
});
|
|
250
250
|
}
|
|
251
251
|
},
|
|
252
252
|
calculateScore: ({ run }) => {
|
|
253
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
253
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
254
254
|
return 0;
|
|
255
255
|
}
|
|
256
|
-
const numberOfResults = run.analyzeStepResult.length;
|
|
256
|
+
const numberOfResults = run.analyzeStepResult.results.length;
|
|
257
257
|
let relevancyCount = 0;
|
|
258
|
-
for (const { result } of run.analyzeStepResult) {
|
|
258
|
+
for (const { result } of run.analyzeStepResult.results) {
|
|
259
259
|
if (result.trim().toLowerCase() === "yes") {
|
|
260
260
|
relevancyCount++;
|
|
261
261
|
} else if (result.trim().toLowerCase() === "unsure") {
|
|
@@ -453,7 +453,7 @@ function createFaithfulnessScorer({
|
|
|
453
453
|
},
|
|
454
454
|
analyze: {
|
|
455
455
|
description: "Score the relevance of the statements to the input",
|
|
456
|
-
outputSchema: z.array(z.object({ verdict: z.string(), reason: z.string() })),
|
|
456
|
+
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
457
457
|
createPrompt: ({ run }) => {
|
|
458
458
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
459
459
|
claims: run.extractStepResult || [],
|
|
@@ -463,8 +463,8 @@ function createFaithfulnessScorer({
|
|
|
463
463
|
}
|
|
464
464
|
},
|
|
465
465
|
calculateScore: ({ run }) => {
|
|
466
|
-
const totalClaims = run.analyzeStepResult.length;
|
|
467
|
-
const supportedClaims = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
466
|
+
const totalClaims = run.analyzeStepResult.verdicts.length;
|
|
467
|
+
const supportedClaims = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
468
468
|
if (totalClaims === 0) {
|
|
469
469
|
return 0;
|
|
470
470
|
}
|
|
@@ -475,12 +475,12 @@ function createFaithfulnessScorer({
|
|
|
475
475
|
description: "Reason about the results",
|
|
476
476
|
createPrompt: ({ run }) => {
|
|
477
477
|
const prompt = createFaithfulnessReasonPrompt({
|
|
478
|
-
input: run.input
|
|
478
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
479
479
|
output: run.output.text,
|
|
480
480
|
context: options?.context || [],
|
|
481
481
|
score: run.score,
|
|
482
482
|
scale: options?.scale || 1,
|
|
483
|
-
verdicts: run.analyzeStepResult || []
|
|
483
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
484
484
|
});
|
|
485
485
|
return prompt;
|
|
486
486
|
}
|
|
@@ -615,7 +615,7 @@ function createBiasScorer({ model, options }) {
|
|
|
615
615
|
},
|
|
616
616
|
analyze: {
|
|
617
617
|
description: "Score the relevance of the statements to the input",
|
|
618
|
-
outputSchema: z.array(z.object({ result: z.string(), reason: z.string() })),
|
|
618
|
+
outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
|
|
619
619
|
createPrompt: ({ run }) => {
|
|
620
620
|
const prompt = createBiasAnalyzePrompt({
|
|
621
621
|
output: run.output.text,
|
|
@@ -625,17 +625,20 @@ function createBiasScorer({ model, options }) {
|
|
|
625
625
|
}
|
|
626
626
|
},
|
|
627
627
|
calculateScore: ({ run }) => {
|
|
628
|
-
if (!run.analyzeStepResult || run.analyzeStepResult.length === 0) {
|
|
628
|
+
if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
|
|
629
629
|
return 0;
|
|
630
630
|
}
|
|
631
|
-
const biasedVerdicts = run.analyzeStepResult.filter((v) => v.result.toLowerCase() === "yes");
|
|
632
|
-
const score = biasedVerdicts.length / run.analyzeStepResult.length;
|
|
631
|
+
const biasedVerdicts = run.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
632
|
+
const score = biasedVerdicts.length / run.analyzeStepResult.results.length;
|
|
633
633
|
return roundToTwoDecimals2(score * (options?.scale || 1));
|
|
634
634
|
},
|
|
635
635
|
reason: {
|
|
636
636
|
description: "Reason about the results",
|
|
637
637
|
createPrompt: ({ run }) => {
|
|
638
|
-
return createBiasReasonPrompt({
|
|
638
|
+
return createBiasReasonPrompt({
|
|
639
|
+
score: run.score,
|
|
640
|
+
biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
|
|
641
|
+
});
|
|
639
642
|
}
|
|
640
643
|
}
|
|
641
644
|
});
|
|
@@ -856,7 +859,9 @@ function createHallucinationScorer({
|
|
|
856
859
|
},
|
|
857
860
|
analyze: {
|
|
858
861
|
description: "Score the relevance of the statements to the input",
|
|
859
|
-
outputSchema: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() })),
|
|
862
|
+
outputSchema: z.object({
|
|
863
|
+
verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
|
|
864
|
+
}),
|
|
860
865
|
createPrompt: ({ run }) => {
|
|
861
866
|
const prompt = createHallucinationAnalyzePrompt({
|
|
862
867
|
claims: run.extractStepResult.claims,
|
|
@@ -866,8 +871,8 @@ function createHallucinationScorer({
|
|
|
866
871
|
}
|
|
867
872
|
},
|
|
868
873
|
calculateScore: ({ run }) => {
|
|
869
|
-
const totalStatements = run.analyzeStepResult.length;
|
|
870
|
-
const contradictedStatements = run.analyzeStepResult.filter((v) => v.verdict === "yes").length;
|
|
874
|
+
const totalStatements = run.analyzeStepResult.verdicts.length;
|
|
875
|
+
const contradictedStatements = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
|
|
871
876
|
if (totalStatements === 0) {
|
|
872
877
|
return 0;
|
|
873
878
|
}
|
|
@@ -878,12 +883,12 @@ function createHallucinationScorer({
|
|
|
878
883
|
description: "Reason about the results",
|
|
879
884
|
createPrompt: ({ run }) => {
|
|
880
885
|
const prompt = createHallucinationReasonPrompt({
|
|
881
|
-
input: run.input
|
|
886
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
882
887
|
output: run.output.text,
|
|
883
888
|
context: run?.additionalContext?.context || [],
|
|
884
889
|
score: run.score,
|
|
885
890
|
scale: options?.scale || 1,
|
|
886
|
-
verdicts: run.analyzeStepResult || []
|
|
891
|
+
verdicts: run.analyzeStepResult?.verdicts || []
|
|
887
892
|
});
|
|
888
893
|
return prompt;
|
|
889
894
|
}
|
|
@@ -989,22 +994,22 @@ function createToxicityScorer({ model, options }) {
|
|
|
989
994
|
},
|
|
990
995
|
analyze: {
|
|
991
996
|
description: "Score the relevance of the statements to the input",
|
|
992
|
-
outputSchema: z.array(z.object({ verdict: z.string(), reason: z.string() })),
|
|
997
|
+
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
993
998
|
createPrompt: ({ run }) => {
|
|
994
999
|
const prompt = createToxicityAnalyzePrompt({
|
|
995
|
-
input: run.input
|
|
1000
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
996
1001
|
output: run.output.text
|
|
997
1002
|
});
|
|
998
1003
|
return prompt;
|
|
999
1004
|
}
|
|
1000
1005
|
},
|
|
1001
1006
|
calculateScore: ({ run }) => {
|
|
1002
|
-
const numberOfVerdicts = run.analyzeStepResult?.length || 0;
|
|
1007
|
+
const numberOfVerdicts = run.analyzeStepResult?.verdicts.length || 0;
|
|
1003
1008
|
if (numberOfVerdicts === 0) {
|
|
1004
1009
|
return 1;
|
|
1005
1010
|
}
|
|
1006
1011
|
let toxicityCount = 0;
|
|
1007
|
-
for (const { verdict } of run.analyzeStepResult) {
|
|
1012
|
+
for (const { verdict } of run.analyzeStepResult.verdicts) {
|
|
1008
1013
|
if (verdict.trim().toLowerCase() === "yes") {
|
|
1009
1014
|
toxicityCount++;
|
|
1010
1015
|
}
|
|
@@ -1017,7 +1022,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
1017
1022
|
createPrompt: ({ run }) => {
|
|
1018
1023
|
const prompt = createToxicityReasonPrompt({
|
|
1019
1024
|
score: run.score,
|
|
1020
|
-
toxics: run.analyzeStepResult?.map((v) => v.reason) || []
|
|
1025
|
+
toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
|
|
1021
1026
|
});
|
|
1022
1027
|
return prompt;
|
|
1023
1028
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.10.7",
|
|
3
|
+
"version": "0.10.8-alpha.1",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -100,8 +100,8 @@
|
|
|
100
100
|
"tsup": "^8.5.0",
|
|
101
101
|
"typescript": "^5.8.3",
|
|
102
102
|
"vitest": "^3.2.4",
|
|
103
|
-
"@internal/lint": "0.0.
|
|
104
|
-
"@mastra/core": "0.
|
|
103
|
+
"@internal/lint": "0.0.23",
|
|
104
|
+
"@mastra/core": "0.12.0-alpha.1"
|
|
105
105
|
},
|
|
106
106
|
"scripts": {
|
|
107
107
|
"check": "tsc --noEmit",
|