@mastra/evals 0.10.8-alpha.0 → 0.11.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_tsup-dts-rollup.d.cts +4 -1
- package/dist/_tsup-dts-rollup.d.ts +4 -1
- package/dist/scorers/code/index.cjs +44 -4
- package/dist/scorers/code/index.d.cts +1 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.js +43 -5
- package/dist/scorers/llm/index.cjs +4 -4
- package/dist/scorers/llm/index.js +4 -4
- package/package.json +3 -3
|
@@ -405,7 +405,10 @@ export { createTextualDifferenceScorer }
|
|
|
405
405
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_1 }
|
|
406
406
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_2 }
|
|
407
407
|
|
|
408
|
-
|
|
408
|
+
declare function createToneScorer(): MastraScorer;
|
|
409
|
+
export { createToneScorer }
|
|
410
|
+
export { createToneScorer as createToneScorer_alias_1 }
|
|
411
|
+
export { createToneScorer as createToneScorer_alias_2 }
|
|
409
412
|
|
|
410
413
|
export declare function createToxicityAnalyzePrompt({ input, output }: {
|
|
411
414
|
input: string;
|
|
@@ -405,7 +405,10 @@ export { createTextualDifferenceScorer }
|
|
|
405
405
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_1 }
|
|
406
406
|
export { createTextualDifferenceScorer as createTextualDifferenceScorer_alias_2 }
|
|
407
407
|
|
|
408
|
-
|
|
408
|
+
declare function createToneScorer(): MastraScorer;
|
|
409
|
+
export { createToneScorer }
|
|
410
|
+
export { createToneScorer as createToneScorer_alias_1 }
|
|
411
|
+
export { createToneScorer as createToneScorer_alias_2 }
|
|
409
412
|
|
|
410
413
|
export declare function createToxicityAnalyzePrompt({ input, output }: {
|
|
411
414
|
input: string;
|
|
@@ -5,12 +5,14 @@ var nlp = require('compromise');
|
|
|
5
5
|
var difflib = require('difflib');
|
|
6
6
|
var keyword_extractor = require('keyword-extractor');
|
|
7
7
|
var stringSimilarity = require('string-similarity');
|
|
8
|
+
var Sentiment = require('sentiment');
|
|
8
9
|
|
|
9
10
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
10
11
|
|
|
11
12
|
var nlp__default = /*#__PURE__*/_interopDefault(nlp);
|
|
12
13
|
var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
|
|
13
14
|
var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
|
|
15
|
+
var Sentiment__default = /*#__PURE__*/_interopDefault(Sentiment);
|
|
14
16
|
|
|
15
17
|
function normalizeString(str) {
|
|
16
18
|
return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
|
|
@@ -63,7 +65,7 @@ function createCompletenessScorer() {
|
|
|
63
65
|
if (isInputInvalid || isOutputInvalid) {
|
|
64
66
|
throw new Error("Inputs cannot be null or undefined");
|
|
65
67
|
}
|
|
66
|
-
const input = run.input
|
|
68
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
67
69
|
const output = run.output.text;
|
|
68
70
|
const inputToProcess = input;
|
|
69
71
|
const outputToProcess = output;
|
|
@@ -100,7 +102,7 @@ function createTextualDifferenceScorer() {
|
|
|
100
102
|
name: "Completeness",
|
|
101
103
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
102
104
|
analyze: async (run) => {
|
|
103
|
-
const input = run.input
|
|
105
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
104
106
|
const output = run.output.text;
|
|
105
107
|
const matcher = new difflib.SequenceMatcher(null, input, output);
|
|
106
108
|
const ratio = matcher.ratio();
|
|
@@ -125,7 +127,7 @@ function createKeywordCoverageScorer() {
|
|
|
125
127
|
name: "Completeness",
|
|
126
128
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
127
129
|
extract: async (run) => {
|
|
128
|
-
const input = run.input
|
|
130
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
129
131
|
const output = run.output.text;
|
|
130
132
|
if (!input && !output) {
|
|
131
133
|
return {
|
|
@@ -182,7 +184,7 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
182
184
|
name: "Completeness",
|
|
183
185
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
184
186
|
extract: async (run) => {
|
|
185
|
-
let processedInput = run.input
|
|
187
|
+
let processedInput = run.input?.map((i) => i.content).join(", ") || "";
|
|
186
188
|
let processedOutput = run.output.text;
|
|
187
189
|
if (ignoreCase) {
|
|
188
190
|
processedInput = processedInput.toLowerCase();
|
|
@@ -213,8 +215,46 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
213
215
|
}
|
|
214
216
|
});
|
|
215
217
|
}
|
|
218
|
+
function createToneScorer() {
|
|
219
|
+
return scores.createScorer({
|
|
220
|
+
name: "Completeness",
|
|
221
|
+
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
222
|
+
analyze: async (run) => {
|
|
223
|
+
const sentiment = new Sentiment__default.default();
|
|
224
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
225
|
+
const output = run.output.text;
|
|
226
|
+
const responseSentiment = sentiment.analyze(input);
|
|
227
|
+
if (output) {
|
|
228
|
+
const referenceSentiment = sentiment.analyze(output);
|
|
229
|
+
const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
|
|
230
|
+
const normalizedScore = Math.max(0, 1 - sentimentDiff);
|
|
231
|
+
return {
|
|
232
|
+
score: normalizedScore,
|
|
233
|
+
result: {
|
|
234
|
+
responseSentiment: responseSentiment.comparative,
|
|
235
|
+
referenceSentiment: referenceSentiment.comparative,
|
|
236
|
+
difference: sentimentDiff
|
|
237
|
+
}
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
const sentences = input.match(/[^.!?]+[.!?]+/g) || [input];
|
|
241
|
+
const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
|
|
242
|
+
const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
|
|
243
|
+
const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
|
|
244
|
+
const stability = Math.max(0, 1 - variance);
|
|
245
|
+
return {
|
|
246
|
+
score: stability,
|
|
247
|
+
result: {
|
|
248
|
+
avgSentiment,
|
|
249
|
+
sentimentVariance: variance
|
|
250
|
+
}
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
});
|
|
254
|
+
}
|
|
216
255
|
|
|
217
256
|
exports.createCompletenessScorer = createCompletenessScorer;
|
|
218
257
|
exports.createContentSimilarityScorer = createContentSimilarityScorer;
|
|
219
258
|
exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
|
|
220
259
|
exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
|
|
260
|
+
exports.createToneScorer = createToneScorer;
|
|
@@ -2,3 +2,4 @@ export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '..
|
|
|
2
2
|
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.cjs';
|
|
3
3
|
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.cjs';
|
|
4
4
|
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.cjs';
|
|
5
|
+
export { createToneScorer_alias_1 as createToneScorer } from '../../_tsup-dts-rollup.cjs';
|
|
@@ -2,3 +2,4 @@ export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '..
|
|
|
2
2
|
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.js';
|
|
3
3
|
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.js';
|
|
4
4
|
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.js';
|
|
5
|
+
export { createToneScorer_alias_1 as createToneScorer } from '../../_tsup-dts-rollup.js';
|
|
@@ -3,6 +3,7 @@ import nlp from 'compromise';
|
|
|
3
3
|
import { SequenceMatcher } from 'difflib';
|
|
4
4
|
import keyword_extractor from 'keyword-extractor';
|
|
5
5
|
import stringSimilarity from 'string-similarity';
|
|
6
|
+
import Sentiment from 'sentiment';
|
|
6
7
|
|
|
7
8
|
function normalizeString(str) {
|
|
8
9
|
return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
|
|
@@ -55,7 +56,7 @@ function createCompletenessScorer() {
|
|
|
55
56
|
if (isInputInvalid || isOutputInvalid) {
|
|
56
57
|
throw new Error("Inputs cannot be null or undefined");
|
|
57
58
|
}
|
|
58
|
-
const input = run.input
|
|
59
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
59
60
|
const output = run.output.text;
|
|
60
61
|
const inputToProcess = input;
|
|
61
62
|
const outputToProcess = output;
|
|
@@ -92,7 +93,7 @@ function createTextualDifferenceScorer() {
|
|
|
92
93
|
name: "Completeness",
|
|
93
94
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
94
95
|
analyze: async (run) => {
|
|
95
|
-
const input = run.input
|
|
96
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
96
97
|
const output = run.output.text;
|
|
97
98
|
const matcher = new SequenceMatcher(null, input, output);
|
|
98
99
|
const ratio = matcher.ratio();
|
|
@@ -117,7 +118,7 @@ function createKeywordCoverageScorer() {
|
|
|
117
118
|
name: "Completeness",
|
|
118
119
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
119
120
|
extract: async (run) => {
|
|
120
|
-
const input = run.input
|
|
121
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
121
122
|
const output = run.output.text;
|
|
122
123
|
if (!input && !output) {
|
|
123
124
|
return {
|
|
@@ -174,7 +175,7 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
174
175
|
name: "Completeness",
|
|
175
176
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
176
177
|
extract: async (run) => {
|
|
177
|
-
let processedInput = run.input
|
|
178
|
+
let processedInput = run.input?.map((i) => i.content).join(", ") || "";
|
|
178
179
|
let processedOutput = run.output.text;
|
|
179
180
|
if (ignoreCase) {
|
|
180
181
|
processedInput = processedInput.toLowerCase();
|
|
@@ -205,5 +206,42 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
205
206
|
}
|
|
206
207
|
});
|
|
207
208
|
}
|
|
209
|
+
function createToneScorer() {
|
|
210
|
+
return createScorer({
|
|
211
|
+
name: "Completeness",
|
|
212
|
+
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
213
|
+
analyze: async (run) => {
|
|
214
|
+
const sentiment = new Sentiment();
|
|
215
|
+
const input = run.input?.map((i) => i.content).join(", ") || "";
|
|
216
|
+
const output = run.output.text;
|
|
217
|
+
const responseSentiment = sentiment.analyze(input);
|
|
218
|
+
if (output) {
|
|
219
|
+
const referenceSentiment = sentiment.analyze(output);
|
|
220
|
+
const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
|
|
221
|
+
const normalizedScore = Math.max(0, 1 - sentimentDiff);
|
|
222
|
+
return {
|
|
223
|
+
score: normalizedScore,
|
|
224
|
+
result: {
|
|
225
|
+
responseSentiment: responseSentiment.comparative,
|
|
226
|
+
referenceSentiment: referenceSentiment.comparative,
|
|
227
|
+
difference: sentimentDiff
|
|
228
|
+
}
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
const sentences = input.match(/[^.!?]+[.!?]+/g) || [input];
|
|
232
|
+
const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
|
|
233
|
+
const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
|
|
234
|
+
const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
|
|
235
|
+
const stability = Math.max(0, 1 - variance);
|
|
236
|
+
return {
|
|
237
|
+
score: stability,
|
|
238
|
+
result: {
|
|
239
|
+
avgSentiment,
|
|
240
|
+
sentimentVariance: variance
|
|
241
|
+
}
|
|
242
|
+
};
|
|
243
|
+
}
|
|
244
|
+
});
|
|
245
|
+
}
|
|
208
246
|
|
|
209
|
-
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer };
|
|
247
|
+
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer, createToneScorer };
|
|
@@ -243,7 +243,7 @@ function createAnswerRelevancyScorer({
|
|
|
243
243
|
description: "Reason about the results",
|
|
244
244
|
createPrompt: ({ run }) => {
|
|
245
245
|
return createReasonPrompt({
|
|
246
|
-
input: run.input
|
|
246
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
247
247
|
output: run.output.text,
|
|
248
248
|
score: run.score,
|
|
249
249
|
results: run.analyzeStepResult.results,
|
|
@@ -477,7 +477,7 @@ function createFaithfulnessScorer({
|
|
|
477
477
|
description: "Reason about the results",
|
|
478
478
|
createPrompt: ({ run }) => {
|
|
479
479
|
const prompt = createFaithfulnessReasonPrompt({
|
|
480
|
-
input: run.input
|
|
480
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
481
481
|
output: run.output.text,
|
|
482
482
|
context: options?.context || [],
|
|
483
483
|
score: run.score,
|
|
@@ -885,7 +885,7 @@ function createHallucinationScorer({
|
|
|
885
885
|
description: "Reason about the results",
|
|
886
886
|
createPrompt: ({ run }) => {
|
|
887
887
|
const prompt = createHallucinationReasonPrompt({
|
|
888
|
-
input: run.input
|
|
888
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
889
889
|
output: run.output.text,
|
|
890
890
|
context: run?.additionalContext?.context || [],
|
|
891
891
|
score: run.score,
|
|
@@ -999,7 +999,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
999
999
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
1000
1000
|
createPrompt: ({ run }) => {
|
|
1001
1001
|
const prompt = createToxicityAnalyzePrompt({
|
|
1002
|
-
input: run.input
|
|
1002
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
1003
1003
|
output: run.output.text
|
|
1004
1004
|
});
|
|
1005
1005
|
return prompt;
|
|
@@ -241,7 +241,7 @@ function createAnswerRelevancyScorer({
|
|
|
241
241
|
description: "Reason about the results",
|
|
242
242
|
createPrompt: ({ run }) => {
|
|
243
243
|
return createReasonPrompt({
|
|
244
|
-
input: run.input
|
|
244
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
245
245
|
output: run.output.text,
|
|
246
246
|
score: run.score,
|
|
247
247
|
results: run.analyzeStepResult.results,
|
|
@@ -475,7 +475,7 @@ function createFaithfulnessScorer({
|
|
|
475
475
|
description: "Reason about the results",
|
|
476
476
|
createPrompt: ({ run }) => {
|
|
477
477
|
const prompt = createFaithfulnessReasonPrompt({
|
|
478
|
-
input: run.input
|
|
478
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
479
479
|
output: run.output.text,
|
|
480
480
|
context: options?.context || [],
|
|
481
481
|
score: run.score,
|
|
@@ -883,7 +883,7 @@ function createHallucinationScorer({
|
|
|
883
883
|
description: "Reason about the results",
|
|
884
884
|
createPrompt: ({ run }) => {
|
|
885
885
|
const prompt = createHallucinationReasonPrompt({
|
|
886
|
-
input: run.input
|
|
886
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
887
887
|
output: run.output.text,
|
|
888
888
|
context: run?.additionalContext?.context || [],
|
|
889
889
|
score: run.score,
|
|
@@ -997,7 +997,7 @@ function createToxicityScorer({ model, options }) {
|
|
|
997
997
|
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
998
998
|
createPrompt: ({ run }) => {
|
|
999
999
|
const prompt = createToxicityAnalyzePrompt({
|
|
1000
|
-
input: run.input
|
|
1000
|
+
input: run.input?.map((input) => input.content).join(", ") || "",
|
|
1001
1001
|
output: run.output.text
|
|
1002
1002
|
});
|
|
1003
1003
|
return prompt;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.10.8-alpha.0",
|
|
3
|
+
"version": "0.11.0-alpha.2",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -84,7 +84,7 @@
|
|
|
84
84
|
"zod": "^3.25.67"
|
|
85
85
|
},
|
|
86
86
|
"peerDependencies": {
|
|
87
|
-
"@mastra/core": ">=0.
|
|
87
|
+
"@mastra/core": ">=0.12.0-0 <0.13.0-0",
|
|
88
88
|
"ai": "^4.0.0"
|
|
89
89
|
},
|
|
90
90
|
"devDependencies": {
|
|
@@ -101,7 +101,7 @@
|
|
|
101
101
|
"typescript": "^5.8.3",
|
|
102
102
|
"vitest": "^3.2.4",
|
|
103
103
|
"@internal/lint": "0.0.23",
|
|
104
|
-
"@mastra/core": "0.12.0-alpha.
|
|
104
|
+
"@mastra/core": "0.12.0-alpha.5"
|
|
105
105
|
},
|
|
106
106
|
"scripts": {
|
|
107
107
|
"check": "tsc --noEmit",
|