@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/README.md +186 -0
- package/dist/evals.cjs.development.js +1 -0
- package/dist/evals.cjs.development.js.map +1 -1
- package/dist/evals.cjs.production.min.js.map +1 -1
- package/dist/evals.esm.js +1 -0
- package/dist/evals.esm.js.map +1 -1
- package/dist/evaluation.d.ts +2 -2
- package/dist/evaluation.d.ts.map +1 -1
- package/package.json +4 -7
- package/src/evaluation.test.ts +1 -1
- package/src/evaluation.ts +2 -0
- package/src/metrics/llm/answer-relevancy/index.test.ts +49 -44
- package/src/metrics/llm/bias/index.test.ts +13 -12
- package/src/metrics/llm/context-position/index.test.ts +92 -87
- package/src/metrics/llm/context-precision/index.test.ts +69 -64
- package/src/metrics/llm/context-relevancy/index.test.ts +27 -22
- package/src/metrics/llm/contextual-recall/index.test.ts +28 -23
- package/src/metrics/llm/faithfulness/index.test.ts +81 -76
- package/src/metrics/llm/hallucination/index.test.ts +85 -80
- package/src/metrics/llm/prompt-alignment/index.test.ts +53 -48
- package/src/metrics/llm/summarization/index.test.ts +85 -80
- package/src/metrics/llm/toxicity/index.test.ts +22 -17
- package/src/metrics/nlp/completeness/index.test.ts +1 -1
- package/src/metrics/nlp/content-similarity/index.test.ts +1 -1
- package/src/metrics/nlp/keyword-coverage/index.test.ts +1 -1
- package/src/metrics/nlp/textual-difference/index.test.ts +1 -1
- package/src/metrics/nlp/tone/index.test.ts +1 -1
- package/vitest.config.ts +9 -0
- package/jest.config.ts +0 -21
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
5
5
|
|
|
@@ -166,7 +166,6 @@ const testCases: TestCase[] = [
|
|
|
166
166
|
];
|
|
167
167
|
|
|
168
168
|
const SECONDS = 10000;
|
|
169
|
-
jest.setTimeout(15 * SECONDS);
|
|
170
169
|
|
|
171
170
|
const modelConfig: ModelConfig = {
|
|
172
171
|
provider: 'OPEN_AI',
|
|
@@ -175,96 +174,102 @@ const modelConfig: ModelConfig = {
|
|
|
175
174
|
apiKey: process.env.OPENAI_API_KEY,
|
|
176
175
|
};
|
|
177
176
|
|
|
178
|
-
describe(
|
|
179
|
-
|
|
177
|
+
describe(
|
|
178
|
+
'SummarizationMetric',
|
|
179
|
+
() => {
|
|
180
|
+
const metric = new SummarizationMetric(modelConfig);
|
|
180
181
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
182
|
+
it('should handle perfect summarization', async () => {
|
|
183
|
+
const testCase = testCases[0]!;
|
|
184
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
185
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
186
|
+
});
|
|
186
187
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
188
|
+
it('should handle mixed accuracy with contradictions', async () => {
|
|
189
|
+
const testCase = testCases[1]!;
|
|
190
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
191
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
192
|
+
});
|
|
192
193
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
194
|
+
it('should handle missing key information', async () => {
|
|
195
|
+
const testCase = testCases[2]!;
|
|
196
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
197
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
198
|
+
});
|
|
198
199
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
200
|
+
it('should handle empty output', async () => {
|
|
201
|
+
const testCase = testCases[3]!;
|
|
202
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
203
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
204
|
+
});
|
|
204
205
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
206
|
+
it('should handle speculative additions', async () => {
|
|
207
|
+
const testCase = testCases[4]!;
|
|
208
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
209
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
210
|
+
});
|
|
210
211
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
212
|
+
it('should handle incorrect emphasis', async () => {
|
|
213
|
+
const testCase = testCases[5]!;
|
|
214
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
215
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
216
|
+
});
|
|
216
217
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
218
|
+
it('should handle technical accuracy with missing context', async () => {
|
|
219
|
+
const testCase = testCases[6]!;
|
|
220
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
221
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
222
|
+
});
|
|
222
223
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
224
|
+
it('should handle numerical approximation', async () => {
|
|
225
|
+
const testCase = testCases[7]!;
|
|
226
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
227
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
228
|
+
});
|
|
228
229
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
230
|
+
it('should handle mixed tenses', async () => {
|
|
231
|
+
const testCase = testCases[8]!;
|
|
232
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
233
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
234
|
+
});
|
|
234
235
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
236
|
+
it('should handle subjective interpretation', async () => {
|
|
237
|
+
const testCase = testCases[9]!;
|
|
238
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
239
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
240
|
+
});
|
|
240
241
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
242
|
+
it('should handle high alignment with low coverage', async () => {
|
|
243
|
+
const testCase = testCases[10]!;
|
|
244
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
245
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
246
|
+
});
|
|
246
247
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
248
|
+
it('should handle low alignment with high coverage', async () => {
|
|
249
|
+
const testCase = testCases[11]!;
|
|
250
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
251
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
252
|
+
});
|
|
252
253
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
254
|
+
it('should handle single word summary', async () => {
|
|
255
|
+
const testCase = testCases[12]!;
|
|
256
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
257
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
258
|
+
});
|
|
258
259
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
260
|
+
it('should handle repetitive summary', async () => {
|
|
261
|
+
const testCase = testCases[13]!;
|
|
262
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
263
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
264
|
+
});
|
|
264
265
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
}
|
|
266
|
+
it('should handle overly verbose summary', async () => {
|
|
267
|
+
const testCase = testCases[14]!;
|
|
268
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
269
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
270
|
+
});
|
|
271
|
+
},
|
|
272
|
+
{
|
|
273
|
+
timeout: 15 * SECONDS,
|
|
274
|
+
},
|
|
275
|
+
);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
5
5
|
|
|
@@ -35,7 +35,6 @@ const testCases: TestCase[] = [
|
|
|
35
35
|
];
|
|
36
36
|
|
|
37
37
|
const SECONDS = 10000;
|
|
38
|
-
jest.setTimeout(15 * SECONDS);
|
|
39
38
|
|
|
40
39
|
const modelConfig: ModelConfig = {
|
|
41
40
|
provider: 'OPEN_AI',
|
|
@@ -44,24 +43,30 @@ const modelConfig: ModelConfig = {
|
|
|
44
43
|
apiKey: process.env.OPENAI_API_KEY,
|
|
45
44
|
};
|
|
46
45
|
|
|
47
|
-
describe(
|
|
48
|
-
|
|
46
|
+
describe(
|
|
47
|
+
'ToxicityMetric',
|
|
48
|
+
() => {
|
|
49
|
+
const metric = new ToxicityMetric(modelConfig);
|
|
49
50
|
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
it('should be able to measure a prompt that is toxic', async () => {
|
|
52
|
+
const result = await metric.measure(testCases[0].input, testCases[0].output);
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
|
|
54
|
+
expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
|
|
55
|
+
});
|
|
55
56
|
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
it('should be able to measure a prompt that is not toxic', async () => {
|
|
58
|
+
const result = await metric.measure(testCases[1].input, testCases[1].output);
|
|
58
59
|
|
|
59
|
-
|
|
60
|
-
|
|
60
|
+
expect(result.score).toBeCloseTo(testCases[1].expectedResult.score, 1);
|
|
61
|
+
});
|
|
61
62
|
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
it('should be able to measure a prompt that is midly toxic', async () => {
|
|
64
|
+
const result = await metric.measure(testCases[2].input, testCases[2].output);
|
|
64
65
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
}
|
|
66
|
+
expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
|
|
67
|
+
});
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
timeout: 15 * SECONDS,
|
|
71
|
+
},
|
|
72
|
+
);
|
package/vitest.config.ts
ADDED
package/jest.config.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import { config } from 'dotenv';
|
|
2
|
-
|
|
3
|
-
config();
|
|
4
|
-
|
|
5
|
-
export default {
|
|
6
|
-
maxWorkers: 1,
|
|
7
|
-
preset: 'ts-jest',
|
|
8
|
-
extensionsToTreatAsEsm: ['.ts'],
|
|
9
|
-
moduleNameMapper: {
|
|
10
|
-
'^(\\.{1,2}/.*)\\.js$': '$1',
|
|
11
|
-
},
|
|
12
|
-
transform: {
|
|
13
|
-
'^.+\\.tsx?$': [
|
|
14
|
-
'ts-jest',
|
|
15
|
-
{
|
|
16
|
-
useESM: true,
|
|
17
|
-
isolatedModules: true,
|
|
18
|
-
},
|
|
19
|
-
],
|
|
20
|
-
},
|
|
21
|
-
};
|