@mastra/evals 0.1.0-alpha.23 → 0.1.0-alpha.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/dist/index.js +2 -1
- package/dist/metrics/nlp/index.d.ts +1 -1
- package/dist/metrics/nlp/index.js +1 -1
- package/package.json +2 -4
- package/src/attachListeners.ts +1 -1
- package/src/evaluation.test.ts +3 -3
- package/src/metrics/judge/index.ts +1 -1
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +1 -1
- package/src/metrics/llm/bias/index.test.ts +10 -7
- package/src/metrics/llm/bias/metricJudge.ts +1 -1
- package/src/metrics/llm/context-position/metricJudge.ts +1 -1
- package/src/metrics/llm/context-precision/metricJudge.ts +1 -1
- package/src/metrics/llm/context-relevancy/metricJudge.ts +1 -1
- package/src/metrics/llm/contextual-recall/metricJudge.ts +1 -1
- package/src/metrics/llm/faithfulness/metricJudge.ts +1 -1
- package/src/metrics/llm/hallucination/metricJudge.ts +1 -1
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +1 -1
- package/src/metrics/llm/summarization/index.test.ts +84 -86
- package/src/metrics/llm/summarization/metricJudge.ts +1 -1
- package/src/metrics/llm/toxicity/metricJudge.ts +1 -1
- package/src/metrics/nlp/completeness/index.ts +1 -1
- package/src/metrics/nlp/content-similarity/index.ts +1 -1
- package/src/metrics/nlp/keyword-coverage/index.ts +1 -1
- package/src/metrics/nlp/textual-difference/index.ts +1 -1
- package/src/metrics/nlp/tone/index.ts +1 -1
- package/vitest.config.ts +2 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 0.1.0-alpha.25
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 9625602: Use mastra core split bundles in other packages
|
|
8
|
+
- 8769a62: Split core into separate entry files
|
|
9
|
+
- Updated dependencies [30322ce]
|
|
10
|
+
- Updated dependencies [78eec7c]
|
|
11
|
+
- Updated dependencies [9625602]
|
|
12
|
+
- Updated dependencies [8769a62]
|
|
13
|
+
- @mastra/core@0.2.0-alpha.83
|
|
14
|
+
|
|
15
|
+
## 0.1.0-alpha.24
|
|
16
|
+
|
|
17
|
+
### Patch Changes
|
|
18
|
+
|
|
19
|
+
- Updated dependencies [73d112c]
|
|
20
|
+
- @mastra/core@0.1.27-alpha.82
|
|
21
|
+
|
|
3
22
|
## 0.1.0-alpha.23
|
|
4
23
|
|
|
5
24
|
### Patch Changes
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import './chunk-4VNS5WPM.js';
|
|
2
|
-
import { evaluate as evaluate$1
|
|
2
|
+
import { evaluate as evaluate$1 } from '@mastra/core';
|
|
3
|
+
import { registerHook, AvailableHooks } from '@mastra/core/hooks';
|
|
3
4
|
import { mkdirSync, appendFile } from 'fs';
|
|
4
5
|
import { join } from 'path';
|
|
5
6
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.1.0-alpha.23",
|
|
3
|
+
"version": "0.1.0-alpha.25",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -38,11 +38,9 @@
|
|
|
38
38
|
"sentiment": "^5.0.2",
|
|
39
39
|
"string-similarity": "^4.0.4",
|
|
40
40
|
"zod": "^3.24.1",
|
|
41
|
-
"@mastra/core": "0.
|
|
41
|
+
"@mastra/core": "0.2.0-alpha.83"
|
|
42
42
|
},
|
|
43
43
|
"devDependencies": {
|
|
44
|
-
"@babel/preset-env": "^7.26.0",
|
|
45
|
-
"@babel/preset-typescript": "^7.26.0",
|
|
46
44
|
"@tsconfig/recommended": "^1.0.7",
|
|
47
45
|
"@types/difflib": "^0.2.7",
|
|
48
46
|
"@types/fs-extra": "^11.0.4",
|
package/src/attachListeners.ts
CHANGED
package/src/evaluation.test.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import { Metric } from '@mastra/core';
|
|
1
|
+
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { Agent } from '@mastra/core/agent';
|
|
3
|
+
import { Metric } from '@mastra/core/eval';
|
|
4
4
|
import { describe, expect, it } from 'vitest';
|
|
5
5
|
|
|
6
6
|
import { evaluate } from './evaluation';
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
3
3
|
|
|
4
|
-
import { isCloserTo } from '../utils';
|
|
5
4
|
import { TestCase } from '../utils';
|
|
6
5
|
|
|
7
6
|
import { BiasMetric } from './index';
|
|
@@ -41,7 +40,11 @@ const testCases: TestCase[] = [
|
|
|
41
40
|
},
|
|
42
41
|
];
|
|
43
42
|
|
|
44
|
-
const SECONDS =
|
|
43
|
+
const SECONDS = 1000;
|
|
44
|
+
|
|
45
|
+
vi.setConfig({
|
|
46
|
+
testTimeout: 20 * SECONDS,
|
|
47
|
+
});
|
|
45
48
|
|
|
46
49
|
const modelConfig: ModelConfig = {
|
|
47
50
|
provider: 'OPEN_AI',
|
|
@@ -56,20 +59,20 @@ describe('BiasMetric', () => {
|
|
|
56
59
|
it('should be able to measure a prompt that is biased', async () => {
|
|
57
60
|
const result = await metric.measure(testCases[0].input, testCases[0].output);
|
|
58
61
|
expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
|
|
59
|
-
}
|
|
62
|
+
});
|
|
60
63
|
|
|
61
64
|
it('should be able to measure a prompt that is almost not biased', async () => {
|
|
62
65
|
const result = await metric.measure(testCases[1].input, testCases[1].output);
|
|
63
66
|
expect(result.score).toBeLessThan(0.5);
|
|
64
|
-
}
|
|
67
|
+
});
|
|
65
68
|
|
|
66
69
|
it('should be able to measure a prompt that is mildly biased but actually not', async () => {
|
|
67
70
|
const result = await metric.measure(testCases[2].input, testCases[2].output);
|
|
68
71
|
expect(result.score).toBe(0);
|
|
69
|
-
}
|
|
72
|
+
});
|
|
70
73
|
|
|
71
74
|
it('should be able to measure a prompt that is mildly biased', async () => {
|
|
72
75
|
const result = await metric.measure(testCases[3].input, testCases[3].output);
|
|
73
76
|
expect(result.score).toBeLessThan(0.8);
|
|
74
|
-
}
|
|
77
|
+
});
|
|
75
78
|
});
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
5
5
|
|
|
@@ -165,7 +165,7 @@ const testCases: TestCase[] = [
|
|
|
165
165
|
},
|
|
166
166
|
];
|
|
167
167
|
|
|
168
|
-
const SECONDS =
|
|
168
|
+
const SECONDS = 1000;
|
|
169
169
|
|
|
170
170
|
const modelConfig: ModelConfig = {
|
|
171
171
|
provider: 'OPEN_AI',
|
|
@@ -174,102 +174,100 @@ const modelConfig: ModelConfig = {
|
|
|
174
174
|
apiKey: process.env.OPENAI_API_KEY,
|
|
175
175
|
};
|
|
176
176
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
const metric = new SummarizationMetric(modelConfig);
|
|
177
|
+
vi.setConfig({
|
|
178
|
+
testTimeout: 20 * SECONDS,
|
|
179
|
+
});
|
|
181
180
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
const result = await metric.measure(testCase.input, testCase.output);
|
|
185
|
-
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
186
|
-
});
|
|
181
|
+
describe('SummarizationMetric', () => {
|
|
182
|
+
const metric = new SummarizationMetric(modelConfig);
|
|
187
183
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
184
|
+
it('should handle perfect summarization', async () => {
|
|
185
|
+
const testCase = testCases[0]!;
|
|
186
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
187
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
188
|
+
});
|
|
193
189
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
190
|
+
it('should handle mixed accuracy with contradictions', async () => {
|
|
191
|
+
const testCase = testCases[1]!;
|
|
192
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
193
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
194
|
+
});
|
|
199
195
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
196
|
+
it('should handle missing key information', async () => {
|
|
197
|
+
const testCase = testCases[2]!;
|
|
198
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
199
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
200
|
+
});
|
|
205
201
|
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
202
|
+
it('should handle empty output', async () => {
|
|
203
|
+
const testCase = testCases[3]!;
|
|
204
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
205
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
206
|
+
});
|
|
211
207
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
208
|
+
it('should handle speculative additions', async () => {
|
|
209
|
+
const testCase = testCases[4]!;
|
|
210
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
211
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
212
|
+
});
|
|
217
213
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
214
|
+
it('should handle incorrect emphasis', async () => {
|
|
215
|
+
const testCase = testCases[5]!;
|
|
216
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
217
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
218
|
+
});
|
|
223
219
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
220
|
+
it('should handle technical accuracy with missing context', async () => {
|
|
221
|
+
const testCase = testCases[6]!;
|
|
222
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
223
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
224
|
+
});
|
|
229
225
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
226
|
+
it('should handle numerical approximation', async () => {
|
|
227
|
+
const testCase = testCases[7]!;
|
|
228
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
229
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
230
|
+
});
|
|
235
231
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
232
|
+
it('should handle mixed tenses', async () => {
|
|
233
|
+
const testCase = testCases[8]!;
|
|
234
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
235
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
236
|
+
});
|
|
241
237
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
238
|
+
it('should handle subjective interpretation', async () => {
|
|
239
|
+
const testCase = testCases[9]!;
|
|
240
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
241
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
242
|
+
});
|
|
247
243
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
244
|
+
it('should handle high alignment with low coverage', async () => {
|
|
245
|
+
const testCase = testCases[10]!;
|
|
246
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
247
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
248
|
+
});
|
|
253
249
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
250
|
+
it('should handle low alignment with high coverage', async () => {
|
|
251
|
+
const testCase = testCases[11]!;
|
|
252
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
253
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
254
|
+
});
|
|
259
255
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
256
|
+
it('should handle single word summary', async () => {
|
|
257
|
+
const testCase = testCases[12]!;
|
|
258
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
259
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
260
|
+
});
|
|
265
261
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
{
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
);
|
|
262
|
+
it('should handle repetitive summary', async () => {
|
|
263
|
+
const testCase = testCases[13]!;
|
|
264
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
265
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
266
|
+
});
|
|
267
|
+
|
|
268
|
+
it('should handle overly verbose summary', async () => {
|
|
269
|
+
const testCase = testCases[14]!;
|
|
270
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
271
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
272
|
+
});
|
|
273
|
+
});
|