@mastra/evals 0.1.0-alpha.22 → 0.1.0-alpha.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 0.1.0-alpha.24
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Updated dependencies [73d112c]
|
|
8
|
+
- @mastra/core@0.1.27-alpha.82
|
|
9
|
+
|
|
10
|
+
## 0.1.0-alpha.23
|
|
11
|
+
|
|
12
|
+
### Patch Changes
|
|
13
|
+
|
|
14
|
+
- Updated dependencies [9fb3039]
|
|
15
|
+
- @mastra/core@0.1.27-alpha.81
|
|
16
|
+
|
|
3
17
|
## 0.1.0-alpha.22
|
|
4
18
|
|
|
5
19
|
### Patch Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.1.0-alpha.22",
|
|
3
|
+
"version": "0.1.0-alpha.24",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"sentiment": "^5.0.2",
|
|
39
39
|
"string-similarity": "^4.0.4",
|
|
40
40
|
"zod": "^3.24.1",
|
|
41
|
-
"@mastra/core": "0.1.27-alpha.
|
|
41
|
+
"@mastra/core": "0.1.27-alpha.82"
|
|
42
42
|
},
|
|
43
43
|
"devDependencies": {
|
|
44
44
|
"@babel/preset-env": "^7.26.0",
|
|
@@ -109,13 +109,13 @@ describe(
|
|
|
109
109
|
expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
|
|
110
110
|
});
|
|
111
111
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
112
|
+
it('should be able to measure a prompt with mostly relevant information', async () => {
|
|
113
|
+
const result = await metric.measure(testCases[1].input, testCases[1].output);
|
|
114
|
+
const expectedScore = testCases[1].expectedResult.score;
|
|
115
|
+
const difference = Math.abs(result.score - expectedScore);
|
|
116
116
|
|
|
117
|
-
|
|
118
|
-
|
|
117
|
+
expect(Math.round(difference * 10) / 10).toBeLessThanOrEqual(0.1);
|
|
118
|
+
});
|
|
119
119
|
|
|
120
120
|
it('should be able to measure a prompt with partial relevance', async () => {
|
|
121
121
|
const result = await metric.measure(testCases[2].input, testCases[2].output);
|
|
@@ -50,28 +50,26 @@ const modelConfig: ModelConfig = {
|
|
|
50
50
|
apiKey: process.env.OPENAI_API_KEY,
|
|
51
51
|
};
|
|
52
52
|
|
|
53
|
-
describe(
|
|
54
|
-
|
|
55
|
-
() => {
|
|
56
|
-
const metric = new BiasMetric(modelConfig);
|
|
53
|
+
describe('BiasMetric', () => {
|
|
54
|
+
const metric = new BiasMetric(modelConfig);
|
|
57
55
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
56
|
+
it('should be able to measure a prompt that is biased', async () => {
|
|
57
|
+
const result = await metric.measure(testCases[0].input, testCases[0].output);
|
|
58
|
+
expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
|
|
59
|
+
}, 10000);
|
|
62
60
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
61
|
+
it('should be able to measure a prompt that is almost not biased', async () => {
|
|
62
|
+
const result = await metric.measure(testCases[1].input, testCases[1].output);
|
|
63
|
+
expect(result.score).toBeLessThan(0.5);
|
|
64
|
+
}, 10000);
|
|
67
65
|
|
|
68
66
|
it('should be able to measure a prompt that is mildly biased but actually not', async () => {
|
|
69
67
|
const result = await metric.measure(testCases[2].input, testCases[2].output);
|
|
70
68
|
expect(result.score).toBe(0);
|
|
71
|
-
});
|
|
69
|
+
}, 10000);
|
|
72
70
|
|
|
73
71
|
it('should be able to measure a prompt that is mildly biased', async () => {
|
|
74
72
|
const result = await metric.measure(testCases[3].input, testCases[3].output);
|
|
75
73
|
expect(result.score).toBeLessThan(0.8);
|
|
76
|
-
});
|
|
74
|
+
}, 10000);
|
|
77
75
|
});
|