@mastra/evals 0.1.0-alpha.33 → 0.1.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -224
- package/jest.config.ts +21 -0
- package/package.json +26 -10
- package/src/evaluation.test.ts +16 -17
- package/src/evaluation.ts +11 -46
- package/src/index.ts +0 -1
- package/src/metrics/judge/index.ts +4 -5
- package/src/metrics/llm/answer-relevancy/index.test.ts +72 -42
- package/src/metrics/llm/answer-relevancy/index.ts +6 -9
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +4 -5
- package/src/metrics/llm/answer-relevancy/prompts.ts +28 -26
- package/src/metrics/llm/bias/index.test.ts +33 -17
- package/src/metrics/llm/bias/index.ts +4 -13
- package/src/metrics/llm/bias/metricJudge.ts +4 -20
- package/src/metrics/llm/bias/prompts.ts +0 -27
- package/src/metrics/llm/context-position/index.test.ts +107 -72
- package/src/metrics/llm/context-position/index.ts +14 -14
- package/src/metrics/llm/context-position/metricJudge.ts +3 -3
- package/src/metrics/llm/context-position/prompts.ts +36 -31
- package/src/metrics/llm/context-precision/index.test.ts +91 -62
- package/src/metrics/llm/context-precision/index.ts +14 -14
- package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
- package/src/metrics/llm/context-relevancy/index.test.ts +36 -27
- package/src/metrics/llm/context-relevancy/index.ts +13 -23
- package/src/metrics/llm/context-relevancy/metricJudge.ts +5 -19
- package/src/metrics/llm/context-relevancy/prompts.ts +0 -37
- package/src/metrics/llm/contextual-recall/index.test.ts +37 -29
- package/src/metrics/llm/contextual-recall/index.ts +13 -20
- package/src/metrics/llm/contextual-recall/metricJudge.ts +4 -19
- package/src/metrics/llm/contextual-recall/prompts.ts +1 -42
- package/src/metrics/llm/faithfulness/index.test.ts +107 -72
- package/src/metrics/llm/faithfulness/index.ts +15 -22
- package/src/metrics/llm/faithfulness/metricJudge.ts +13 -13
- package/src/metrics/llm/hallucination/index.test.ts +101 -67
- package/src/metrics/llm/hallucination/index.ts +15 -22
- package/src/metrics/llm/hallucination/metricJudge.ts +16 -14
- package/src/metrics/llm/hallucination/prompts.ts +35 -28
- package/src/metrics/llm/index.ts +0 -1
- package/src/metrics/llm/prompt-alignment/index.test.ts +71 -55
- package/src/metrics/llm/prompt-alignment/index.ts +7 -16
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +17 -13
- package/src/metrics/llm/summarization/index.test.ts +69 -25
- package/src/metrics/llm/summarization/index.ts +10 -19
- package/src/metrics/llm/summarization/metricJudge.ts +28 -15
- package/src/metrics/llm/summarization/prompts.ts +14 -52
- package/src/metrics/llm/toxicity/index.test.ts +29 -23
- package/src/metrics/llm/toxicity/index.ts +7 -10
- package/src/metrics/llm/toxicity/metricJudge.ts +7 -8
- package/src/metrics/llm/toxicity/prompts.ts +12 -5
- package/src/metrics/nlp/completeness/index.test.ts +20 -20
- package/src/metrics/nlp/completeness/index.ts +6 -14
- package/src/metrics/nlp/content-similarity/index.test.ts +48 -17
- package/src/metrics/nlp/content-similarity/index.ts +8 -15
- package/src/metrics/nlp/keyword-coverage/index.test.ts +60 -31
- package/src/metrics/nlp/keyword-coverage/index.ts +9 -10
- package/src/metrics/nlp/textual-difference/index.test.ts +62 -34
- package/src/metrics/nlp/textual-difference/index.ts +6 -12
- package/src/metrics/nlp/tone/index.test.ts +72 -49
- package/src/metrics/nlp/tone/index.ts +9 -16
- package/src/metrics/nlp/types.ts +13 -0
- package/tsconfig.json +10 -1
- package/README.md +0 -186
- package/dist/chunk-4VNS5WPM.js +0 -37
- package/dist/dist-XPBCCWOM.js +0 -17575
- package/dist/index.d.ts +0 -9
- package/dist/index.js +0 -73
- package/dist/magic-string.es-5UDOWOAZ.js +0 -1296
- package/dist/metrics/llm/index.d.ts +0 -139
- package/dist/metrics/llm/index.js +0 -2121
- package/dist/metrics/nlp/index.d.ts +0 -73
- package/dist/metrics/nlp/index.js +0 -189
- package/src/attachListeners.ts +0 -26
- package/src/constants.ts +0 -1
- package/src/metrics/llm/types.ts +0 -7
- package/vitest.config.ts +0 -11
package/CHANGELOG.md
CHANGED
|
@@ -1,229 +1,5 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
-
## 0.1.0-alpha.33
|
|
4
|
-
|
|
5
|
-
### Patch Changes
|
|
6
|
-
|
|
7
|
-
- Updated dependencies [d7d465a]
|
|
8
|
-
- Updated dependencies [d7d465a]
|
|
9
|
-
- Updated dependencies [2017553]
|
|
10
|
-
- Updated dependencies [a10b7a3]
|
|
11
|
-
- Updated dependencies [16e5b04]
|
|
12
|
-
- @mastra/core@0.2.0-alpha.91
|
|
13
|
-
|
|
14
|
-
## 0.1.0-alpha.32
|
|
15
|
-
|
|
16
|
-
### Patch Changes
|
|
17
|
-
|
|
18
|
-
- Updated dependencies [8151f44]
|
|
19
|
-
- Updated dependencies [e897f1c]
|
|
20
|
-
- Updated dependencies [3700be1]
|
|
21
|
-
- @mastra/core@0.2.0-alpha.90
|
|
22
|
-
|
|
23
|
-
## 0.1.0-alpha.31
|
|
24
|
-
|
|
25
|
-
### Patch Changes
|
|
26
|
-
|
|
27
|
-
- Updated dependencies [27275c9]
|
|
28
|
-
- @mastra/core@0.2.0-alpha.89
|
|
29
|
-
|
|
30
|
-
## 0.1.0-alpha.30
|
|
31
|
-
|
|
32
|
-
### Patch Changes
|
|
33
|
-
|
|
34
|
-
- Updated dependencies [ccbc581]
|
|
35
|
-
- @mastra/core@0.2.0-alpha.88
|
|
36
|
-
|
|
37
|
-
## 0.1.0-alpha.29
|
|
38
|
-
|
|
39
|
-
### Patch Changes
|
|
40
|
-
|
|
41
|
-
- Updated dependencies [7365b6c]
|
|
42
|
-
- @mastra/core@0.2.0-alpha.87
|
|
43
|
-
|
|
44
|
-
## 0.1.0-alpha.28
|
|
45
|
-
|
|
46
|
-
### Minor Changes
|
|
47
|
-
|
|
48
|
-
- 5916f9d: Update deps from fixed to ^
|
|
49
|
-
|
|
50
|
-
### Patch Changes
|
|
51
|
-
|
|
52
|
-
- Updated dependencies [6fa4bd2]
|
|
53
|
-
- Updated dependencies [e2e76de]
|
|
54
|
-
- Updated dependencies [7f24c29]
|
|
55
|
-
- Updated dependencies [67637ba]
|
|
56
|
-
- Updated dependencies [04f3171]
|
|
57
|
-
- @mastra/core@0.2.0-alpha.86
|
|
58
|
-
|
|
59
|
-
## 0.1.0-alpha.27
|
|
60
|
-
|
|
61
|
-
### Patch Changes
|
|
62
|
-
|
|
63
|
-
- Updated dependencies [e9d1b47]
|
|
64
|
-
- @mastra/core@0.2.0-alpha.85
|
|
65
|
-
|
|
66
|
-
## 0.1.0-alpha.26
|
|
67
|
-
|
|
68
|
-
### Patch Changes
|
|
69
|
-
|
|
70
|
-
- Updated dependencies [2f17a5f]
|
|
71
|
-
- Updated dependencies [cb290ee]
|
|
72
|
-
- Updated dependencies [b4d7416]
|
|
73
|
-
- Updated dependencies [38b7f66]
|
|
74
|
-
- @mastra/core@0.2.0-alpha.84
|
|
75
|
-
|
|
76
|
-
## 0.1.0-alpha.25
|
|
77
|
-
|
|
78
|
-
### Patch Changes
|
|
79
|
-
|
|
80
|
-
- 9625602: Use mastra core splitted bundles in other packages
|
|
81
|
-
- 8769a62: Split core into seperate entry fils
|
|
82
|
-
- Updated dependencies [30322ce]
|
|
83
|
-
- Updated dependencies [78eec7c]
|
|
84
|
-
- Updated dependencies [9625602]
|
|
85
|
-
- Updated dependencies [8769a62]
|
|
86
|
-
- @mastra/core@0.2.0-alpha.83
|
|
87
|
-
|
|
88
|
-
## 0.1.0-alpha.24
|
|
89
|
-
|
|
90
|
-
### Patch Changes
|
|
91
|
-
|
|
92
|
-
- Updated dependencies [73d112c]
|
|
93
|
-
- @mastra/core@0.1.27-alpha.82
|
|
94
|
-
|
|
95
|
-
## 0.1.0-alpha.23
|
|
96
|
-
|
|
97
|
-
### Patch Changes
|
|
98
|
-
|
|
99
|
-
- Updated dependencies [9fb3039]
|
|
100
|
-
- @mastra/core@0.1.27-alpha.81
|
|
101
|
-
|
|
102
|
-
## 0.1.0-alpha.22
|
|
103
|
-
|
|
104
|
-
### Patch Changes
|
|
105
|
-
|
|
106
|
-
- cb2e997: Bundle evals package with tsup
|
|
107
|
-
|
|
108
|
-
## 0.1.0-alpha.21
|
|
109
|
-
|
|
110
|
-
### Patch Changes
|
|
111
|
-
|
|
112
|
-
- Updated dependencies [327ece7]
|
|
113
|
-
- @mastra/core@0.1.27-alpha.80
|
|
114
|
-
|
|
115
|
-
## 0.1.0-alpha.20
|
|
116
|
-
|
|
117
|
-
### Patch Changes
|
|
118
|
-
|
|
119
|
-
- Updated dependencies [21fe536]
|
|
120
|
-
- @mastra/core@0.1.27-alpha.79
|
|
121
|
-
|
|
122
|
-
## 0.1.0-alpha.19
|
|
123
|
-
|
|
124
|
-
### Patch Changes
|
|
125
|
-
|
|
126
|
-
- Updated dependencies [685108a]
|
|
127
|
-
- Updated dependencies [685108a]
|
|
128
|
-
- @mastra/core@0.1.27-alpha.78
|
|
129
|
-
|
|
130
|
-
## 0.1.0-alpha.18
|
|
131
|
-
|
|
132
|
-
### Patch Changes
|
|
133
|
-
|
|
134
|
-
- Updated dependencies [8105fae]
|
|
135
|
-
- @mastra/core@0.1.27-alpha.77
|
|
136
|
-
|
|
137
|
-
## 0.1.0-alpha.17
|
|
138
|
-
|
|
139
|
-
### Patch Changes
|
|
140
|
-
|
|
141
|
-
- Updated dependencies [ae7bf94]
|
|
142
|
-
- Updated dependencies [ae7bf94]
|
|
143
|
-
- @mastra/core@0.1.27-alpha.76
|
|
144
|
-
|
|
145
|
-
## 0.1.0-alpha.16
|
|
146
|
-
|
|
147
|
-
### Patch Changes
|
|
148
|
-
|
|
149
|
-
- Updated dependencies [23dcb23]
|
|
150
|
-
- @mastra/core@0.1.27-alpha.75
|
|
151
|
-
|
|
152
|
-
## 0.1.0-alpha.15
|
|
153
|
-
|
|
154
|
-
### Patch Changes
|
|
155
|
-
|
|
156
|
-
- Updated dependencies [7b87567]
|
|
157
|
-
- @mastra/core@0.1.27-alpha.74
|
|
158
|
-
|
|
159
|
-
## 0.1.0-alpha.14
|
|
160
|
-
|
|
161
|
-
### Patch Changes
|
|
162
|
-
|
|
163
|
-
- Updated dependencies [3427b95]
|
|
164
|
-
- @mastra/core@0.1.27-alpha.73
|
|
165
|
-
|
|
166
|
-
## 0.1.0-alpha.13
|
|
167
|
-
|
|
168
|
-
### Patch Changes
|
|
169
|
-
|
|
170
|
-
- 06b2c0a: Update summarization prompt and fix eval input
|
|
171
|
-
- Updated dependencies [e4d4ede]
|
|
172
|
-
- Updated dependencies [06b2c0a]
|
|
173
|
-
- @mastra/core@0.1.27-alpha.72
|
|
174
|
-
|
|
175
|
-
## 0.1.0-alpha.12
|
|
176
|
-
|
|
177
|
-
### Patch Changes
|
|
178
|
-
|
|
179
|
-
- Updated dependencies [d9c8dd0]
|
|
180
|
-
- @mastra/core@0.1.27-alpha.71
|
|
181
|
-
|
|
182
|
-
## 0.1.0-alpha.11
|
|
183
|
-
|
|
184
|
-
### Patch Changes
|
|
185
|
-
|
|
186
|
-
- bdaf834: publish packages
|
|
187
|
-
|
|
188
|
-
## 0.1.0-alpha.10
|
|
189
|
-
|
|
190
|
-
### Patch Changes
|
|
191
|
-
|
|
192
|
-
- Updated dependencies [dd6d87f]
|
|
193
|
-
- Updated dependencies [04434b6]
|
|
194
|
-
- @mastra/core@0.1.27-alpha.70
|
|
195
|
-
|
|
196
|
-
## 0.1.0-alpha.9
|
|
197
|
-
|
|
198
|
-
### Patch Changes
|
|
199
|
-
|
|
200
|
-
- 1944807: Unified logger and major step in better logs
|
|
201
|
-
- 9ade36e: Changed measure for evals, added endpoints, attached metrics to agent, added ui for evals in playground, and updated docs
|
|
202
|
-
- Updated dependencies [1944807]
|
|
203
|
-
- Updated dependencies [9ade36e]
|
|
204
|
-
- @mastra/core@0.1.27-alpha.69
|
|
205
|
-
|
|
206
|
-
## 0.1.0-alpha.8
|
|
207
|
-
|
|
208
|
-
### Patch Changes
|
|
209
|
-
|
|
210
|
-
- Updated dependencies [0be7181]
|
|
211
|
-
- Updated dependencies [0be7181]
|
|
212
|
-
- @mastra/core@0.1.27-alpha.68
|
|
213
|
-
|
|
214
|
-
## 0.1.0-alpha.7
|
|
215
|
-
|
|
216
|
-
### Patch Changes
|
|
217
|
-
|
|
218
|
-
- Updated dependencies [c8ff2f5]
|
|
219
|
-
- @mastra/core@0.1.27-alpha.67
|
|
220
|
-
|
|
221
|
-
## 0.1.0-alpha.6
|
|
222
|
-
|
|
223
|
-
### Patch Changes
|
|
224
|
-
|
|
225
|
-
- aea3c13: Fix evals export for llm and nlp
|
|
226
|
-
|
|
227
3
|
## 0.1.0-alpha.5
|
|
228
4
|
|
|
229
5
|
### Minor Changes
|
package/jest.config.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { config } from 'dotenv';
|
|
2
|
+
|
|
3
|
+
config();
|
|
4
|
+
|
|
5
|
+
export default {
|
|
6
|
+
maxWorkers: 1,
|
|
7
|
+
preset: 'ts-jest',
|
|
8
|
+
extensionsToTreatAsEsm: ['.ts'],
|
|
9
|
+
moduleNameMapper: {
|
|
10
|
+
'^(\\.{1,2}/.*)\\.js$': '$1',
|
|
11
|
+
},
|
|
12
|
+
transform: {
|
|
13
|
+
'^.+\\.tsx?$': [
|
|
14
|
+
'ts-jest',
|
|
15
|
+
{
|
|
16
|
+
useESM: true,
|
|
17
|
+
isolatedModules: true,
|
|
18
|
+
},
|
|
19
|
+
],
|
|
20
|
+
},
|
|
21
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.1.0-alpha.33",
|
|
3
|
+
"version": "0.1.0-alpha.5",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -11,18 +11,30 @@
|
|
|
11
11
|
"import": {
|
|
12
12
|
"types": "./dist/index.d.ts",
|
|
13
13
|
"default": "./dist/evals.esm.js"
|
|
14
|
+
},
|
|
15
|
+
"require": {
|
|
16
|
+
"types": "./dist/index.d.ts",
|
|
17
|
+
"default": "./dist/index.js"
|
|
14
18
|
}
|
|
15
19
|
},
|
|
16
20
|
"./nlp": {
|
|
17
21
|
"import": {
|
|
18
22
|
"types": "./dist/metrics/nlp/index.d.ts",
|
|
19
|
-
"default": "./dist/
|
|
23
|
+
"default": "./dist/evals.esm.js"
|
|
24
|
+
},
|
|
25
|
+
"require": {
|
|
26
|
+
"types": "./dist/metrics/nlp/index.d.ts",
|
|
27
|
+
"default": "./dist/index.js"
|
|
20
28
|
}
|
|
21
29
|
},
|
|
22
30
|
"./llm": {
|
|
23
31
|
"import": {
|
|
24
32
|
"types": "./dist/metrics/llm/index.d.ts",
|
|
25
|
-
"default": "./dist/
|
|
33
|
+
"default": "./dist/evals.esm.js"
|
|
34
|
+
},
|
|
35
|
+
"require": {
|
|
36
|
+
"types": "./dist/metrics/llm/index.d.ts",
|
|
37
|
+
"default": "./dist/index.js"
|
|
26
38
|
}
|
|
27
39
|
},
|
|
28
40
|
"./package.json": "./package.json"
|
|
@@ -38,21 +50,25 @@
|
|
|
38
50
|
"sentiment": "^5.0.2",
|
|
39
51
|
"string-similarity": "^4.0.4",
|
|
40
52
|
"zod": "^3.24.1",
|
|
41
|
-
"@mastra/core": "
|
|
53
|
+
"@mastra/core": "0.1.27-alpha.66"
|
|
42
54
|
},
|
|
43
55
|
"devDependencies": {
|
|
56
|
+
"@babel/preset-env": "^7.26.0",
|
|
57
|
+
"@babel/preset-typescript": "^7.26.0",
|
|
58
|
+
"@jest/globals": "^29.7.0",
|
|
44
59
|
"@tsconfig/recommended": "^1.0.7",
|
|
45
60
|
"@types/difflib": "^0.2.7",
|
|
46
61
|
"@types/fs-extra": "^11.0.4",
|
|
62
|
+
"@types/jest": "^29.5.12",
|
|
47
63
|
"@types/sentiment": "^5.0.4",
|
|
48
64
|
"@types/string-similarity": "^4.0.2",
|
|
49
|
-
"
|
|
50
|
-
"
|
|
65
|
+
"dts-cli": "^2.0.5",
|
|
66
|
+
"jest": "^29.7.0",
|
|
67
|
+
"ts-jest": "^29.2.5"
|
|
51
68
|
},
|
|
52
69
|
"scripts": {
|
|
53
|
-
"
|
|
54
|
-
"build": "
|
|
55
|
-
"
|
|
56
|
-
"test": "vitest"
|
|
70
|
+
"build": "dts build",
|
|
71
|
+
"build:dev": "dts watch",
|
|
72
|
+
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
|
|
57
73
|
}
|
|
58
74
|
}
|
package/src/evaluation.test.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import { Agent } from '@mastra/core
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import { describe, expect, it } from 'vitest';
|
|
1
|
+
import { Agent } from '@mastra/core';
|
|
2
|
+
import { ModelConfig } from '@mastra/core';
|
|
3
|
+
import { Metric } from '@mastra/core';
|
|
5
4
|
|
|
6
5
|
import { evaluate } from './evaluation';
|
|
7
6
|
|
|
@@ -14,20 +13,20 @@ class TestMetric extends Metric {
|
|
|
14
13
|
}
|
|
15
14
|
}
|
|
16
15
|
|
|
17
|
-
const
|
|
16
|
+
const modelConfig: ModelConfig = {
|
|
17
|
+
provider: 'OPEN_AI',
|
|
18
18
|
name: 'gpt-4o',
|
|
19
|
-
|
|
19
|
+
toolChoice: 'auto',
|
|
20
|
+
};
|
|
20
21
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
});
|
|
22
|
+
it.skip('should get a text response from the agent', async () => {
|
|
23
|
+
const electionAgent = new Agent({
|
|
24
|
+
name: 'US Election agent',
|
|
25
|
+
instructions: 'You know about the past US elections',
|
|
26
|
+
model: modelConfig,
|
|
27
|
+
});
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
});
|
|
31
|
+
expect(result.score).toBe(1);
|
|
32
|
+
}, 10000);
|
package/src/evaluation.ts
CHANGED
|
@@ -1,55 +1,20 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import { GLOBAL_RUN_ID_ENV_KEY } from './constants';
|
|
1
|
+
import { AvailableHooks, executeHook } from '@mastra/core';
|
|
2
|
+
import { type Agent, type Metric } from '@mastra/core';
|
|
4
3
|
|
|
5
4
|
export async function evaluate<T extends Agent>(agent: T, input: Parameters<T['generate']>[0], metric: Metric) {
|
|
6
|
-
const
|
|
7
|
-
let globalRunId = process.env[GLOBAL_RUN_ID_ENV_KEY];
|
|
8
|
-
const runId = crypto.randomUUID();
|
|
9
|
-
const agentOutput = await agent.generate(input, {
|
|
10
|
-
runId,
|
|
11
|
-
});
|
|
5
|
+
const agentOutput = await agent.generate(input);
|
|
12
6
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
}
|
|
7
|
+
const metricResult = await metric.measure({
|
|
8
|
+
input: input.toString(),
|
|
9
|
+
output: agentOutput.text,
|
|
10
|
+
});
|
|
17
11
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
input,
|
|
21
|
-
metric,
|
|
12
|
+
// capture infomration about the evaluation
|
|
13
|
+
executeHook(AvailableHooks.ON_EVALUATION, {
|
|
14
|
+
input: input.toString(),
|
|
22
15
|
output: agentOutput.text,
|
|
23
|
-
|
|
24
|
-
runId,
|
|
25
|
-
testInfo,
|
|
16
|
+
result: metricResult,
|
|
26
17
|
});
|
|
27
18
|
|
|
28
19
|
return metricResult;
|
|
29
20
|
}
|
|
30
|
-
|
|
31
|
-
export const getCurrentTestInfo = async () => {
|
|
32
|
-
// Jest
|
|
33
|
-
// @ts-ignore
|
|
34
|
-
if (typeof expect !== 'undefined' && expect.getState) {
|
|
35
|
-
// @ts-ignore
|
|
36
|
-
const state = expect.getState();
|
|
37
|
-
return {
|
|
38
|
-
testName: state.currentTestName,
|
|
39
|
-
testPath: state.testPath,
|
|
40
|
-
};
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
try {
|
|
44
|
-
const vitest = await import('vitest');
|
|
45
|
-
if (typeof vitest !== 'undefined' && vitest.expect?.getState) {
|
|
46
|
-
const state = vitest.expect.getState();
|
|
47
|
-
return {
|
|
48
|
-
testName: state.currentTestName,
|
|
49
|
-
testPath: state.testPath,
|
|
50
|
-
};
|
|
51
|
-
}
|
|
52
|
-
} catch {}
|
|
53
|
-
|
|
54
|
-
return null;
|
|
55
|
-
};
|
package/src/index.ts
CHANGED
|
package/src/metrics/judge/index.ts
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
import { Agent } from '@mastra/core
|
|
2
|
-
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
1
|
+
import { Agent, ModelConfig } from '@mastra/core';
|
|
3
2
|
|
|
4
3
|
export abstract class MastraAgentJudge {
|
|
5
4
|
protected readonly agent: Agent;
|
|
6
5
|
|
|
7
|
-
constructor(name: string, instructions: string,
|
|
6
|
+
constructor(name: string, instructions: string, model: ModelConfig) {
|
|
8
7
|
this.agent = new Agent({
|
|
9
|
-
name: `Mastra Eval Judge ${
|
|
8
|
+
name: `Mastra Eval Judge ${model.provider} ${name}`,
|
|
10
9
|
instructions: instructions,
|
|
11
|
-
|
|
10
|
+
model,
|
|
12
11
|
});
|
|
13
12
|
}
|
|
14
13
|
}
|
|
package/src/metrics/llm/answer-relevancy/index.test.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
|
+
import { type ModelConfig } from '@mastra/core';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
5
5
|
|
|
@@ -91,65 +91,95 @@ const testCases: TestCase[] = [
|
|
|
91
91
|
];
|
|
92
92
|
|
|
93
93
|
const SECONDS = 10000;
|
|
94
|
+
jest.setTimeout(15 * SECONDS);
|
|
94
95
|
|
|
95
|
-
const
|
|
96
|
+
const modelConfig: ModelConfig = {
|
|
97
|
+
provider: 'OPEN_AI',
|
|
96
98
|
name: 'gpt-4o',
|
|
97
|
-
|
|
99
|
+
toolChoice: 'auto',
|
|
100
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
101
|
+
};
|
|
98
102
|
|
|
99
|
-
describe(
|
|
100
|
-
|
|
101
|
-
() => {
|
|
102
|
-
const metric = new AnswerRelevancyMetric(llm);
|
|
103
|
+
describe('AnswerRelevancyMetric', () => {
|
|
104
|
+
const metric = new AnswerRelevancyMetric(modelConfig);
|
|
103
105
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
106
|
+
it('should be able to measure a prompt with perfect relevancy', async () => {
|
|
107
|
+
const result = await metric.measure({
|
|
108
|
+
input: testCases[0].input,
|
|
109
|
+
output: testCases[0].output,
|
|
107
110
|
});
|
|
108
111
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
const expectedScore = testCases[1].expectedResult.score;
|
|
112
|
-
const difference = Math.abs(result.score - expectedScore);
|
|
112
|
+
expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
|
|
113
|
+
});
|
|
113
114
|
|
|
114
|
-
|
|
115
|
+
it('should be able to measure a prompt with mostly relevant information', async () => {
|
|
116
|
+
const result = await metric.measure({
|
|
117
|
+
input: testCases[1].input,
|
|
118
|
+
output: testCases[1].output,
|
|
115
119
|
});
|
|
116
120
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
121
|
+
expect(result.score).toBeCloseTo(testCases[1].expectedResult.score, 1);
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
it('should be able to measure a prompt with partial relevance', async () => {
|
|
125
|
+
const result = await metric.measure({
|
|
126
|
+
input: testCases[2].input,
|
|
127
|
+
output: testCases[2].output,
|
|
120
128
|
});
|
|
121
129
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
130
|
+
expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
it('should be able to measure a prompt with low relevance', async () => {
|
|
134
|
+
const result = await metric.measure({
|
|
135
|
+
input: testCases[3].input,
|
|
136
|
+
output: testCases[3].output,
|
|
125
137
|
});
|
|
126
138
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
139
|
+
expect(result.score).toBeCloseTo(testCases[3].expectedResult.score, 1);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
it('should be able to measure a prompt with empty output', async () => {
|
|
143
|
+
const result = await metric.measure({
|
|
144
|
+
input: testCases[5].input,
|
|
145
|
+
output: testCases[5].output,
|
|
130
146
|
});
|
|
131
147
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
148
|
+
expect(result.score).toBeCloseTo(testCases[5].expectedResult.score, 1);
|
|
149
|
+
});
|
|
150
|
+
|
|
151
|
+
it('should be able to measure a prompt with incorrect but relevant answer', async () => {
|
|
152
|
+
const result = await metric.measure({
|
|
153
|
+
input: testCases[6].input,
|
|
154
|
+
output: testCases[6].output,
|
|
135
155
|
});
|
|
156
|
+
expect(result.score).toBeCloseTo(testCases[6].expectedResult.score, 1);
|
|
157
|
+
});
|
|
136
158
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
159
|
+
it('should be able to measure a prompt with a single word correct answer', async () => {
|
|
160
|
+
const result = await metric.measure({
|
|
161
|
+
input: testCases[7].input,
|
|
162
|
+
output: testCases[7].output,
|
|
140
163
|
});
|
|
141
164
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
165
|
+
expect(result.score).toBeCloseTo(testCases[7].expectedResult.score, 1);
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
it('should be able to measure a prompt with multiple questions', async () => {
|
|
169
|
+
const result = await metric.measure({
|
|
170
|
+
input: testCases[8].input,
|
|
171
|
+
output: testCases[8].output,
|
|
145
172
|
});
|
|
146
173
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
174
|
+
expect(result.score).toBeCloseTo(testCases[8].expectedResult.score, 1);
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
it('should be able to measure a prompt with technical gibberish', async () => {
|
|
178
|
+
const result = await metric.measure({
|
|
179
|
+
input: testCases[9].input,
|
|
180
|
+
output: testCases[9].output,
|
|
150
181
|
});
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
);
|
|
182
|
+
|
|
183
|
+
expect(result.score).toBeCloseTo(testCases[9].expectedResult.score, 1);
|
|
184
|
+
});
|
|
185
|
+
});
|
|
package/src/metrics/llm/answer-relevancy/index.ts
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
import { Metric } from '@mastra/core
|
|
2
|
-
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
1
|
+
import { Metric, MetricResult, ModelConfig } from '@mastra/core';
|
|
3
2
|
|
|
4
|
-
import { type MetricResultWithReason } from '../types';
|
|
5
3
|
import { roundToTwoDecimals } from '../utils';
|
|
6
4
|
|
|
7
5
|
import { AnswerRelevancyJudge } from './metricJudge';
|
|
@@ -16,24 +14,23 @@ export class AnswerRelevancyMetric extends Metric {
|
|
|
16
14
|
private uncertaintyWeight: number;
|
|
17
15
|
private scale: number;
|
|
18
16
|
|
|
19
|
-
constructor(
|
|
17
|
+
constructor(model: ModelConfig, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
|
|
20
18
|
super();
|
|
21
19
|
|
|
22
20
|
this.uncertaintyWeight = uncertaintyWeight;
|
|
23
|
-
this.judge = new AnswerRelevancyJudge(
|
|
21
|
+
this.judge = new AnswerRelevancyJudge(model);
|
|
24
22
|
this.scale = scale;
|
|
25
23
|
}
|
|
26
24
|
|
|
27
|
-
async measure(input: string
|
|
25
|
+
async measure({ input, output }: { input: string; output: string }): Promise<MetricResult> {
|
|
28
26
|
const verdicts = await this.judge.evaluate(input, output);
|
|
29
27
|
const score = this.calculateScore(verdicts);
|
|
28
|
+
|
|
30
29
|
const reason = await this.judge.getReason(input, output, score, this.scale, verdicts);
|
|
31
30
|
|
|
32
31
|
return {
|
|
33
32
|
score,
|
|
34
|
-
|
|
35
|
-
reason,
|
|
36
|
-
},
|
|
33
|
+
reason,
|
|
37
34
|
};
|
|
38
35
|
}
|
|
39
36
|
|
|
package/src/metrics/llm/answer-relevancy/metricJudge.ts
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ModelConfig } from '@mastra/core';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
5
5
|
|
|
6
6
|
import {
|
|
7
7
|
generateEvaluatePrompt,
|
|
8
|
+
generateReasonPrompt,
|
|
8
9
|
ANSWER_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
9
10
|
generateEvaluationStatementsPrompt,
|
|
10
|
-
generateReasonPrompt,
|
|
11
11
|
} from './prompts';
|
|
12
12
|
|
|
13
13
|
export class AnswerRelevancyJudge extends MastraAgentJudge {
|
|
14
|
-
constructor(
|
|
15
|
-
super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
14
|
+
constructor(model: ModelConfig) {
|
|
15
|
+
super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
|
|
@@ -50,7 +50,6 @@ export class AnswerRelevancyJudge extends MastraAgentJudge {
|
|
|
50
50
|
reason: z.string(),
|
|
51
51
|
}),
|
|
52
52
|
});
|
|
53
|
-
|
|
54
53
|
return result.object.reason;
|
|
55
54
|
}
|
|
56
55
|
}
|