@mastra/evals 0.1.0-alpha.5 → 0.1.0-alpha.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +372 -0
- package/README.md +185 -0
- package/dist/_tsup-dts-rollup.d.ts +723 -0
- package/dist/chunk-4VNS5WPM.js +37 -0
- package/dist/dist-56AYDN4X.js +17575 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +87 -0
- package/dist/magic-string.es-5UDOWOAZ.js +1296 -0
- package/dist/metrics/llm/index.d.ts +10 -0
- package/dist/metrics/llm/index.js +2121 -0
- package/dist/metrics/nlp/index.d.ts +5 -0
- package/dist/metrics/nlp/index.js +189 -0
- package/package.json +16 -28
- package/src/attachListeners.ts +40 -0
- package/src/constants.ts +1 -0
- package/src/evaluation.test.ts +15 -18
- package/src/evaluation.ts +48 -11
- package/src/index.ts +1 -0
- package/src/metrics/judge/index.ts +4 -3
- package/src/metrics/llm/answer-relevancy/index.test.ts +44 -74
- package/src/metrics/llm/answer-relevancy/index.ts +8 -5
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +4 -3
- package/src/metrics/llm/answer-relevancy/prompts.ts +26 -28
- package/src/metrics/llm/bias/index.test.ts +19 -34
- package/src/metrics/llm/bias/index.ts +13 -4
- package/src/metrics/llm/bias/metricJudge.ts +20 -4
- package/src/metrics/llm/bias/prompts.ts +27 -0
- package/src/metrics/llm/context-position/index.test.ts +98 -108
- package/src/metrics/llm/context-position/index.ts +13 -13
- package/src/metrics/llm/context-position/metricJudge.ts +2 -2
- package/src/metrics/llm/context-position/prompts.ts +31 -36
- package/src/metrics/llm/context-precision/index.test.ts +72 -100
- package/src/metrics/llm/context-precision/index.ts +13 -13
- package/src/metrics/llm/context-precision/metricJudge.ts +2 -2
- package/src/metrics/llm/context-relevancy/index.test.ts +28 -36
- package/src/metrics/llm/context-relevancy/index.ts +22 -12
- package/src/metrics/llm/context-relevancy/metricJudge.ts +20 -6
- package/src/metrics/llm/context-relevancy/prompts.ts +37 -0
- package/src/metrics/llm/contextual-recall/index.test.ts +30 -37
- package/src/metrics/llm/contextual-recall/index.ts +19 -12
- package/src/metrics/llm/contextual-recall/metricJudge.ts +19 -4
- package/src/metrics/llm/contextual-recall/prompts.ts +42 -1
- package/src/metrics/llm/faithfulness/index.test.ts +71 -109
- package/src/metrics/llm/faithfulness/index.ts +21 -14
- package/src/metrics/llm/faithfulness/metricJudge.ts +12 -12
- package/src/metrics/llm/hallucination/index.test.ts +66 -104
- package/src/metrics/llm/hallucination/index.ts +21 -14
- package/src/metrics/llm/hallucination/metricJudge.ts +13 -15
- package/src/metrics/llm/hallucination/prompts.ts +28 -35
- package/src/metrics/llm/index.ts +1 -0
- package/src/metrics/llm/prompt-alignment/index.test.ts +59 -74
- package/src/metrics/llm/prompt-alignment/index.ts +15 -6
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +12 -16
- package/src/metrics/llm/summarization/index.test.ts +33 -75
- package/src/metrics/llm/summarization/index.ts +18 -9
- package/src/metrics/llm/summarization/metricJudge.ts +14 -27
- package/src/metrics/llm/summarization/prompts.ts +52 -14
- package/src/metrics/llm/toxicity/index.test.ts +22 -31
- package/src/metrics/llm/toxicity/index.ts +10 -7
- package/src/metrics/llm/toxicity/metricJudge.ts +7 -6
- package/src/metrics/llm/toxicity/prompts.ts +5 -12
- package/src/metrics/llm/types.ts +7 -0
- package/src/metrics/nlp/completeness/index.test.ts +20 -20
- package/src/metrics/nlp/completeness/index.ts +14 -6
- package/src/metrics/nlp/content-similarity/index.test.ts +17 -48
- package/src/metrics/nlp/content-similarity/index.ts +15 -8
- package/src/metrics/nlp/keyword-coverage/index.test.ts +31 -60
- package/src/metrics/nlp/keyword-coverage/index.ts +10 -9
- package/src/metrics/nlp/textual-difference/index.test.ts +34 -62
- package/src/metrics/nlp/textual-difference/index.ts +12 -6
- package/src/metrics/nlp/tone/index.test.ts +49 -72
- package/src/metrics/nlp/tone/index.ts +16 -9
- package/tsconfig.json +1 -10
- package/vitest.config.ts +11 -0
- package/jest.config.ts +0 -21
- package/src/metrics/nlp/types.ts +0 -13
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,377 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 0.1.0-alpha.50
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Updated dependencies [5ee67d3]
|
|
8
|
+
- Updated dependencies [95a4697]
|
|
9
|
+
- @mastra/core@0.2.0-alpha.108
|
|
10
|
+
|
|
11
|
+
## 0.1.0-alpha.49
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- Updated dependencies [66a5392]
|
|
16
|
+
- @mastra/core@0.2.0-alpha.107
|
|
17
|
+
|
|
18
|
+
## 0.1.0-alpha.48
|
|
19
|
+
|
|
20
|
+
### Patch Changes
|
|
21
|
+
|
|
22
|
+
- a8a459a: Updated Evals table UI
|
|
23
|
+
- Updated dependencies [6f2c0f5]
|
|
24
|
+
- Updated dependencies [a8a459a]
|
|
25
|
+
- @mastra/core@0.2.0-alpha.106
|
|
26
|
+
|
|
27
|
+
## 0.1.0-alpha.47
|
|
28
|
+
|
|
29
|
+
### Patch Changes
|
|
30
|
+
|
|
31
|
+
- Updated dependencies [1420ae2]
|
|
32
|
+
- Updated dependencies [99f1847]
|
|
33
|
+
- @mastra/core@0.2.0-alpha.105
|
|
34
|
+
|
|
35
|
+
## 0.1.0-alpha.46
|
|
36
|
+
|
|
37
|
+
### Patch Changes
|
|
38
|
+
|
|
39
|
+
- 5fdc87c: Update evals storage in attachListeners
|
|
40
|
+
- b97ca96: Tracing into default storage
|
|
41
|
+
- 72d1990: Updated evals table schema
|
|
42
|
+
- Updated dependencies [5fdc87c]
|
|
43
|
+
- Updated dependencies [b97ca96]
|
|
44
|
+
- Updated dependencies [72d1990]
|
|
45
|
+
- Updated dependencies [cf6d825]
|
|
46
|
+
- Updated dependencies [10870bc]
|
|
47
|
+
- @mastra/core@0.2.0-alpha.104
|
|
48
|
+
|
|
49
|
+
## 0.1.0-alpha.45
|
|
50
|
+
|
|
51
|
+
### Patch Changes
|
|
52
|
+
|
|
53
|
+
- Updated dependencies [4534e77]
|
|
54
|
+
- @mastra/core@0.2.0-alpha.103
|
|
55
|
+
|
|
56
|
+
## 0.1.0-alpha.44
|
|
57
|
+
|
|
58
|
+
### Patch Changes
|
|
59
|
+
|
|
60
|
+
- Updated dependencies [a9345f9]
|
|
61
|
+
- @mastra/core@0.2.0-alpha.102
|
|
62
|
+
|
|
63
|
+
## 0.1.0-alpha.43
|
|
64
|
+
|
|
65
|
+
### Patch Changes
|
|
66
|
+
|
|
67
|
+
- 4f1d1a1: Enforce types and clean up package.json
|
|
68
|
+
- Updated dependencies [66a03ec]
|
|
69
|
+
- Updated dependencies [4f1d1a1]
|
|
70
|
+
- @mastra/core@0.2.0-alpha.101
|
|
71
|
+
|
|
72
|
+
## 0.1.0-alpha.42
|
|
73
|
+
|
|
74
|
+
### Patch Changes
|
|
75
|
+
|
|
76
|
+
- Updated dependencies [9d1796d]
|
|
77
|
+
- @mastra/core@0.2.0-alpha.100
|
|
78
|
+
|
|
79
|
+
## 0.1.0-alpha.41
|
|
80
|
+
|
|
81
|
+
### Patch Changes
|
|
82
|
+
|
|
83
|
+
- Updated dependencies [7d83b92]
|
|
84
|
+
- @mastra/core@0.2.0-alpha.99
|
|
85
|
+
|
|
86
|
+
## 0.1.0-alpha.40
|
|
87
|
+
|
|
88
|
+
### Patch Changes
|
|
89
|
+
|
|
90
|
+
- 70dabd9: Fix broken publish
|
|
91
|
+
- 202d404: Added instructions when generating evals
|
|
92
|
+
- Updated dependencies [70dabd9]
|
|
93
|
+
- Updated dependencies [202d404]
|
|
94
|
+
- @mastra/core@0.2.0-alpha.98
|
|
95
|
+
|
|
96
|
+
## 0.1.0-alpha.39
|
|
97
|
+
|
|
98
|
+
### Patch Changes
|
|
99
|
+
|
|
100
|
+
- 7892533: Updated test evals to use Mastra Storage
|
|
101
|
+
- d641d91: Fix exports for @mastra/evals
|
|
102
|
+
- Updated dependencies [07c069d]
|
|
103
|
+
- Updated dependencies [7892533]
|
|
104
|
+
- Updated dependencies [e6d8055]
|
|
105
|
+
- Updated dependencies [5950de5]
|
|
106
|
+
- Updated dependencies [df843d3]
|
|
107
|
+
- Updated dependencies [a870123]
|
|
108
|
+
- @mastra/core@0.2.0-alpha.97
|
|
109
|
+
|
|
110
|
+
## 0.1.0-alpha.38
|
|
111
|
+
|
|
112
|
+
### Patch Changes
|
|
113
|
+
|
|
114
|
+
- Updated dependencies [74b3078]
|
|
115
|
+
- @mastra/core@0.2.0-alpha.96
|
|
116
|
+
|
|
117
|
+
## 0.1.0-alpha.37
|
|
118
|
+
|
|
119
|
+
### Patch Changes
|
|
120
|
+
|
|
121
|
+
- Updated dependencies [9fb59d6]
|
|
122
|
+
- @mastra/core@0.2.0-alpha.95
|
|
123
|
+
|
|
124
|
+
## 0.1.0-alpha.36
|
|
125
|
+
|
|
126
|
+
### Minor Changes
|
|
127
|
+
|
|
128
|
+
- 8b416d9: Breaking changes
|
|
129
|
+
|
|
130
|
+
### Patch Changes
|
|
131
|
+
|
|
132
|
+
- 9c10484: update all packages
|
|
133
|
+
- Updated dependencies [9c10484]
|
|
134
|
+
- Updated dependencies [8b416d9]
|
|
135
|
+
- @mastra/core@0.2.0-alpha.94
|
|
136
|
+
|
|
137
|
+
## 0.1.0-alpha.35
|
|
138
|
+
|
|
139
|
+
### Patch Changes
|
|
140
|
+
|
|
141
|
+
- Updated dependencies [5285356]
|
|
142
|
+
- @mastra/core@0.2.0-alpha.93
|
|
143
|
+
|
|
144
|
+
## 0.1.0-alpha.34
|
|
145
|
+
|
|
146
|
+
### Patch Changes
|
|
147
|
+
|
|
148
|
+
- Updated dependencies [4d4f6b6]
|
|
149
|
+
- @mastra/core@0.2.0-alpha.92
|
|
150
|
+
|
|
151
|
+
## 0.1.0-alpha.33
|
|
152
|
+
|
|
153
|
+
### Patch Changes
|
|
154
|
+
|
|
155
|
+
- Updated dependencies [d7d465a]
|
|
156
|
+
- Updated dependencies [d7d465a]
|
|
157
|
+
- Updated dependencies [2017553]
|
|
158
|
+
- Updated dependencies [a10b7a3]
|
|
159
|
+
- Updated dependencies [16e5b04]
|
|
160
|
+
- @mastra/core@0.2.0-alpha.91
|
|
161
|
+
|
|
162
|
+
## 0.1.0-alpha.32
|
|
163
|
+
|
|
164
|
+
### Patch Changes
|
|
165
|
+
|
|
166
|
+
- Updated dependencies [8151f44]
|
|
167
|
+
- Updated dependencies [e897f1c]
|
|
168
|
+
- Updated dependencies [3700be1]
|
|
169
|
+
- @mastra/core@0.2.0-alpha.90
|
|
170
|
+
|
|
171
|
+
## 0.1.0-alpha.31
|
|
172
|
+
|
|
173
|
+
### Patch Changes
|
|
174
|
+
|
|
175
|
+
- Updated dependencies [27275c9]
|
|
176
|
+
- @mastra/core@0.2.0-alpha.89
|
|
177
|
+
|
|
178
|
+
## 0.1.0-alpha.30
|
|
179
|
+
|
|
180
|
+
### Patch Changes
|
|
181
|
+
|
|
182
|
+
- Updated dependencies [ccbc581]
|
|
183
|
+
- @mastra/core@0.2.0-alpha.88
|
|
184
|
+
|
|
185
|
+
## 0.1.0-alpha.29
|
|
186
|
+
|
|
187
|
+
### Patch Changes
|
|
188
|
+
|
|
189
|
+
- Updated dependencies [7365b6c]
|
|
190
|
+
- @mastra/core@0.2.0-alpha.87
|
|
191
|
+
|
|
192
|
+
## 0.1.0-alpha.28
|
|
193
|
+
|
|
194
|
+
### Minor Changes
|
|
195
|
+
|
|
196
|
+
- 5916f9d: Update deps from fixed to ^
|
|
197
|
+
|
|
198
|
+
### Patch Changes
|
|
199
|
+
|
|
200
|
+
- Updated dependencies [6fa4bd2]
|
|
201
|
+
- Updated dependencies [e2e76de]
|
|
202
|
+
- Updated dependencies [7f24c29]
|
|
203
|
+
- Updated dependencies [67637ba]
|
|
204
|
+
- Updated dependencies [04f3171]
|
|
205
|
+
- @mastra/core@0.2.0-alpha.86
|
|
206
|
+
|
|
207
|
+
## 0.1.0-alpha.27
|
|
208
|
+
|
|
209
|
+
### Patch Changes
|
|
210
|
+
|
|
211
|
+
- Updated dependencies [e9d1b47]
|
|
212
|
+
- @mastra/core@0.2.0-alpha.85
|
|
213
|
+
|
|
214
|
+
## 0.1.0-alpha.26
|
|
215
|
+
|
|
216
|
+
### Patch Changes
|
|
217
|
+
|
|
218
|
+
- Updated dependencies [2f17a5f]
|
|
219
|
+
- Updated dependencies [cb290ee]
|
|
220
|
+
- Updated dependencies [b4d7416]
|
|
221
|
+
- Updated dependencies [38b7f66]
|
|
222
|
+
- @mastra/core@0.2.0-alpha.84
|
|
223
|
+
|
|
224
|
+
## 0.1.0-alpha.25
|
|
225
|
+
|
|
226
|
+
### Patch Changes
|
|
227
|
+
|
|
228
|
+
- 9625602: Use Mastra core split bundles in other packages
|
|
229
|
+
- 8769a62: Split core into separate entry files
|
|
230
|
+
- Updated dependencies [30322ce]
|
|
231
|
+
- Updated dependencies [78eec7c]
|
|
232
|
+
- Updated dependencies [9625602]
|
|
233
|
+
- Updated dependencies [8769a62]
|
|
234
|
+
- @mastra/core@0.2.0-alpha.83
|
|
235
|
+
|
|
236
|
+
## 0.1.0-alpha.24
|
|
237
|
+
|
|
238
|
+
### Patch Changes
|
|
239
|
+
|
|
240
|
+
- Updated dependencies [73d112c]
|
|
241
|
+
- @mastra/core@0.1.27-alpha.82
|
|
242
|
+
|
|
243
|
+
## 0.1.0-alpha.23
|
|
244
|
+
|
|
245
|
+
### Patch Changes
|
|
246
|
+
|
|
247
|
+
- Updated dependencies [9fb3039]
|
|
248
|
+
- @mastra/core@0.1.27-alpha.81
|
|
249
|
+
|
|
250
|
+
## 0.1.0-alpha.22
|
|
251
|
+
|
|
252
|
+
### Patch Changes
|
|
253
|
+
|
|
254
|
+
- cb2e997: Bundle evals package with tsup
|
|
255
|
+
|
|
256
|
+
## 0.1.0-alpha.21
|
|
257
|
+
|
|
258
|
+
### Patch Changes
|
|
259
|
+
|
|
260
|
+
- Updated dependencies [327ece7]
|
|
261
|
+
- @mastra/core@0.1.27-alpha.80
|
|
262
|
+
|
|
263
|
+
## 0.1.0-alpha.20
|
|
264
|
+
|
|
265
|
+
### Patch Changes
|
|
266
|
+
|
|
267
|
+
- Updated dependencies [21fe536]
|
|
268
|
+
- @mastra/core@0.1.27-alpha.79
|
|
269
|
+
|
|
270
|
+
## 0.1.0-alpha.19
|
|
271
|
+
|
|
272
|
+
### Patch Changes
|
|
273
|
+
|
|
274
|
+
- Updated dependencies [685108a]
|
|
275
|
+
- Updated dependencies [685108a]
|
|
276
|
+
- @mastra/core@0.1.27-alpha.78
|
|
277
|
+
|
|
278
|
+
## 0.1.0-alpha.18
|
|
279
|
+
|
|
280
|
+
### Patch Changes
|
|
281
|
+
|
|
282
|
+
- Updated dependencies [8105fae]
|
|
283
|
+
- @mastra/core@0.1.27-alpha.77
|
|
284
|
+
|
|
285
|
+
## 0.1.0-alpha.17
|
|
286
|
+
|
|
287
|
+
### Patch Changes
|
|
288
|
+
|
|
289
|
+
- Updated dependencies [ae7bf94]
|
|
290
|
+
- Updated dependencies [ae7bf94]
|
|
291
|
+
- @mastra/core@0.1.27-alpha.76
|
|
292
|
+
|
|
293
|
+
## 0.1.0-alpha.16
|
|
294
|
+
|
|
295
|
+
### Patch Changes
|
|
296
|
+
|
|
297
|
+
- Updated dependencies [23dcb23]
|
|
298
|
+
- @mastra/core@0.1.27-alpha.75
|
|
299
|
+
|
|
300
|
+
## 0.1.0-alpha.15
|
|
301
|
+
|
|
302
|
+
### Patch Changes
|
|
303
|
+
|
|
304
|
+
- Updated dependencies [7b87567]
|
|
305
|
+
- @mastra/core@0.1.27-alpha.74
|
|
306
|
+
|
|
307
|
+
## 0.1.0-alpha.14
|
|
308
|
+
|
|
309
|
+
### Patch Changes
|
|
310
|
+
|
|
311
|
+
- Updated dependencies [3427b95]
|
|
312
|
+
- @mastra/core@0.1.27-alpha.73
|
|
313
|
+
|
|
314
|
+
## 0.1.0-alpha.13
|
|
315
|
+
|
|
316
|
+
### Patch Changes
|
|
317
|
+
|
|
318
|
+
- 06b2c0a: Update summarization prompt and fix eval input
|
|
319
|
+
- Updated dependencies [e4d4ede]
|
|
320
|
+
- Updated dependencies [06b2c0a]
|
|
321
|
+
- @mastra/core@0.1.27-alpha.72
|
|
322
|
+
|
|
323
|
+
## 0.1.0-alpha.12
|
|
324
|
+
|
|
325
|
+
### Patch Changes
|
|
326
|
+
|
|
327
|
+
- Updated dependencies [d9c8dd0]
|
|
328
|
+
- @mastra/core@0.1.27-alpha.71
|
|
329
|
+
|
|
330
|
+
## 0.1.0-alpha.11
|
|
331
|
+
|
|
332
|
+
### Patch Changes
|
|
333
|
+
|
|
334
|
+
- bdaf834: publish packages
|
|
335
|
+
|
|
336
|
+
## 0.1.0-alpha.10
|
|
337
|
+
|
|
338
|
+
### Patch Changes
|
|
339
|
+
|
|
340
|
+
- Updated dependencies [dd6d87f]
|
|
341
|
+
- Updated dependencies [04434b6]
|
|
342
|
+
- @mastra/core@0.1.27-alpha.70
|
|
343
|
+
|
|
344
|
+
## 0.1.0-alpha.9
|
|
345
|
+
|
|
346
|
+
### Patch Changes
|
|
347
|
+
|
|
348
|
+
- 1944807: Unified logger and major step in better logs
|
|
349
|
+
- 9ade36e: Changed measure for evals, added endpoints, attached metrics to agent, added UI for evals in playground, and updated docs
|
|
350
|
+
- Updated dependencies [1944807]
|
|
351
|
+
- Updated dependencies [9ade36e]
|
|
352
|
+
- @mastra/core@0.1.27-alpha.69
|
|
353
|
+
|
|
354
|
+
## 0.1.0-alpha.8
|
|
355
|
+
|
|
356
|
+
### Patch Changes
|
|
357
|
+
|
|
358
|
+
- Updated dependencies [0be7181]
|
|
359
|
+
- Updated dependencies [0be7181]
|
|
360
|
+
- @mastra/core@0.1.27-alpha.68
|
|
361
|
+
|
|
362
|
+
## 0.1.0-alpha.7
|
|
363
|
+
|
|
364
|
+
### Patch Changes
|
|
365
|
+
|
|
366
|
+
- Updated dependencies [c8ff2f5]
|
|
367
|
+
- @mastra/core@0.1.27-alpha.67
|
|
368
|
+
|
|
369
|
+
## 0.1.0-alpha.6
|
|
370
|
+
|
|
371
|
+
### Patch Changes
|
|
372
|
+
|
|
373
|
+
- aea3c13: Fix evals export for llm and nlp
|
|
374
|
+
|
|
3
375
|
## 0.1.0-alpha.5
|
|
4
376
|
|
|
5
377
|
### Minor Changes
|
package/README.md
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# @mastra/evals
|
|
2
|
+
|
|
3
|
+
A comprehensive evaluation framework for assessing AI model outputs across multiple dimensions.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @mastra/evals
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
`@mastra/evals` provides a suite of evaluation metrics for assessing AI model outputs. The package includes both LLM-based and NLP-based metrics, enabling both automated and model-assisted evaluation of AI responses.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
### LLM-Based Metrics
|
|
18
|
+
|
|
19
|
+
1. **Answer Relevancy**
|
|
20
|
+
|
|
21
|
+
- Evaluates how well an answer addresses the input question
|
|
22
|
+
- Considers uncertainty weighting for more nuanced scoring
|
|
23
|
+
- Returns detailed reasoning for scores
|
|
24
|
+
|
|
25
|
+
2. **Bias Detection**
|
|
26
|
+
|
|
27
|
+
- Identifies potential biases in model outputs
|
|
28
|
+
- Analyzes opinions and statements for bias indicators
|
|
29
|
+
- Provides explanations for detected biases
|
|
30
|
+
- Configurable scoring scale
|
|
31
|
+
|
|
32
|
+
3. **Context Precision & Relevancy**
|
|
33
|
+
|
|
34
|
+
- Assesses how well responses use provided context
|
|
35
|
+
- Evaluates accuracy of context usage
|
|
36
|
+
- Measures relevance of context to the response
|
|
37
|
+
- Analyzes context positioning in responses
|
|
38
|
+
|
|
39
|
+
4. **Faithfulness**
|
|
40
|
+
|
|
41
|
+
- Verifies that responses are faithful to provided context
|
|
42
|
+
- Detects hallucinations or fabricated information
|
|
43
|
+
- Evaluates claims against provided context
|
|
44
|
+
- Provides detailed analysis of faithfulness breaches
|
|
45
|
+
|
|
46
|
+
5. **Prompt Alignment**
|
|
47
|
+
|
|
48
|
+
- Measures how well responses follow given instructions
|
|
49
|
+
- Evaluates adherence to multiple instruction criteria
|
|
50
|
+
- Provides per-instruction scoring
|
|
51
|
+
- Supports custom instruction sets
|
|
52
|
+
|
|
53
|
+
6. **Toxicity**
|
|
54
|
+
- Detects toxic or harmful content in responses
|
|
55
|
+
- Provides detailed reasoning for toxicity verdicts
|
|
56
|
+
- Configurable scoring thresholds
|
|
57
|
+
- Considers both input and output context
|
|
58
|
+
|
|
59
|
+
### NLP-Based Metrics
|
|
60
|
+
|
|
61
|
+
1. **Completeness**
|
|
62
|
+
|
|
63
|
+
- Analyzes structural completeness of responses
|
|
64
|
+
- Identifies missing elements from input requirements
|
|
65
|
+
- Provides detailed element coverage analysis
|
|
66
|
+
- Tracks input-output element ratios
|
|
67
|
+
|
|
68
|
+
2. **Content Similarity**
|
|
69
|
+
|
|
70
|
+
- Measures text similarity between inputs and outputs
|
|
71
|
+
- Configurable for case and whitespace sensitivity
|
|
72
|
+
- Returns normalized similarity scores
|
|
73
|
+
- Uses string comparison algorithms for accuracy
|
|
74
|
+
|
|
75
|
+
3. **Keyword Coverage**
|
|
76
|
+
- Tracks presence of key terms from input in output
|
|
77
|
+
- Provides detailed keyword matching statistics
|
|
78
|
+
- Calculates coverage ratios
|
|
79
|
+
- Useful for ensuring comprehensive responses
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Basic Example
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
import { ContentSimilarityMetric, ToxicityMetric } from '@mastra/evals';
|
|
87
|
+
|
|
88
|
+
// Initialize metrics
|
|
89
|
+
const similarityMetric = new ContentSimilarityMetric({
|
|
90
|
+
ignoreCase: true,
|
|
91
|
+
ignoreWhitespace: true,
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
const toxicityMetric = new ToxicityMetric({
|
|
95
|
+
model: openai('gpt-4'),
|
|
96
|
+
scale: 1, // Optional: adjust scoring scale
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
// Evaluate outputs
|
|
100
|
+
const input = 'What is the capital of France?';
|
|
101
|
+
const output = 'Paris is the capital of France.';
|
|
102
|
+
|
|
103
|
+
const similarityResult = await similarityMetric.measure(input, output);
|
|
104
|
+
const toxicityResult = await toxicityMetric.measure(input, output);
|
|
105
|
+
|
|
106
|
+
console.log('Similarity Score:', similarityResult.score);
|
|
107
|
+
console.log('Toxicity Score:', toxicityResult.score);
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Context-Aware Evaluation
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
import { FaithfulnessMetric } from '@mastra/evals';
|
|
114
|
+
|
|
115
|
+
// Initialize with context
|
|
116
|
+
const faithfulnessMetric = new FaithfulnessMetric({
|
|
117
|
+
model: openai('gpt-4'),
|
|
118
|
+
context: ['Paris is the capital of France', 'Paris has a population of 2.2 million'],
|
|
119
|
+
scale: 1,
|
|
120
|
+
});
|
|
121
|
+
|
|
122
|
+
// Evaluate response against context
|
|
123
|
+
const result = await faithfulnessMetric.measure(
|
|
124
|
+
'Tell me about Paris',
|
|
125
|
+
'Paris is the capital of France with 2.2 million residents',
|
|
126
|
+
);
|
|
127
|
+
|
|
128
|
+
console.log('Faithfulness Score:', result.score);
|
|
129
|
+
console.log('Reasoning:', result.reason);
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Metric Results
|
|
133
|
+
|
|
134
|
+
Each metric returns a standardized result object containing:
|
|
135
|
+
|
|
136
|
+
- `score`: Normalized score (typically 0-1)
|
|
137
|
+
- `info`: Detailed information about the evaluation
|
|
138
|
+
- Additional metric-specific data (e.g., matched keywords, missing elements)
|
|
139
|
+
|
|
140
|
+
Some metrics also provide:
|
|
141
|
+
|
|
142
|
+
- `reason`: Detailed explanation of the score
|
|
143
|
+
- `verdicts`: Individual judgments that contributed to the final score
|
|
144
|
+
|
|
145
|
+
## Telemetry and Logging
|
|
146
|
+
|
|
147
|
+
The package includes built-in telemetry and logging capabilities:
|
|
148
|
+
|
|
149
|
+
- Automatic evaluation tracking through Mastra Storage
|
|
150
|
+
- Integration with OpenTelemetry for performance monitoring
|
|
151
|
+
- Detailed evaluation traces for debugging
|
|
152
|
+
|
|
153
|
+
```typescript
|
|
154
|
+
import { attachListeners } from '@mastra/evals';
|
|
155
|
+
|
|
156
|
+
// Enable basic evaluation tracking
|
|
157
|
+
await attachListeners();
|
|
158
|
+
|
|
159
|
+
// Store evals in Mastra Storage (if storage is enabled)
|
|
160
|
+
await attachListeners(mastra);
|
|
161
|
+
// Note: When using in-memory storage, evaluations are isolated to the test process.
|
|
162
|
+
// When using file storage, evaluations are persisted and can be queried later.
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Environment Variables
|
|
166
|
+
|
|
167
|
+
Required for LLM-based metrics:
|
|
168
|
+
|
|
169
|
+
- `OPENAI_API_KEY`: For OpenAI model access
|
|
170
|
+
- Additional provider keys as needed (Cohere, Anthropic, etc.)
|
|
171
|
+
|
|
172
|
+
## Package Exports
|
|
173
|
+
|
|
174
|
+
```typescript
|
|
175
|
+
// Main package exports
|
|
176
|
+
import { evaluate } from '@mastra/evals';
|
|
177
|
+
// NLP-specific metrics
|
|
178
|
+
import { ContentSimilarityMetric } from '@mastra/evals/nlp';
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Related Packages
|
|
182
|
+
|
|
183
|
+
- `@mastra/core`: Core framework functionality
|
|
184
|
+
- `@mastra/engine`: LLM execution engine
|
|
185
|
+
- `@mastra/mcp`: Model Context Protocol integration
|