@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
# Context
|
|
1
|
+
# Context relevance scorer
|
|
2
2
|
|
|
3
3
|
The `createContextRelevanceScorerLLM()` function creates a scorer that evaluates how relevant and useful provided context was for generating agent responses. It uses weighted relevance levels and applies penalties for unused high-relevance context and missing information.
|
|
4
4
|
|
|
5
|
-
It
|
|
5
|
+
It's especially useful for these use cases:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## Content generation evaluation
|
|
8
8
|
|
|
9
9
|
Best for evaluating context quality in:
|
|
10
10
|
|
|
@@ -12,7 +12,7 @@ Best for evaluating context quality in:
|
|
|
12
12
|
- RAG pipelines needing nuanced relevance assessment
|
|
13
13
|
- Systems where missing context affects quality
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
## Context selection optimization
|
|
16
16
|
|
|
17
17
|
Use when optimizing for:
|
|
18
18
|
|
|
@@ -22,19 +22,19 @@ Use when optimizing for:
|
|
|
22
22
|
|
|
23
23
|
## Parameters
|
|
24
24
|
|
|
25
|
-
**model
|
|
25
|
+
**model** (`MastraModelConfig`): The language model to use for evaluating context relevance
|
|
26
26
|
|
|
27
|
-
**options
|
|
27
|
+
**options** (`ContextRelevanceOptions`): Configuration options for the scorer
|
|
28
28
|
|
|
29
29
|
Note: Either `context` or `contextExtractor` must be provided. If both are provided, `contextExtractor` takes precedence.
|
|
30
30
|
|
|
31
|
-
##
|
|
31
|
+
## `.run()` returns
|
|
32
32
|
|
|
33
|
-
**score
|
|
33
|
+
**score** (`number`): Weighted relevance score between 0 and `scale` (by default, between 0 and 1)
|
|
34
34
|
|
|
35
|
-
**reason
|
|
35
|
+
**reason** (`string`): Human-readable explanation of the context relevance evaluation
|
|
36
36
|
|
|
37
|
-
## Scoring
|
|
37
|
+
## Scoring details
|
|
38
38
|
|
|
39
39
|
### Weighted Relevance Scoring
|
|
40
40
|
|
|
@@ -115,16 +115,16 @@ Use results to improve your system:
|
|
|
115
115
|
Control how penalties are applied for unused and missing context:
|
|
116
116
|
|
|
117
117
|
```typescript
|
|
118
|
-
import { createContextRelevanceScorerLLM } from
|
|
118
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
119
119
|
|
|
120
120
|
// Stricter penalty configuration
|
|
121
121
|
const strictScorer = createContextRelevanceScorerLLM({
|
|
122
|
-
model:
|
|
122
|
+
model: 'openai/gpt-5.4',
|
|
123
123
|
options: {
|
|
124
124
|
context: [
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
125
|
+
'Einstein won the Nobel Prize for photoelectric effect',
|
|
126
|
+
'He developed the theory of relativity',
|
|
127
|
+
'Einstein was born in Germany',
|
|
128
128
|
],
|
|
129
129
|
penalties: {
|
|
130
130
|
unusedHighRelevanceContext: 0.2, // 20% penalty per unused high-relevance context
|
|
@@ -133,16 +133,16 @@ const strictScorer = createContextRelevanceScorerLLM({
|
|
|
133
133
|
},
|
|
134
134
|
scale: 1,
|
|
135
135
|
},
|
|
136
|
-
})
|
|
136
|
+
})
|
|
137
137
|
|
|
138
138
|
// Lenient penalty configuration
|
|
139
139
|
const lenientScorer = createContextRelevanceScorerLLM({
|
|
140
|
-
model:
|
|
140
|
+
model: 'openai/gpt-5.4',
|
|
141
141
|
options: {
|
|
142
142
|
context: [
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
143
|
+
'Einstein won the Nobel Prize for photoelectric effect',
|
|
144
|
+
'He developed the theory of relativity',
|
|
145
|
+
'Einstein was born in Germany',
|
|
146
146
|
],
|
|
147
147
|
penalties: {
|
|
148
148
|
unusedHighRelevanceContext: 0.05, // 5% penalty per unused high-relevance context
|
|
@@ -151,69 +151,68 @@ const lenientScorer = createContextRelevanceScorerLLM({
|
|
|
151
151
|
},
|
|
152
152
|
scale: 1,
|
|
153
153
|
},
|
|
154
|
-
})
|
|
154
|
+
})
|
|
155
155
|
|
|
156
156
|
const testRun = {
|
|
157
157
|
input: {
|
|
158
158
|
inputMessages: [
|
|
159
159
|
{
|
|
160
|
-
id:
|
|
161
|
-
role:
|
|
162
|
-
content:
|
|
160
|
+
id: '1',
|
|
161
|
+
role: 'user',
|
|
162
|
+
content: 'What did Einstein achieve in physics?',
|
|
163
163
|
},
|
|
164
164
|
],
|
|
165
165
|
},
|
|
166
166
|
output: [
|
|
167
167
|
{
|
|
168
|
-
id:
|
|
169
|
-
role:
|
|
170
|
-
content:
|
|
171
|
-
"Einstein won the Nobel Prize for his work on the photoelectric effect.",
|
|
168
|
+
id: '2',
|
|
169
|
+
role: 'assistant',
|
|
170
|
+
content: 'Einstein won the Nobel Prize for his work on the photoelectric effect.',
|
|
172
171
|
},
|
|
173
172
|
],
|
|
174
|
-
}
|
|
173
|
+
}
|
|
175
174
|
|
|
176
|
-
const strictResult = await strictScorer.run(testRun)
|
|
177
|
-
const lenientResult = await lenientScorer.run(testRun)
|
|
175
|
+
const strictResult = await strictScorer.run(testRun)
|
|
176
|
+
const lenientResult = await lenientScorer.run(testRun)
|
|
178
177
|
|
|
179
|
-
console.log(
|
|
180
|
-
console.log(
|
|
178
|
+
console.log('Strict penalties:', strictResult.score) // Lower score due to unused context
|
|
179
|
+
console.log('Lenient penalties:', lenientResult.score) // Higher score, less penalty
|
|
181
180
|
```
|
|
182
181
|
|
|
183
182
|
### Dynamic Context Extraction
|
|
184
183
|
|
|
185
184
|
```typescript
|
|
186
185
|
const scorer = createContextRelevanceScorerLLM({
|
|
187
|
-
model:
|
|
186
|
+
model: 'openai/gpt-5.4',
|
|
188
187
|
options: {
|
|
189
188
|
contextExtractor: (input, output) => {
|
|
190
189
|
// Extract context based on the query
|
|
191
|
-
const userQuery = input?.inputMessages?.[0]?.content ||
|
|
192
|
-
if (userQuery.includes(
|
|
190
|
+
const userQuery = input?.inputMessages?.[0]?.content || ''
|
|
191
|
+
if (userQuery.includes('Einstein')) {
|
|
193
192
|
return [
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
]
|
|
193
|
+
'Einstein won the Nobel Prize for the photoelectric effect',
|
|
194
|
+
'He developed the theory of relativity',
|
|
195
|
+
]
|
|
197
196
|
}
|
|
198
|
-
return [
|
|
197
|
+
return ['General physics information']
|
|
199
198
|
},
|
|
200
199
|
penalties: {
|
|
201
200
|
unusedHighRelevanceContext: 0.15,
|
|
202
201
|
},
|
|
203
202
|
},
|
|
204
|
-
})
|
|
203
|
+
})
|
|
205
204
|
```
|
|
206
205
|
|
|
207
206
|
### Custom scale factor
|
|
208
207
|
|
|
209
208
|
```typescript
|
|
210
209
|
const scorer = createContextRelevanceScorerLLM({
|
|
211
|
-
model:
|
|
210
|
+
model: 'openai/gpt-5.4',
|
|
212
211
|
options: {
|
|
213
|
-
context: [
|
|
212
|
+
context: ['Relevant information...', 'Supporting details...'],
|
|
214
213
|
scale: 100, // Scale scores from 0-100 instead of 0-1
|
|
215
214
|
},
|
|
216
|
-
})
|
|
215
|
+
})
|
|
217
216
|
|
|
218
217
|
// Result will be scaled: score: 85 instead of 0.85
|
|
219
218
|
```
|
|
@@ -222,21 +221,21 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
222
221
|
|
|
223
222
|
```typescript
|
|
224
223
|
const scorer = createContextRelevanceScorerLLM({
|
|
225
|
-
model:
|
|
224
|
+
model: 'openai/gpt-5.4',
|
|
226
225
|
options: {
|
|
227
226
|
contextExtractor: (input, output) => {
|
|
228
|
-
const query = input?.inputMessages?.[0]?.content ||
|
|
227
|
+
const query = input?.inputMessages?.[0]?.content || ''
|
|
229
228
|
|
|
230
229
|
// Combine from multiple sources
|
|
231
|
-
const kbContext = knowledgeBase.search(query)
|
|
232
|
-
const docContext = documentStore.retrieve(query)
|
|
233
|
-
const cacheContext = contextCache.get(query)
|
|
230
|
+
const kbContext = knowledgeBase.search(query)
|
|
231
|
+
const docContext = documentStore.retrieve(query)
|
|
232
|
+
const cacheContext = contextCache.get(query)
|
|
234
233
|
|
|
235
|
-
return [...kbContext, ...docContext, ...cacheContext]
|
|
234
|
+
return [...kbContext, ...docContext, ...cacheContext]
|
|
236
235
|
},
|
|
237
236
|
scale: 1,
|
|
238
237
|
},
|
|
239
|
-
})
|
|
238
|
+
})
|
|
240
239
|
```
|
|
241
240
|
|
|
242
241
|
## Examples
|
|
@@ -246,41 +245,41 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
246
245
|
This example shows excellent context relevance where all context directly supports the response:
|
|
247
246
|
|
|
248
247
|
```typescript
|
|
249
|
-
import { createContextRelevanceScorerLLM } from
|
|
248
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
250
249
|
|
|
251
250
|
const scorer = createContextRelevanceScorerLLM({
|
|
252
|
-
model:
|
|
251
|
+
model: 'openai/gpt-5.4',
|
|
253
252
|
options: {
|
|
254
253
|
context: [
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
254
|
+
'Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.',
|
|
255
|
+
'He published his theory of special relativity in 1905.',
|
|
256
|
+
'His general relativity theory, published in 1915, revolutionized our understanding of gravity.',
|
|
258
257
|
],
|
|
259
258
|
scale: 1,
|
|
260
259
|
},
|
|
261
|
-
})
|
|
260
|
+
})
|
|
262
261
|
|
|
263
262
|
const result = await scorer.run({
|
|
264
263
|
input: {
|
|
265
264
|
inputMessages: [
|
|
266
265
|
{
|
|
267
|
-
id:
|
|
268
|
-
role:
|
|
266
|
+
id: '1',
|
|
267
|
+
role: 'user',
|
|
269
268
|
content: "What were Einstein's major scientific achievements?",
|
|
270
269
|
},
|
|
271
270
|
],
|
|
272
271
|
},
|
|
273
272
|
output: [
|
|
274
273
|
{
|
|
275
|
-
id:
|
|
276
|
-
role:
|
|
274
|
+
id: '2',
|
|
275
|
+
role: 'assistant',
|
|
277
276
|
content:
|
|
278
277
|
"Einstein's major achievements include the Nobel Prize for the photoelectric effect, special relativity in 1905, and general relativity in 1915.",
|
|
279
278
|
},
|
|
280
279
|
],
|
|
281
|
-
})
|
|
280
|
+
})
|
|
282
281
|
|
|
283
|
-
console.log(result)
|
|
282
|
+
console.log(result)
|
|
284
283
|
// Output:
|
|
285
284
|
// {
|
|
286
285
|
// score: 1.0,
|
|
@@ -293,43 +292,43 @@ console.log(result);
|
|
|
293
292
|
This example shows moderate relevance with some context being irrelevant or unused:
|
|
294
293
|
|
|
295
294
|
```typescript
|
|
296
|
-
import { createContextRelevanceScorerLLM } from
|
|
295
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
297
296
|
|
|
298
297
|
const scorer = createContextRelevanceScorerLLM({
|
|
299
|
-
model:
|
|
298
|
+
model: 'openai/gpt-5.4',
|
|
300
299
|
options: {
|
|
301
300
|
context: [
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
301
|
+
'Solar eclipses occur when the Moon blocks the Sun.',
|
|
302
|
+
'The Moon moves between the Earth and Sun during eclipses.',
|
|
303
|
+
'The Moon is visible at night.',
|
|
304
|
+
'Stars twinkle due to atmospheric interference.',
|
|
305
|
+
'Total eclipses can last up to 7.5 minutes.',
|
|
307
306
|
],
|
|
308
307
|
scale: 1,
|
|
309
308
|
},
|
|
310
|
-
})
|
|
309
|
+
})
|
|
311
310
|
|
|
312
311
|
const result = await scorer.run({
|
|
313
312
|
input: {
|
|
314
313
|
inputMessages: [
|
|
315
314
|
{
|
|
316
|
-
id:
|
|
317
|
-
role:
|
|
318
|
-
content:
|
|
315
|
+
id: '1',
|
|
316
|
+
role: 'user',
|
|
317
|
+
content: 'What causes solar eclipses?',
|
|
319
318
|
},
|
|
320
319
|
],
|
|
321
320
|
},
|
|
322
321
|
output: [
|
|
323
322
|
{
|
|
324
|
-
id:
|
|
325
|
-
role:
|
|
323
|
+
id: '2',
|
|
324
|
+
role: 'assistant',
|
|
326
325
|
content:
|
|
327
|
-
|
|
326
|
+
'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.',
|
|
328
327
|
},
|
|
329
328
|
],
|
|
330
|
-
})
|
|
329
|
+
})
|
|
331
330
|
|
|
332
|
-
console.log(result)
|
|
331
|
+
console.log(result)
|
|
333
332
|
// Output with default penalties:
|
|
334
333
|
// {
|
|
335
334
|
// score: 0.64,
|
|
@@ -338,14 +337,14 @@ console.log(result);
|
|
|
338
337
|
|
|
339
338
|
// With custom penalty configuration
|
|
340
339
|
const customScorer = createContextRelevanceScorerLLM({
|
|
341
|
-
model:
|
|
340
|
+
model: 'openai/gpt-5.4',
|
|
342
341
|
options: {
|
|
343
342
|
context: [
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
343
|
+
'Solar eclipses occur when the Moon blocks the Sun.',
|
|
344
|
+
'The Moon moves between the Earth and Sun during eclipses.',
|
|
345
|
+
'The Moon is visible at night.',
|
|
346
|
+
'Stars twinkle due to atmospheric interference.',
|
|
347
|
+
'Total eclipses can last up to 7.5 minutes.',
|
|
349
348
|
],
|
|
350
349
|
penalties: {
|
|
351
350
|
unusedHighRelevanceContext: 0.05, // Lower penalty for unused context
|
|
@@ -353,25 +352,23 @@ const customScorer = createContextRelevanceScorerLLM({
|
|
|
353
352
|
maxMissingContextPenalty: 0.3,
|
|
354
353
|
},
|
|
355
354
|
},
|
|
356
|
-
})
|
|
355
|
+
})
|
|
357
356
|
|
|
358
357
|
const customResult = await customScorer.run({
|
|
359
358
|
input: {
|
|
360
|
-
inputMessages: [
|
|
361
|
-
{ id: "1", role: "user", content: "What causes solar eclipses?" },
|
|
362
|
-
],
|
|
359
|
+
inputMessages: [{ id: '1', role: 'user', content: 'What causes solar eclipses?' }],
|
|
363
360
|
},
|
|
364
361
|
output: [
|
|
365
362
|
{
|
|
366
|
-
id:
|
|
367
|
-
role:
|
|
363
|
+
id: '2',
|
|
364
|
+
role: 'assistant',
|
|
368
365
|
content:
|
|
369
|
-
|
|
366
|
+
'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.',
|
|
370
367
|
},
|
|
371
368
|
],
|
|
372
|
-
})
|
|
369
|
+
})
|
|
373
370
|
|
|
374
|
-
console.log(customResult)
|
|
371
|
+
console.log(customResult)
|
|
375
372
|
// Output with lenient penalties:
|
|
376
373
|
// {
|
|
377
374
|
// score: 0.69, // Higher score due to reduced penalty for unused context
|
|
@@ -384,42 +381,42 @@ console.log(customResult);
|
|
|
384
381
|
This example shows poor context relevance with mostly irrelevant information:
|
|
385
382
|
|
|
386
383
|
```typescript
|
|
387
|
-
import { createContextRelevanceScorerLLM } from
|
|
384
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
388
385
|
|
|
389
386
|
const scorer = createContextRelevanceScorerLLM({
|
|
390
|
-
model:
|
|
387
|
+
model: 'openai/gpt-5.4',
|
|
391
388
|
options: {
|
|
392
389
|
context: [
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
390
|
+
'The Great Barrier Reef is located in Australia.',
|
|
391
|
+
'Coral reefs need warm water to survive.',
|
|
392
|
+
'Many fish species live in coral reefs.',
|
|
393
|
+
'Australia has six states and two territories.',
|
|
394
|
+
'The capital of Australia is Canberra.',
|
|
398
395
|
],
|
|
399
396
|
scale: 1,
|
|
400
397
|
},
|
|
401
|
-
})
|
|
398
|
+
})
|
|
402
399
|
|
|
403
400
|
const result = await scorer.run({
|
|
404
401
|
input: {
|
|
405
402
|
inputMessages: [
|
|
406
403
|
{
|
|
407
|
-
id:
|
|
408
|
-
role:
|
|
409
|
-
content:
|
|
404
|
+
id: '1',
|
|
405
|
+
role: 'user',
|
|
406
|
+
content: 'What is the capital of Australia?',
|
|
410
407
|
},
|
|
411
408
|
],
|
|
412
409
|
},
|
|
413
410
|
output: [
|
|
414
411
|
{
|
|
415
|
-
id:
|
|
416
|
-
role:
|
|
417
|
-
content:
|
|
412
|
+
id: '2',
|
|
413
|
+
role: 'assistant',
|
|
414
|
+
content: 'The capital of Australia is Canberra.',
|
|
418
415
|
},
|
|
419
416
|
],
|
|
420
|
-
})
|
|
417
|
+
})
|
|
421
418
|
|
|
422
|
-
console.log(result)
|
|
419
|
+
console.log(result)
|
|
423
420
|
// Output:
|
|
424
421
|
// {
|
|
425
422
|
// score: 0.26,
|
|
@@ -432,33 +429,33 @@ console.log(result);
|
|
|
432
429
|
Extract context dynamically based on the run input:
|
|
433
430
|
|
|
434
431
|
```typescript
|
|
435
|
-
import { createContextRelevanceScorerLLM } from
|
|
432
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
436
433
|
|
|
437
434
|
const scorer = createContextRelevanceScorerLLM({
|
|
438
|
-
model:
|
|
435
|
+
model: 'openai/gpt-5.4',
|
|
439
436
|
options: {
|
|
440
437
|
contextExtractor: (input, output) => {
|
|
441
438
|
// Extract query from input
|
|
442
|
-
const query = input?.inputMessages?.[0]?.content ||
|
|
439
|
+
const query = input?.inputMessages?.[0]?.content || ''
|
|
443
440
|
|
|
444
441
|
// Dynamically retrieve context based on query
|
|
445
|
-
if (query.toLowerCase().includes(
|
|
442
|
+
if (query.toLowerCase().includes('einstein')) {
|
|
446
443
|
return [
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
]
|
|
444
|
+
'Einstein developed E=mc²',
|
|
445
|
+
'He won the Nobel Prize in 1921',
|
|
446
|
+
'His theories revolutionized physics',
|
|
447
|
+
]
|
|
451
448
|
}
|
|
452
449
|
|
|
453
|
-
if (query.toLowerCase().includes(
|
|
450
|
+
if (query.toLowerCase().includes('climate')) {
|
|
454
451
|
return [
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
]
|
|
452
|
+
'Global temperatures are rising',
|
|
453
|
+
'CO2 levels affect climate',
|
|
454
|
+
'Renewable energy reduces emissions',
|
|
455
|
+
]
|
|
459
456
|
}
|
|
460
457
|
|
|
461
|
-
return [
|
|
458
|
+
return ['General knowledge base entry']
|
|
462
459
|
},
|
|
463
460
|
penalties: {
|
|
464
461
|
unusedHighRelevanceContext: 0.15, // 15% penalty for unused relevant context
|
|
@@ -467,7 +464,7 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
467
464
|
},
|
|
468
465
|
scale: 1,
|
|
469
466
|
},
|
|
470
|
-
})
|
|
467
|
+
})
|
|
471
468
|
```
|
|
472
469
|
|
|
473
470
|
### RAG system integration
|
|
@@ -475,19 +472,17 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
475
472
|
Integrate with RAG pipelines to evaluate retrieved context:
|
|
476
473
|
|
|
477
474
|
```typescript
|
|
478
|
-
import { createContextRelevanceScorerLLM } from
|
|
475
|
+
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
479
476
|
|
|
480
477
|
const scorer = createContextRelevanceScorerLLM({
|
|
481
|
-
model:
|
|
478
|
+
model: 'openai/gpt-5.4',
|
|
482
479
|
options: {
|
|
483
480
|
contextExtractor: (input, output) => {
|
|
484
481
|
// Extract from RAG retrieval results
|
|
485
|
-
const ragResults = inputData.metadata?.ragResults || []
|
|
482
|
+
const ragResults = inputData.metadata?.ragResults || []
|
|
486
483
|
|
|
487
484
|
// Return the text content of retrieved documents
|
|
488
|
-
return ragResults
|
|
489
|
-
.filter((doc) => doc.relevanceScore > 0.5)
|
|
490
|
-
.map((doc) => doc.content);
|
|
485
|
+
return ragResults.filter(doc => doc.relevanceScore > 0.5).map(doc => doc.content)
|
|
491
486
|
},
|
|
492
487
|
penalties: {
|
|
493
488
|
unusedHighRelevanceContext: 0.12, // Moderate penalty for unused RAG context
|
|
@@ -496,28 +491,28 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
496
491
|
},
|
|
497
492
|
scale: 1,
|
|
498
493
|
},
|
|
499
|
-
})
|
|
494
|
+
})
|
|
500
495
|
|
|
501
496
|
// Evaluate RAG system performance
|
|
502
|
-
const evaluateRAG = async
|
|
503
|
-
const results = []
|
|
497
|
+
const evaluateRAG = async testCases => {
|
|
498
|
+
const results = []
|
|
504
499
|
|
|
505
500
|
for (const testCase of testCases) {
|
|
506
|
-
const score = await scorer.run(testCase)
|
|
501
|
+
const score = await scorer.run(testCase)
|
|
507
502
|
results.push({
|
|
508
503
|
query: testCase.inputData.inputMessages[0].content,
|
|
509
504
|
relevanceScore: score.score,
|
|
510
505
|
feedback: score.reason,
|
|
511
|
-
unusedContext: score.reason.includes(
|
|
512
|
-
missingContext: score.reason.includes(
|
|
513
|
-
})
|
|
506
|
+
unusedContext: score.reason.includes('unused'),
|
|
507
|
+
missingContext: score.reason.includes('missing'),
|
|
508
|
+
})
|
|
514
509
|
}
|
|
515
510
|
|
|
516
|
-
return results
|
|
517
|
-
}
|
|
511
|
+
return results
|
|
512
|
+
}
|
|
518
513
|
```
|
|
519
514
|
|
|
520
|
-
## Comparison with
|
|
515
|
+
## Comparison with context precision
|
|
521
516
|
|
|
522
517
|
Choose the right scorer for your needs:
|
|
523
518
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Faithfulness
|
|
1
|
+
# Faithfulness scorer
|
|
2
2
|
|
|
3
3
|
The `createFaithfulnessScorer()` function evaluates how factually accurate an LLM's output is compared to the provided context. It extracts claims from the output and verifies them against the context, which makes it essential for measuring the reliability of RAG pipeline responses.
|
|
4
4
|
|
|
@@ -6,33 +6,33 @@ The `createFaithfulnessScorer()` function evaluates how factually accurate an LL
|
|
|
6
6
|
|
|
7
7
|
The `createFaithfulnessScorer()` function accepts a single options object with the following properties:
|
|
8
8
|
|
|
9
|
-
**model
|
|
9
|
+
**model** (`LanguageModel`): Configuration for the model used to evaluate faithfulness.
|
|
10
10
|
|
|
11
|
-
**context
|
|
11
|
+
**context** (`string[]`): Array of context chunks against which the output's claims will be verified.
|
|
12
12
|
|
|
13
|
-
**scale
|
|
13
|
+
**scale** (`number`): The maximum score value. The final score will be normalized to this scale. (Default: `1`)
|
|
14
14
|
|
|
15
15
|
This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
|
|
16
16
|
|
|
17
|
-
##
|
|
17
|
+
## `.run()` returns
|
|
18
18
|
|
|
19
|
-
**runId
|
|
19
|
+
**runId** (`string`): The id of the run (optional).
|
|
20
20
|
|
|
21
|
-
**preprocessStepResult
|
|
21
|
+
**preprocessStepResult** (`string[]`): Array of extracted claims from the output.
|
|
22
22
|
|
|
23
|
-
**preprocessPrompt
|
|
23
|
+
**preprocessPrompt** (`string`): The prompt sent to the LLM for the preprocess step (optional).
|
|
24
24
|
|
|
25
|
-
**analyzeStepResult
|
|
25
|
+
**analyzeStepResult** (`object`): Object with verdicts: `{ verdicts: Array<{ verdict: 'yes' | 'no' | 'unsure', reason: string }> }`
|
|
26
26
|
|
|
27
|
-
**analyzePrompt
|
|
27
|
+
**analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).
|
|
28
28
|
|
|
29
|
-
**score
|
|
29
|
+
**score** (`number`): A score between 0 and the configured scale, representing the proportion of claims that are supported by the context.
|
|
30
30
|
|
|
31
|
-
**reason
|
|
31
|
+
**reason** (`string`): A detailed explanation of the score, including which claims were supported, contradicted, or marked as unsure.
|
|
32
32
|
|
|
33
|
-
**generateReasonPrompt
|
|
33
|
+
**generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).
|
|
34
34
|
|
|
35
|
-
## Scoring
|
|
35
|
+
## Scoring details
|
|
36
36
|
|
|
37
37
|
The scorer evaluates faithfulness through claim verification against provided context.
|
|
38
38
|
|
|
@@ -73,22 +73,22 @@ A faithfulness score between 0 and 1:
|
|
|
73
73
|
Evaluate agent responses for faithfulness to provided context:
|
|
74
74
|
|
|
75
75
|
```typescript
|
|
76
|
-
import { runEvals } from
|
|
77
|
-
import { createFaithfulnessScorer } from
|
|
78
|
-
import { myAgent } from
|
|
76
|
+
import { runEvals } from '@mastra/core/evals'
|
|
77
|
+
import { createFaithfulnessScorer } from '@mastra/evals/scorers/prebuilt'
|
|
78
|
+
import { myAgent } from './agent'
|
|
79
79
|
|
|
80
80
|
// Context is typically populated from agent tool calls or RAG retrieval
|
|
81
81
|
const scorer = createFaithfulnessScorer({
|
|
82
|
-
model:
|
|
83
|
-
})
|
|
82
|
+
model: 'openai/gpt-5.4',
|
|
83
|
+
})
|
|
84
84
|
|
|
85
85
|
const result = await runEvals({
|
|
86
86
|
data: [
|
|
87
87
|
{
|
|
88
|
-
input:
|
|
88
|
+
input: 'Tell me about the Tesla Model 3.',
|
|
89
89
|
},
|
|
90
90
|
{
|
|
91
|
-
input:
|
|
91
|
+
input: 'What are the key features of this electric vehicle?',
|
|
92
92
|
},
|
|
93
93
|
],
|
|
94
94
|
scorers: [scorer],
|
|
@@ -97,11 +97,11 @@ const result = await runEvals({
|
|
|
97
97
|
console.log({
|
|
98
98
|
score: scorerResults[scorer.id].score,
|
|
99
99
|
reason: scorerResults[scorer.id].reason,
|
|
100
|
-
})
|
|
100
|
+
})
|
|
101
101
|
},
|
|
102
|
-
})
|
|
102
|
+
})
|
|
103
103
|
|
|
104
|
-
console.log(result.scores)
|
|
104
|
+
console.log(result.scores)
|
|
105
105
|
```
|
|
106
106
|
|
|
107
107
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|