@mastra/mcp-docs-server 0.13.30-alpha.0 → 0.13.30-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +15 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +17 -17
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +31 -31
- package/.docs/organized/changelogs/%40mastra%2Freact.md +20 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +15 -15
- package/.docs/organized/changelogs/create-mastra.md +19 -19
- package/.docs/organized/changelogs/mastra.md +27 -27
- package/.docs/organized/code-examples/agent.md +0 -1
- package/.docs/organized/code-examples/agui.md +2 -2
- package/.docs/organized/code-examples/client-side-tools.md +2 -2
- package/.docs/raw/agents/adding-voice.mdx +118 -25
- package/.docs/raw/agents/agent-memory.mdx +73 -89
- package/.docs/raw/agents/guardrails.mdx +1 -1
- package/.docs/raw/agents/overview.mdx +39 -7
- package/.docs/raw/agents/using-tools.mdx +95 -0
- package/.docs/raw/deployment/overview.mdx +9 -11
- package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +1 -1
- package/.docs/raw/frameworks/servers/express.mdx +2 -2
- package/.docs/raw/getting-started/installation.mdx +34 -85
- package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
- package/.docs/raw/index.mdx +49 -14
- package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
- package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
- package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
- package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
- package/.docs/raw/reference/scorers/bias.mdx +107 -6
- package/.docs/raw/reference/scorers/completeness.mdx +131 -8
- package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
- package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
- package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
- package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
- package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
- package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
- package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
- package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
- package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
- package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
- package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
- package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
- package/.docs/raw/reference/workflows/workflow.mdx +33 -0
- package/.docs/raw/scorers/custom-scorers.mdx +244 -3
- package/.docs/raw/scorers/overview.mdx +8 -38
- package/.docs/raw/server-db/middleware.mdx +5 -2
- package/.docs/raw/server-db/runtime-context.mdx +178 -0
- package/.docs/raw/streaming/workflow-streaming.mdx +5 -1
- package/.docs/raw/tools-mcp/overview.mdx +25 -7
- package/.docs/raw/workflows/overview.mdx +28 -1
- package/CHANGELOG.md +14 -0
- package/package.json +4 -4
- package/.docs/raw/agents/runtime-context.mdx +0 -106
- package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
- package/.docs/raw/getting-started/model-providers.mdx +0 -63
- package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
- /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
- /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
package/.docs/raw/reference/scorers/context-relevance.mdx

@@ -9,6 +9,22 @@ import { PropertiesTable } from "@/components/properties-table";
 
 The `createContextRelevanceScorerLLM()` function creates a scorer that evaluates how relevant and useful provided context was for generating agent responses. It uses weighted relevance levels and applies penalties for unused high-relevance context and missing information.
 
+It is especially useful for these use cases:
+
+**Content Generation Evaluation**
+
+Best for evaluating context quality in:
+- Chat systems where context usage matters
+- RAG pipelines needing nuanced relevance assessment
+- Systems where missing context affects quality
+
+**Context Selection Optimization**
+
+Use when optimizing for:
+- Comprehensive context coverage
+- Effective context utilization
+- Identifying context gaps
+
 ## Parameters
 
 <PropertiesTable
@@ -74,9 +90,7 @@ The createContextRelevanceScorerLLM() function creates a scorer that evaluates
 ]}
 />
 
-
-Either `context` or `contextExtractor` must be provided. If both are provided, `contextExtractor` takes precedence.
-:::
+Note: Either `context` or `contextExtractor` must be provided. If both are provided, `contextExtractor` takes precedence.
 
 ## .run() Returns
 
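A minimal sketch of that precedence rule, assuming the behaviour described in the note (the static `context` array is ignored whenever a `contextExtractor` is supplied):

```typescript
import { openai } from '@ai-sdk/openai';
import { createContextRelevanceScorerLLM } from '@mastra/evals';

// Both options supplied; per the note above, `contextExtractor` is assumed
// to take precedence and the static `context` array to be ignored.
const scorer = createContextRelevanceScorerLLM({
  model: openai('gpt-4o-mini'),
  options: {
    context: ['Static context that will not be used'],
    contextExtractor: () => ['Context retrieved at run time'],
  },
});
```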
@@ -129,12 +143,30 @@ Final Score = max(0, Base Score - Usage Penalty - Missing Penalty) × scale
 - `maxMissingContextPenalty` = 0.5 (maximum 50% penalty for missing context)
 - `scale` = 1
 
-### Score
+### Score interpretation
+
+- **0.9-1.0**: Excellent - all context highly relevant and used
+- **0.7-0.8**: Good - mostly relevant with minor gaps
+- **0.4-0.6**: Mixed - significant irrelevant or unused context
+- **0.2-0.3**: Poor - mostly irrelevant context
+- **0.0-0.1**: Very poor - no relevant context found
+
+### Reason analysis
+
+The reason field provides insights on:
+- Relevance level of each context piece (high/medium/low/none)
+- Which context was actually used in the response
+- Penalties applied for unused high-relevance context (configurable via `unusedHighRelevanceContext`)
+- Missing context that would have improved the response (penalized via `missingContextPerItem` up to `maxMissingContextPenalty`)
 
-
-
-
-- **
+### Optimization strategies
+
+Use results to improve your system:
+- **Filter irrelevant context**: Remove low/none relevance pieces before processing
+- **Ensure context usage**: Make sure high-relevance context is incorporated
+- **Fill context gaps**: Add missing information identified by the scorer
+- **Balance context size**: Find optimal amount of context for best relevance
+- **Tune penalty sensitivity**: Adjust `unusedHighRelevanceContext`, `missingContextPerItem`, and `maxMissingContextPenalty` based on your application's tolerance for unused or missing context
 
 ### Difference from Context Precision
 
@@ -146,35 +178,76 @@ Final Score = max(0, Base Score - Usage Penalty - Missing Penalty) × scale
 | **Usage** | Tracks and penalizes unused context | Not considered |
 | **Missing** | Identifies and penalizes gaps | Not evaluated |
 
-##
+## Scorer configuration
+
+### Custom penalty configuration
 
-
+Control how penalties are applied for unused and missing context:
 
 ```typescript
-
-
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+// Stricter penalty configuration
+const strictScorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
   options: {
-    context: [
+    context: [
+      'Einstein won the Nobel Prize for photoelectric effect',
+      'He developed the theory of relativity',
+      'Einstein was born in Germany',
+    ],
+    penalties: {
+      unusedHighRelevanceContext: 0.2, // 20% penalty per unused high-relevance context
+      missingContextPerItem: 0.25, // 25% penalty per missing context item
+      maxMissingContextPenalty: 0.6, // Maximum 60% penalty for missing context
+    },
     scale: 1,
   },
 });
-```
 
-
-
-
-const scorer = createContextRelevanceScorerLLM({
-  model: openai('gpt-4o'),
+// Lenient penalty configuration
+const lenientScorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
   options: {
-    context: [
+    context: [
+      'Einstein won the Nobel Prize for photoelectric effect',
+      'He developed the theory of relativity',
+      'Einstein was born in Germany',
+    ],
     penalties: {
-      unusedHighRelevanceContext: 0.05, //
-      missingContextPerItem: 0.
-      maxMissingContextPenalty: 0.
+      unusedHighRelevanceContext: 0.05, // 5% penalty per unused high-relevance context
+      missingContextPerItem: 0.1, // 10% penalty per missing context item
+      maxMissingContextPenalty: 0.3, // Maximum 30% penalty for missing context
     },
-    scale:
+    scale: 1,
   },
 });
+
+const testRun = {
+  input: {
+    inputMessages: [
+      {
+        id: '1',
+        role: 'user',
+        content: 'What did Einstein achieve in physics?',
+      },
+    ],
+  },
+  output: [
+    {
+      id: '2',
+      role: 'assistant',
+      content: 'Einstein won the Nobel Prize for his work on the photoelectric effect.',
+    },
+  ],
+};
+
+const strictResult = await strictScorer.run(testRun);
+const lenientResult = await lenientScorer.run(testRun);
+
+console.log('Strict penalties:', strictResult.score); // Lower score due to unused context
+console.log('Lenient penalties:', lenientResult.score); // Higher score, less penalty
 ```
 
 ### Dynamic Context Extraction
@@ -201,19 +274,329 @@ const scorer = createContextRelevanceScorerLLM({
 });
 ```
 
-
+### Custom scale factor
 
-
-
--
-
-
+```typescript
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    context: [
+      'Relevant information...',
+      'Supporting details...',
+    ],
+    scale: 100, // Scale scores from 0-100 instead of 0-1
+  },
+});
 
-
-
-
-
-
+// Result will be scaled: score: 85 instead of 0.85
+```
+
+### Combining multiple context sources
+
+```typescript
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    contextExtractor: (input, output) => {
+      const query = input?.inputMessages?.[0]?.content || '';
+
+      // Combine from multiple sources
+      const kbContext = knowledgeBase.search(query);
+      const docContext = documentStore.retrieve(query);
+      const cacheContext = contextCache.get(query);
+
+      return [
+        ...kbContext,
+        ...docContext,
+        ...cacheContext,
+      ];
+    },
+    scale: 1,
+  },
+});
+```
+
+## Examples
+
+### High relevance example
+
+This example shows excellent context relevance where all context directly supports the response:
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    context: [
+      'Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.',
+      'He published his theory of special relativity in 1905.',
+      'His general relativity theory, published in 1915, revolutionized our understanding of gravity.',
+    ],
+    scale: 1,
+  },
+});
+
+const result = await scorer.run({
+  input: {
+    inputMessages: [
+      {
+        id: '1',
+        role: 'user',
+        content: 'What were Einstein\'s major scientific achievements?',
+      },
+    ],
+  },
+  output: [
+    {
+      id: '2',
+      role: 'assistant',
+      content: 'Einstein\'s major achievements include the Nobel Prize for the photoelectric effect, special relativity in 1905, and general relativity in 1915.',
+    },
+  ],
+});
+
+console.log(result);
+// Output:
+// {
+//   score: 1.0,
+//   reason: "The score is 1.0 because all context pieces are highly relevant to Einstein's achievements and were effectively used in generating the comprehensive response."
+// }
+```
+
+### Mixed relevance example
+
+This example shows moderate relevance with some context being irrelevant or unused:
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    context: [
+      'Solar eclipses occur when the Moon blocks the Sun.',
+      'The Moon moves between the Earth and Sun during eclipses.',
+      'The Moon is visible at night.',
+      'Stars twinkle due to atmospheric interference.',
+      'Total eclipses can last up to 7.5 minutes.',
+    ],
+    scale: 1,
+  },
+});
+
+const result = await scorer.run({
+  input: {
+    inputMessages: [
+      {
+        id: '1',
+        role: 'user',
+        content: 'What causes solar eclipses?',
+      },
+    ],
+  },
+  output: [
+    {
+      id: '2',
+      role: 'assistant',
+      content: 'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.',
+    },
+  ],
+});
+
+console.log(result);
+// Output with default penalties:
+// {
+//   score: 0.64,
+//   reason: "The score is 0.64 because contexts 1 and 2 are highly relevant and used, context 5 is relevant but unused (10% penalty), while contexts 3 and 4 are irrelevant."
+// }
+
+// With custom penalty configuration
+const customScorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    context: [
+      'Solar eclipses occur when the Moon blocks the Sun.',
+      'The Moon moves between the Earth and Sun during eclipses.',
+      'The Moon is visible at night.',
+      'Stars twinkle due to atmospheric interference.',
+      'Total eclipses can last up to 7.5 minutes.',
+    ],
+    penalties: {
+      unusedHighRelevanceContext: 0.05, // Lower penalty for unused context
+      missingContextPerItem: 0.1,
+      maxMissingContextPenalty: 0.3,
+    },
+  },
+});
+
+const customResult = await customScorer.run({
+  input: { inputMessages: [{ id: '1', role: 'user', content: 'What causes solar eclipses?' }] },
+  output: [{ id: '2', role: 'assistant', content: 'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.' }],
+});
+
+console.log(customResult);
+// Output with lenient penalties:
+// {
+//   score: 0.69, // Higher score due to reduced penalty for unused context
+//   reason: "The score is 0.69 because contexts 1 and 2 are highly relevant and used, context 5 is relevant but unused (5% penalty), while contexts 3 and 4 are irrelevant."
+// }
+```
+
+### Low relevance example
+
+This example shows poor context relevance with mostly irrelevant information:
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    context: [
+      'The Great Barrier Reef is located in Australia.',
+      'Coral reefs need warm water to survive.',
+      'Many fish species live in coral reefs.',
+      'Australia has six states and two territories.',
+      'The capital of Australia is Canberra.',
+    ],
+    scale: 1,
+  },
+});
+
+const result = await scorer.run({
+  input: {
+    inputMessages: [
+      {
+        id: '1',
+        role: 'user',
+        content: 'What is the capital of Australia?',
+      },
+    ],
+  },
+  output: [
+    {
+      id: '2',
+      role: 'assistant',
+      content: 'The capital of Australia is Canberra.',
+    },
+  ],
+});
+
+console.log(result);
+// Output:
+// {
+//   score: 0.26,
+//   reason: "The score is 0.26 because only context 5 is relevant to the query about Australia's capital, while the other contexts about reefs are completely irrelevant."
+// }
+```
+
+### Dynamic context extraction
+
+Extract context dynamically based on the run input:
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    contextExtractor: (input, output) => {
+      // Extract query from input
+      const query = input?.inputMessages?.[0]?.content || '';
+
+      // Dynamically retrieve context based on query
+      if (query.toLowerCase().includes('einstein')) {
+        return [
+          'Einstein developed E=mc²',
+          'He won the Nobel Prize in 1921',
+          'His theories revolutionized physics',
+        ];
+      }
+
+      if (query.toLowerCase().includes('climate')) {
+        return [
+          'Global temperatures are rising',
+          'CO2 levels affect climate',
+          'Renewable energy reduces emissions',
+        ];
+      }
+
+      return ['General knowledge base entry'];
+    },
+    penalties: {
+      unusedHighRelevanceContext: 0.15, // 15% penalty for unused relevant context
+      missingContextPerItem: 0.2, // 20% penalty per missing context item
+      maxMissingContextPenalty: 0.4, // Cap at 40% total missing context penalty
+    },
+    scale: 1,
+  },
+});
+```
+
+### RAG system integration
+
+Integrate with RAG pipelines to evaluate retrieved context:
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createContextRelevanceScorerLLM } from '@mastra/evals';
+
+const scorer = createContextRelevanceScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: {
+    contextExtractor: (input, output) => {
+      // Extract from RAG retrieval results
+      const ragResults = input.metadata?.ragResults || [];
+
+      // Return the text content of retrieved documents
+      return ragResults
+        .filter(doc => doc.relevanceScore > 0.5)
+        .map(doc => doc.content);
+    },
+    penalties: {
+      unusedHighRelevanceContext: 0.12, // Moderate penalty for unused RAG context
+      missingContextPerItem: 0.18, // Higher penalty for missing information in RAG
+      maxMissingContextPenalty: 0.45, // Slightly higher cap for RAG systems
+    },
+    scale: 1,
+  },
+});
+
+// Evaluate RAG system performance
+const evaluateRAG = async (testCases) => {
+  const results = [];
+
+  for (const testCase of testCases) {
+    const score = await scorer.run(testCase);
+    results.push({
+      query: testCase.input.inputMessages[0].content,
+      relevanceScore: score.score,
+      feedback: score.reason,
+      unusedContext: score.reason.includes('unused'),
+      missingContext: score.reason.includes('missing'),
+    });
+  }
+
+  return results;
+};
+```
+
+## Comparison with Context Precision
+
+Choose the right scorer for your needs:
+
+| Use Case | Context Relevance | Context Precision |
+|----------|-------------------|-------------------|
+| **RAG evaluation** | When usage matters | When ranking matters |
+| **Context quality** | Nuanced levels | Binary relevance |
+| **Missing detection** | ✓ Identifies gaps | ✗ Not evaluated |
+| **Usage tracking** | ✓ Tracks utilization | ✗ Not considered |
+| **Position sensitivity** | ✗ Position agnostic | ✓ Rewards early placement |
 
 ## Related
 
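The mixed-relevance scores above are consistent with the documented formula, Final Score = max(0, Base Score - Usage Penalty - Missing Penalty) × scale. A rough sanity check, where the base score of 0.74 is inferred from the two reported results rather than stated in the docs:

```typescript
// Sanity check of the mixed-relevance example: one unused high-relevance
// context item, no missing context reported, scale = 1.
const baseScore = 0.74; // inferred from the reported 0.64 / 0.69 scores, not documented
const missingPenalty = 0;

const withDefaultPenalty = Math.max(0, baseScore - 1 * 0.1 - missingPenalty) * 1;  // ≈ 0.64
const withLenientPenalty = Math.max(0, baseScore - 1 * 0.05 - missingPenalty) * 1; // ≈ 0.69

console.log(withDefaultPenalty, withLenientPenalty);
```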
package/.docs/raw/reference/scorers/faithfulness.mdx

@@ -7,8 +7,6 @@ description: Documentation for the Faithfulness Scorer in Mastra, which evaluate
 
 The `createFaithfulnessScorer()` function evaluates how factually accurate an LLM's output is compared to the provided context. It extracts claims from the output and verifies them against the context, making it essential to measure RAG pipeline responses' reliability.
 
-For a usage example, see the [Faithfulness Examples](/examples/scorers/faithfulness).
-
 ## Parameters
 
 The `createFaithfulnessScorer()` function accepts a single options object with the following properties:
@@ -108,13 +106,129 @@ Final score: `(supported_claims / total_claims) * scale`
 
 ### Score interpretation
 
-
+A faithfulness score between 0 and 1:
+
+- **1.0**: All claims are accurate and directly supported by the context.
+- **0.7–0.9**: Most claims are correct, with minor additions or omissions.
+- **0.4–0.6**: Some claims are supported, but others are unverifiable.
+- **0.1–0.3**: Most of the content is inaccurate or unsupported.
+- **0.0**: All claims are false or contradict the context.
+
+## Examples
+
+### High faithfulness example
+
+In this example, the response closely aligns with the context. Each statement in the output is verifiable and supported by the provided context entries, resulting in a high score.
+
+```typescript filename="src/example-high-faithfulness.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createFaithfulnessScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createFaithfulnessScorer({ model: openai("gpt-4o-mini"), options: {
+  context: [
+    "The Tesla Model 3 was launched in 2017.",
+    "It has a range of up to 358 miles.",
+    "The base model accelerates 0-60 mph in 5.8 seconds."
+  ]
+});
+
+const query = "Tell me about the Tesla Model 3.";
+const response = "The Tesla Model 3 was introduced in 2017. It can travel up to 358 miles on a single charge and the base version goes from 0 to 60 mph in 5.8 seconds.";
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { text: response },
+});
+
+console.log(result);
+```
+
+#### High faithfulness output
+
+The output receives a score of 1 because all the information it provides can be directly traced to the context. There are no missing or contradictory facts.
+
+```typescript
+{
+  score: 1,
+  reason: 'The score is 1 because all claims made in the output are supported by the provided context.'
+}
+```
+
+### Mixed faithfulness example
+
+In this example, there are a mix of supported and unsupported claims. Some parts of the response are backed by the context, while others introduce new information not found in the source material.
+
+```typescript filename="src/example-mixed-faithfulness.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createFaithfulnessScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createFaithfulnessScorer({ model: openai("gpt-4o-mini"), options: {
+  context: [
+    "Python was created by Guido van Rossum.",
+    "The first version was released in 1991.",
+    "Python emphasizes code readability."
+  ]
+});
+
+const query = "What can you tell me about Python?";
+const response = "Python was created by Guido van Rossum and released in 1991. It is the most popular programming language today and is used by millions of developers worldwide.";
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { text: response },
+});
+
+console.log(result);
+```
+
+#### Mixed faithfulness output
+
+The score is lower because only a portion of the response is verifiable. While some claims match the context, others are unconfirmed or out of scope, reducing the overall faithfulness.
+
+```typescript
+{
+  score: 0.5,
+  reason: "The score is 0.5 because while two claims are supported by the context (Python was created by Guido van Rossum and Python was released in 1991), the other two claims regarding Python's popularity and usage cannot be verified as they are not mentioned in the context."
+}
+```
+
+### Low faithfulness example
+
+In this example, the response directly contradicts the context. None of the claims are supported, and several conflict with the facts provided.
+
+```typescript filename="src/example-low-faithfulness.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createFaithfulnessScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createFaithfulnessScorer({ model: openai("gpt-4o-mini"), options: {
+  context: [
+    "Mars is the fourth planet from the Sun.",
+    "It has a thin atmosphere of mostly carbon dioxide.",
+    "Two small moons orbit Mars: Phobos and Deimos."
+  ]
+});
+
+const query = "What do we know about Mars?";
+const response = "Mars is the third planet from the Sun. It has a thick atmosphere rich in oxygen and nitrogen, and is orbited by three large moons.";
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { text: response },
+});
+
+console.log(result);
+```
+
+#### Low faithfulness output
+
+Each claim is inaccurate or conflicts with the context, resulting in a score of 0.
 
-
-
-
-
-
+```typescript
+{
+  score: 0,
+  reason: "The score is 0 because all claims made in the output contradict the provided context. The output states that Mars is the third planet from the Sun, while the context clearly states it is the fourth. Additionally, it claims that Mars has a thick atmosphere rich in oxygen and nitrogen, contradicting the context's description of a thin atmosphere mostly composed of carbon dioxide. Finally, the output mentions that Mars is orbited by three large moons, while the context specifies that it has only two small moons, Phobos and Deimos. Therefore, there are no supported claims, leading to a score of 0."
+}
+```
 
 ## Related
 
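The mixed faithfulness result above (two of four claims supported) lines up with the formula in the hunk header, `(supported_claims / total_claims) * scale`. A quick check, assuming a default `scale` of 1:

```typescript
// Two supported claims (creator, release year) out of four extracted claims.
const supportedClaims = 2;
const totalClaims = 4;
const scale = 1; // assumed default

console.log((supportedClaims / totalClaims) * scale); // 0.5
```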