@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,10 +1,10 @@
1
- # Context Relevance Scorer
1
+ # Context relevance scorer
2
2
 
3
3
  The `createContextRelevanceScorerLLM()` function creates a scorer that evaluates how relevant and useful provided context was for generating agent responses. It uses weighted relevance levels and applies penalties for unused high-relevance context and missing information.
4
4
 
5
- It is especially useful for these use cases:
5
+ It's especially useful for these use cases:
6
6
 
7
- **Content Generation Evaluation**
7
+ ## Content generation evaluation
8
8
 
9
9
  Best for evaluating context quality in:
10
10
 
@@ -12,7 +12,7 @@ Best for evaluating context quality in:
12
12
  - RAG pipelines needing nuanced relevance assessment
13
13
  - Systems where missing context affects quality
14
14
 
15
- **Context Selection Optimization**
15
+ ## Context selection optimization
16
16
 
17
17
  Use when optimizing for:
18
18
 
@@ -22,19 +22,19 @@ Use when optimizing for:
22
22
 
23
23
  ## Parameters
24
24
 
25
- **model:** (`MastraModelConfig`): The language model to use for evaluating context relevance
25
+ **model** (`MastraModelConfig`): The language model to use for evaluating context relevance
26
26
 
27
- **options:** (`ContextRelevanceOptions`): Configuration options for the scorer
27
+ **options** (`ContextRelevanceOptions`): Configuration options for the scorer
28
28
 
29
29
  Note: Either `context` or `contextExtractor` must be provided. If both are provided, `contextExtractor` takes precedence.
30
30
 
31
- ## .run() Returns
31
+ ## `.run()` returns
32
32
 
33
- **score:** (`number`): Weighted relevance score between 0 and scale (default 0-1)
33
+ **score** (`number`): Weighted relevance score between 0 and scale (default 0-1)
34
34
 
35
- **reason:** (`string`): Human-readable explanation of the context relevance evaluation
35
+ **reason** (`string`): Human-readable explanation of the context relevance evaluation
36
36
 
37
- ## Scoring Details
37
+ ## Scoring details
38
38
 
39
39
  ### Weighted Relevance Scoring
40
40
 
@@ -115,16 +115,16 @@ Use results to improve your system:
115
115
  Control how penalties are applied for unused and missing context:
116
116
 
117
117
  ```typescript
118
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
118
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
119
119
 
120
120
  // Stricter penalty configuration
121
121
  const strictScorer = createContextRelevanceScorerLLM({
122
- model: "openai/gpt-5.1",
122
+ model: 'openai/gpt-5.4',
123
123
  options: {
124
124
  context: [
125
- "Einstein won the Nobel Prize for photoelectric effect",
126
- "He developed the theory of relativity",
127
- "Einstein was born in Germany",
125
+ 'Einstein won the Nobel Prize for photoelectric effect',
126
+ 'He developed the theory of relativity',
127
+ 'Einstein was born in Germany',
128
128
  ],
129
129
  penalties: {
130
130
  unusedHighRelevanceContext: 0.2, // 20% penalty per unused high-relevance context
@@ -133,16 +133,16 @@ const strictScorer = createContextRelevanceScorerLLM({
133
133
  },
134
134
  scale: 1,
135
135
  },
136
- });
136
+ })
137
137
 
138
138
  // Lenient penalty configuration
139
139
  const lenientScorer = createContextRelevanceScorerLLM({
140
- model: "openai/gpt-5.1",
140
+ model: 'openai/gpt-5.4',
141
141
  options: {
142
142
  context: [
143
- "Einstein won the Nobel Prize for photoelectric effect",
144
- "He developed the theory of relativity",
145
- "Einstein was born in Germany",
143
+ 'Einstein won the Nobel Prize for photoelectric effect',
144
+ 'He developed the theory of relativity',
145
+ 'Einstein was born in Germany',
146
146
  ],
147
147
  penalties: {
148
148
  unusedHighRelevanceContext: 0.05, // 5% penalty per unused high-relevance context
@@ -151,69 +151,68 @@ const lenientScorer = createContextRelevanceScorerLLM({
151
151
  },
152
152
  scale: 1,
153
153
  },
154
- });
154
+ })
155
155
 
156
156
  const testRun = {
157
157
  input: {
158
158
  inputMessages: [
159
159
  {
160
- id: "1",
161
- role: "user",
162
- content: "What did Einstein achieve in physics?",
160
+ id: '1',
161
+ role: 'user',
162
+ content: 'What did Einstein achieve in physics?',
163
163
  },
164
164
  ],
165
165
  },
166
166
  output: [
167
167
  {
168
- id: "2",
169
- role: "assistant",
170
- content:
171
- "Einstein won the Nobel Prize for his work on the photoelectric effect.",
168
+ id: '2',
169
+ role: 'assistant',
170
+ content: 'Einstein won the Nobel Prize for his work on the photoelectric effect.',
172
171
  },
173
172
  ],
174
- };
173
+ }
175
174
 
176
- const strictResult = await strictScorer.run(testRun);
177
- const lenientResult = await lenientScorer.run(testRun);
175
+ const strictResult = await strictScorer.run(testRun)
176
+ const lenientResult = await lenientScorer.run(testRun)
178
177
 
179
- console.log("Strict penalties:", strictResult.score); // Lower score due to unused context
180
- console.log("Lenient penalties:", lenientResult.score); // Higher score, less penalty
178
+ console.log('Strict penalties:', strictResult.score) // Lower score due to unused context
179
+ console.log('Lenient penalties:', lenientResult.score) // Higher score, less penalty
181
180
  ```
182
181
 
183
182
  ### Dynamic Context Extraction
184
183
 
185
184
  ```typescript
186
185
  const scorer = createContextRelevanceScorerLLM({
187
- model: "openai/gpt-5.1",
186
+ model: 'openai/gpt-5.4',
188
187
  options: {
189
188
  contextExtractor: (input, output) => {
190
189
  // Extract context based on the query
191
- const userQuery = input?.inputMessages?.[0]?.content || "";
192
- if (userQuery.includes("Einstein")) {
190
+ const userQuery = input?.inputMessages?.[0]?.content || ''
191
+ if (userQuery.includes('Einstein')) {
193
192
  return [
194
- "Einstein won the Nobel Prize for the photoelectric effect",
195
- "He developed the theory of relativity",
196
- ];
193
+ 'Einstein won the Nobel Prize for the photoelectric effect',
194
+ 'He developed the theory of relativity',
195
+ ]
197
196
  }
198
- return ["General physics information"];
197
+ return ['General physics information']
199
198
  },
200
199
  penalties: {
201
200
  unusedHighRelevanceContext: 0.15,
202
201
  },
203
202
  },
204
- });
203
+ })
205
204
  ```
206
205
 
207
206
  ### Custom scale factor
208
207
 
209
208
  ```typescript
210
209
  const scorer = createContextRelevanceScorerLLM({
211
- model: "openai/gpt-5.1",
210
+ model: 'openai/gpt-5.4',
212
211
  options: {
213
- context: ["Relevant information...", "Supporting details..."],
212
+ context: ['Relevant information...', 'Supporting details...'],
214
213
  scale: 100, // Scale scores from 0-100 instead of 0-1
215
214
  },
216
- });
215
+ })
217
216
 
218
217
  // Result will be scaled: score: 85 instead of 0.85
219
218
  ```
@@ -222,21 +221,21 @@ const scorer = createContextRelevanceScorerLLM({
222
221
 
223
222
  ```typescript
224
223
  const scorer = createContextRelevanceScorerLLM({
225
- model: "openai/gpt-5.1",
224
+ model: 'openai/gpt-5.4',
226
225
  options: {
227
226
  contextExtractor: (input, output) => {
228
- const query = input?.inputMessages?.[0]?.content || "";
227
+ const query = input?.inputMessages?.[0]?.content || ''
229
228
 
230
229
  // Combine from multiple sources
231
- const kbContext = knowledgeBase.search(query);
232
- const docContext = documentStore.retrieve(query);
233
- const cacheContext = contextCache.get(query);
230
+ const kbContext = knowledgeBase.search(query)
231
+ const docContext = documentStore.retrieve(query)
232
+ const cacheContext = contextCache.get(query)
234
233
 
235
- return [...kbContext, ...docContext, ...cacheContext];
234
+ return [...kbContext, ...docContext, ...cacheContext]
236
235
  },
237
236
  scale: 1,
238
237
  },
239
- });
238
+ })
240
239
  ```
241
240
 
242
241
  ## Examples
@@ -246,41 +245,41 @@ const scorer = createContextRelevanceScorerLLM({
246
245
  This example shows excellent context relevance where all context directly supports the response:
247
246
 
248
247
  ```typescript
249
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
248
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
250
249
 
251
250
  const scorer = createContextRelevanceScorerLLM({
252
- model: "openai/gpt-5.1",
251
+ model: 'openai/gpt-5.4',
253
252
  options: {
254
253
  context: [
255
- "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.",
256
- "He published his theory of special relativity in 1905.",
257
- "His general relativity theory, published in 1915, revolutionized our understanding of gravity.",
254
+ 'Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.',
255
+ 'He published his theory of special relativity in 1905.',
256
+ 'His general relativity theory, published in 1915, revolutionized our understanding of gravity.',
258
257
  ],
259
258
  scale: 1,
260
259
  },
261
- });
260
+ })
262
261
 
263
262
  const result = await scorer.run({
264
263
  input: {
265
264
  inputMessages: [
266
265
  {
267
- id: "1",
268
- role: "user",
266
+ id: '1',
267
+ role: 'user',
269
268
  content: "What were Einstein's major scientific achievements?",
270
269
  },
271
270
  ],
272
271
  },
273
272
  output: [
274
273
  {
275
- id: "2",
276
- role: "assistant",
274
+ id: '2',
275
+ role: 'assistant',
277
276
  content:
278
277
  "Einstein's major achievements include the Nobel Prize for the photoelectric effect, special relativity in 1905, and general relativity in 1915.",
279
278
  },
280
279
  ],
281
- });
280
+ })
282
281
 
283
- console.log(result);
282
+ console.log(result)
284
283
  // Output:
285
284
  // {
286
285
  // score: 1.0,
@@ -293,43 +292,43 @@ console.log(result);
293
292
  This example shows moderate relevance with some context being irrelevant or unused:
294
293
 
295
294
  ```typescript
296
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
295
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
297
296
 
298
297
  const scorer = createContextRelevanceScorerLLM({
299
- model: "openai/gpt-5.1",
298
+ model: 'openai/gpt-5.4',
300
299
  options: {
301
300
  context: [
302
- "Solar eclipses occur when the Moon blocks the Sun.",
303
- "The Moon moves between the Earth and Sun during eclipses.",
304
- "The Moon is visible at night.",
305
- "Stars twinkle due to atmospheric interference.",
306
- "Total eclipses can last up to 7.5 minutes.",
301
+ 'Solar eclipses occur when the Moon blocks the Sun.',
302
+ 'The Moon moves between the Earth and Sun during eclipses.',
303
+ 'The Moon is visible at night.',
304
+ 'Stars twinkle due to atmospheric interference.',
305
+ 'Total eclipses can last up to 7.5 minutes.',
307
306
  ],
308
307
  scale: 1,
309
308
  },
310
- });
309
+ })
311
310
 
312
311
  const result = await scorer.run({
313
312
  input: {
314
313
  inputMessages: [
315
314
  {
316
- id: "1",
317
- role: "user",
318
- content: "What causes solar eclipses?",
315
+ id: '1',
316
+ role: 'user',
317
+ content: 'What causes solar eclipses?',
319
318
  },
320
319
  ],
321
320
  },
322
321
  output: [
323
322
  {
324
- id: "2",
325
- role: "assistant",
323
+ id: '2',
324
+ role: 'assistant',
326
325
  content:
327
- "Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.",
326
+ 'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.',
328
327
  },
329
328
  ],
330
- });
329
+ })
331
330
 
332
- console.log(result);
331
+ console.log(result)
333
332
  // Output with default penalties:
334
333
  // {
335
334
  // score: 0.64,
@@ -338,14 +337,14 @@ console.log(result);
338
337
 
339
338
  // With custom penalty configuration
340
339
  const customScorer = createContextRelevanceScorerLLM({
341
- model: "openai/gpt-5.1",
340
+ model: 'openai/gpt-5.4',
342
341
  options: {
343
342
  context: [
344
- "Solar eclipses occur when the Moon blocks the Sun.",
345
- "The Moon moves between the Earth and Sun during eclipses.",
346
- "The Moon is visible at night.",
347
- "Stars twinkle due to atmospheric interference.",
348
- "Total eclipses can last up to 7.5 minutes.",
343
+ 'Solar eclipses occur when the Moon blocks the Sun.',
344
+ 'The Moon moves between the Earth and Sun during eclipses.',
345
+ 'The Moon is visible at night.',
346
+ 'Stars twinkle due to atmospheric interference.',
347
+ 'Total eclipses can last up to 7.5 minutes.',
349
348
  ],
350
349
  penalties: {
351
350
  unusedHighRelevanceContext: 0.05, // Lower penalty for unused context
@@ -353,25 +352,23 @@ const customScorer = createContextRelevanceScorerLLM({
353
352
  maxMissingContextPenalty: 0.3,
354
353
  },
355
354
  },
356
- });
355
+ })
357
356
 
358
357
  const customResult = await customScorer.run({
359
358
  input: {
360
- inputMessages: [
361
- { id: "1", role: "user", content: "What causes solar eclipses?" },
362
- ],
359
+ inputMessages: [{ id: '1', role: 'user', content: 'What causes solar eclipses?' }],
363
360
  },
364
361
  output: [
365
362
  {
366
- id: "2",
367
- role: "assistant",
363
+ id: '2',
364
+ role: 'assistant',
368
365
  content:
369
- "Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.",
366
+ 'Solar eclipses happen when the Moon moves between Earth and the Sun, blocking sunlight.',
370
367
  },
371
368
  ],
372
- });
369
+ })
373
370
 
374
- console.log(customResult);
371
+ console.log(customResult)
375
372
  // Output with lenient penalties:
376
373
  // {
377
374
  // score: 0.69, // Higher score due to reduced penalty for unused context
@@ -384,42 +381,42 @@ console.log(customResult);
384
381
  This example shows poor context relevance with mostly irrelevant information:
385
382
 
386
383
  ```typescript
387
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
384
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
388
385
 
389
386
  const scorer = createContextRelevanceScorerLLM({
390
- model: "openai/gpt-5.1",
387
+ model: 'openai/gpt-5.4',
391
388
  options: {
392
389
  context: [
393
- "The Great Barrier Reef is located in Australia.",
394
- "Coral reefs need warm water to survive.",
395
- "Many fish species live in coral reefs.",
396
- "Australia has six states and two territories.",
397
- "The capital of Australia is Canberra.",
390
+ 'The Great Barrier Reef is located in Australia.',
391
+ 'Coral reefs need warm water to survive.',
392
+ 'Many fish species live in coral reefs.',
393
+ 'Australia has six states and two territories.',
394
+ 'The capital of Australia is Canberra.',
398
395
  ],
399
396
  scale: 1,
400
397
  },
401
- });
398
+ })
402
399
 
403
400
  const result = await scorer.run({
404
401
  input: {
405
402
  inputMessages: [
406
403
  {
407
- id: "1",
408
- role: "user",
409
- content: "What is the capital of Australia?",
404
+ id: '1',
405
+ role: 'user',
406
+ content: 'What is the capital of Australia?',
410
407
  },
411
408
  ],
412
409
  },
413
410
  output: [
414
411
  {
415
- id: "2",
416
- role: "assistant",
417
- content: "The capital of Australia is Canberra.",
412
+ id: '2',
413
+ role: 'assistant',
414
+ content: 'The capital of Australia is Canberra.',
418
415
  },
419
416
  ],
420
- });
417
+ })
421
418
 
422
- console.log(result);
419
+ console.log(result)
423
420
  // Output:
424
421
  // {
425
422
  // score: 0.26,
@@ -432,33 +429,33 @@ console.log(result);
432
429
  Extract context dynamically based on the run input:
433
430
 
434
431
  ```typescript
435
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
432
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
436
433
 
437
434
  const scorer = createContextRelevanceScorerLLM({
438
- model: "openai/gpt-5.1",
435
+ model: 'openai/gpt-5.4',
439
436
  options: {
440
437
  contextExtractor: (input, output) => {
441
438
  // Extract query from input
442
- const query = input?.inputMessages?.[0]?.content || "";
439
+ const query = input?.inputMessages?.[0]?.content || ''
443
440
 
444
441
  // Dynamically retrieve context based on query
445
- if (query.toLowerCase().includes("einstein")) {
442
+ if (query.toLowerCase().includes('einstein')) {
446
443
  return [
447
- "Einstein developed E=mc²",
448
- "He won the Nobel Prize in 1921",
449
- "His theories revolutionized physics",
450
- ];
444
+ 'Einstein developed E=mc²',
445
+ 'He won the Nobel Prize in 1921',
446
+ 'His theories revolutionized physics',
447
+ ]
451
448
  }
452
449
 
453
- if (query.toLowerCase().includes("climate")) {
450
+ if (query.toLowerCase().includes('climate')) {
454
451
  return [
455
- "Global temperatures are rising",
456
- "CO2 levels affect climate",
457
- "Renewable energy reduces emissions",
458
- ];
452
+ 'Global temperatures are rising',
453
+ 'CO2 levels affect climate',
454
+ 'Renewable energy reduces emissions',
455
+ ]
459
456
  }
460
457
 
461
- return ["General knowledge base entry"];
458
+ return ['General knowledge base entry']
462
459
  },
463
460
  penalties: {
464
461
  unusedHighRelevanceContext: 0.15, // 15% penalty for unused relevant context
@@ -467,7 +464,7 @@ const scorer = createContextRelevanceScorerLLM({
467
464
  },
468
465
  scale: 1,
469
466
  },
470
- });
467
+ })
471
468
  ```
472
469
 
473
470
  ### RAG system integration
@@ -475,19 +472,17 @@ const scorer = createContextRelevanceScorerLLM({
475
472
  Integrate with RAG pipelines to evaluate retrieved context:
476
473
 
477
474
  ```typescript
478
- import { createContextRelevanceScorerLLM } from "@mastra/evals";
475
+ import { createContextRelevanceScorerLLM } from '@mastra/evals'
479
476
 
480
477
  const scorer = createContextRelevanceScorerLLM({
481
- model: "openai/gpt-5.1",
478
+ model: 'openai/gpt-5.4',
482
479
  options: {
483
480
  contextExtractor: (input, output) => {
484
481
  // Extract from RAG retrieval results
485
- const ragResults = inputData.metadata?.ragResults || [];
482
+ const ragResults = inputData.metadata?.ragResults || []
486
483
 
487
484
  // Return the text content of retrieved documents
488
- return ragResults
489
- .filter((doc) => doc.relevanceScore > 0.5)
490
- .map((doc) => doc.content);
485
+ return ragResults.filter(doc => doc.relevanceScore > 0.5).map(doc => doc.content)
491
486
  },
492
487
  penalties: {
493
488
  unusedHighRelevanceContext: 0.12, // Moderate penalty for unused RAG context
@@ -496,28 +491,28 @@ const scorer = createContextRelevanceScorerLLM({
496
491
  },
497
492
  scale: 1,
498
493
  },
499
- });
494
+ })
500
495
 
501
496
  // Evaluate RAG system performance
502
- const evaluateRAG = async (testCases) => {
503
- const results = [];
497
+ const evaluateRAG = async testCases => {
498
+ const results = []
504
499
 
505
500
  for (const testCase of testCases) {
506
- const score = await scorer.run(testCase);
501
+ const score = await scorer.run(testCase)
507
502
  results.push({
508
503
  query: testCase.inputData.inputMessages[0].content,
509
504
  relevanceScore: score.score,
510
505
  feedback: score.reason,
511
- unusedContext: score.reason.includes("unused"),
512
- missingContext: score.reason.includes("missing"),
513
- });
506
+ unusedContext: score.reason.includes('unused'),
507
+ missingContext: score.reason.includes('missing'),
508
+ })
514
509
  }
515
510
 
516
- return results;
517
- };
511
+ return results
512
+ }
518
513
  ```
519
514
 
520
- ## Comparison with Context Precision
515
+ ## Comparison with context precision
521
516
 
522
517
  Choose the right scorer for your needs:
523
518
 
@@ -1,4 +1,4 @@
1
- # Faithfulness Scorer
1
+ # Faithfulness scorer
2
2
 
3
3
  The `createFaithfulnessScorer()` function evaluates how factually accurate an LLM's output is compared to the provided context. It extracts claims from the output and verifies them against the context, which makes it essential for measuring the reliability of RAG pipeline responses.
4
4
 
@@ -6,33 +6,33 @@ The `createFaithfulnessScorer()` function evaluates how factually accurate an LL
6
6
 
7
7
  The `createFaithfulnessScorer()` function accepts a single options object with the following properties:
8
8
 
9
- **model:** (`LanguageModel`): Configuration for the model used to evaluate faithfulness.
9
+ **model** (`LanguageModel`): Configuration for the model used to evaluate faithfulness.
10
10
 
11
- **context:** (`string[]`): Array of context chunks against which the output's claims will be verified.
11
+ **context** (`string[]`): Array of context chunks against which the output's claims will be verified.
12
12
 
13
- **scale:** (`number`): The maximum score value. The final score will be normalized to this scale. (Default: `1`)
13
+ **scale** (`number`): The maximum score value. The final score will be normalized to this scale. (Default: `1`)
14
14
 
15
15
  This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
16
16
 
17
- ## .run() Returns
17
+ ## `.run()` returns
18
18
 
19
- **runId:** (`string`): The id of the run (optional).
19
+ **runId** (`string`): The id of the run (optional).
20
20
 
21
- **preprocessStepResult:** (`string[]`): Array of extracted claims from the output.
21
+ **preprocessStepResult** (`string[]`): Array of extracted claims from the output.
22
22
 
23
- **preprocessPrompt:** (`string`): The prompt sent to the LLM for the preprocess step (optional).
23
+ **preprocessPrompt** (`string`): The prompt sent to the LLM for the preprocess step (optional).
24
24
 
25
- **analyzeStepResult:** (`object`): Object with verdicts: { verdicts: Array<{ verdict: 'yes' | 'no' | 'unsure', reason: string }> }
25
+ **analyzeStepResult** (`object`): Object with verdicts: { verdicts: Array<{ verdict: 'yes' | 'no' | 'unsure', reason: string }> }
26
26
 
27
- **analyzePrompt:** (`string`): The prompt sent to the LLM for the analyze step (optional).
27
+ **analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).
28
28
 
29
- **score:** (`number`): A score between 0 and the configured scale, representing the proportion of claims that are supported by the context.
29
+ **score** (`number`): A score between 0 and the configured scale, representing the proportion of claims that are supported by the context.
30
30
 
31
- **reason:** (`string`): A detailed explanation of the score, including which claims were supported, contradicted, or marked as unsure.
31
+ **reason** (`string`): A detailed explanation of the score, including which claims were supported, contradicted, or marked as unsure.
32
32
 
33
- **generateReasonPrompt:** (`string`): The prompt sent to the LLM for the generateReason step (optional).
33
+ **generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).
34
34
 
35
- ## Scoring Details
35
+ ## Scoring details
36
36
 
37
37
  The scorer evaluates faithfulness through claim verification against provided context.
38
38
 
@@ -73,22 +73,22 @@ A faithfulness score between 0 and 1:
73
73
  Evaluate agent responses for faithfulness to provided context:
74
74
 
75
75
  ```typescript
76
- import { runEvals } from "@mastra/core/evals";
77
- import { createFaithfulnessScorer } from "@mastra/evals/scorers/prebuilt";
78
- import { myAgent } from "./agent";
76
+ import { runEvals } from '@mastra/core/evals'
77
+ import { createFaithfulnessScorer } from '@mastra/evals/scorers/prebuilt'
78
+ import { myAgent } from './agent'
79
79
 
80
80
  // Context is typically populated from agent tool calls or RAG retrieval
81
81
  const scorer = createFaithfulnessScorer({
82
- model: "openai/gpt-4o",
83
- });
82
+ model: 'openai/gpt-5.4',
83
+ })
84
84
 
85
85
  const result = await runEvals({
86
86
  data: [
87
87
  {
88
- input: "Tell me about the Tesla Model 3.",
88
+ input: 'Tell me about the Tesla Model 3.',
89
89
  },
90
90
  {
91
- input: "What are the key features of this electric vehicle?",
91
+ input: 'What are the key features of this electric vehicle?',
92
92
  },
93
93
  ],
94
94
  scorers: [scorer],
@@ -97,11 +97,11 @@ const result = await runEvals({
97
97
  console.log({
98
98
  score: scorerResults[scorer.id].score,
99
99
  reason: scorerResults[scorer.id].reason,
100
- });
100
+ })
101
101
  },
102
- });
102
+ })
103
103
 
104
- console.log(result.scores);
104
+ console.log(result.scores)
105
105
  ```
106
106
 
107
107
  For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).