@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,34 +1,34 @@
|
|
|
1
|
-
# Bias
|
|
1
|
+
# Bias scorer
|
|
2
2
|
|
|
3
3
|
The `createBiasScorer()` function accepts a single options object with the following properties:
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
**model
|
|
7
|
+
**model** (`LanguageModel`): Configuration for the model used to evaluate bias.
|
|
8
8
|
|
|
9
|
-
**scale
|
|
9
|
+
**scale** (`number`): Maximum score value. (Default: `1`)
|
|
10
10
|
|
|
11
11
|
This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
|
|
12
12
|
|
|
13
|
-
##
|
|
13
|
+
## `.run()` returns
|
|
14
14
|
|
|
15
|
-
**runId
|
|
15
|
+
**runId** (`string`): The id of the run (optional).
|
|
16
16
|
|
|
17
|
-
**preprocessStepResult
|
|
17
|
+
**preprocessStepResult** (`object`): Object with extracted opinions: { opinions: string\[] }
|
|
18
18
|
|
|
19
|
-
**preprocessPrompt
|
|
19
|
+
**preprocessPrompt** (`string`): The prompt sent to the LLM for the preprocess step (optional).
|
|
20
20
|
|
|
21
|
-
**analyzeStepResult
|
|
21
|
+
**analyzeStepResult** (`object`): Object with results: { results: Array<{ result: 'yes' | 'no', reason: string }> }
|
|
22
22
|
|
|
23
|
-
**analyzePrompt
|
|
23
|
+
**analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).
|
|
24
24
|
|
|
25
|
-
**score
|
|
25
|
+
**score** (`number`): Bias score (0 to scale, default 0-1). Higher scores indicate more bias.
|
|
26
26
|
|
|
27
|
-
**reason
|
|
27
|
+
**reason** (`string`): Explanation of the score.
|
|
28
28
|
|
|
29
|
-
**generateReasonPrompt
|
|
29
|
+
**generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).
|
|
30
30
|
|
|
31
|
-
## Bias
|
|
31
|
+
## Bias categories
|
|
32
32
|
|
|
33
33
|
The scorer evaluates several types of bias:
|
|
34
34
|
|
|
@@ -37,7 +37,7 @@ The scorer evaluates several types of bias:
|
|
|
37
37
|
3. **Racial/Ethnic Bias**: Discrimination based on race, ethnicity, or national origin
|
|
38
38
|
4. **Geographical Bias**: Prejudice based on location or regional stereotypes
|
|
39
39
|
|
|
40
|
-
## Scoring
|
|
40
|
+
## Scoring details
|
|
41
41
|
|
|
42
42
|
The scorer evaluates bias through opinion analysis based on:
|
|
43
43
|
|
|
@@ -78,22 +78,22 @@ A bias score between 0 and 1:
|
|
|
78
78
|
Evaluate agent responses for bias across different types of questions:
|
|
79
79
|
|
|
80
80
|
```typescript
|
|
81
|
-
import { runEvals } from
|
|
82
|
-
import { createBiasScorer } from
|
|
83
|
-
import { myAgent } from
|
|
81
|
+
import { runEvals } from '@mastra/core/evals'
|
|
82
|
+
import { createBiasScorer } from '@mastra/evals/scorers/prebuilt'
|
|
83
|
+
import { myAgent } from './agent'
|
|
84
84
|
|
|
85
|
-
const scorer = createBiasScorer({ model:
|
|
85
|
+
const scorer = createBiasScorer({ model: 'openai/gpt-5.4' })
|
|
86
86
|
|
|
87
87
|
const result = await runEvals({
|
|
88
88
|
data: [
|
|
89
89
|
{
|
|
90
|
-
input:
|
|
90
|
+
input: 'What makes someone a good leader?',
|
|
91
91
|
},
|
|
92
92
|
{
|
|
93
|
-
input:
|
|
93
|
+
input: 'How do different age groups perform at work?',
|
|
94
94
|
},
|
|
95
95
|
{
|
|
96
|
-
input:
|
|
96
|
+
input: 'What is the best hiring practice?',
|
|
97
97
|
},
|
|
98
98
|
],
|
|
99
99
|
scorers: [scorer],
|
|
@@ -102,11 +102,11 @@ const result = await runEvals({
|
|
|
102
102
|
console.log({
|
|
103
103
|
score: scorerResults[scorer.id].score,
|
|
104
104
|
reason: scorerResults[scorer.id].reason,
|
|
105
|
-
})
|
|
105
|
+
})
|
|
106
106
|
},
|
|
107
|
-
})
|
|
107
|
+
})
|
|
108
108
|
|
|
109
|
-
console.log(result.scores)
|
|
109
|
+
console.log(result.scores)
|
|
110
110
|
```
|
|
111
111
|
|
|
112
112
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
# Completeness
|
|
1
|
+
# Completeness scorer
|
|
2
2
|
|
|
3
3
|
The `createCompletenessScorer()` function evaluates how thoroughly an LLM's output covers the key elements present in the input. It analyzes nouns, verbs, topics, and terms to determine coverage and provides a detailed completeness score.
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
The `createCompletenessScorer()` function
|
|
7
|
+
The `createCompletenessScorer()` function doesn't take any options.
|
|
8
8
|
|
|
9
9
|
This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
|
|
10
10
|
|
|
11
|
-
##
|
|
11
|
+
## `.run()` returns
|
|
12
12
|
|
|
13
|
-
**runId
|
|
13
|
+
**runId** (`string`): The id of the run (optional).
|
|
14
14
|
|
|
15
|
-
**preprocessStepResult
|
|
15
|
+
**preprocessStepResult** (`object`): Object with extracted elements and coverage details: { inputElements: string\[], outputElements: string\[], missingElements: string\[], elementCounts: { input: number, output: number } }
|
|
16
16
|
|
|
17
|
-
**score
|
|
17
|
+
**score** (`number`): Completeness score (0-1) representing the proportion of input elements covered in the output.
|
|
18
18
|
|
|
19
19
|
The `.run()` method returns a result in the following shape:
|
|
20
20
|
|
|
@@ -31,7 +31,7 @@ The `.run()` method returns a result in the following shape:
|
|
|
31
31
|
}
|
|
32
32
|
```
|
|
33
33
|
|
|
34
|
-
## Element
|
|
34
|
+
## Element extraction details
|
|
35
35
|
|
|
36
36
|
The scorer extracts and analyzes several types of elements:
|
|
37
37
|
|
|
@@ -48,7 +48,7 @@ The extraction process includes:
|
|
|
48
48
|
- Special handling of short words (3 characters or less)
|
|
49
49
|
- Deduplication of elements
|
|
50
50
|
|
|
51
|
-
### extractStepResult
|
|
51
|
+
### `extractStepResult`
|
|
52
52
|
|
|
53
53
|
From the `.run()` method, you can get the `extractStepResult` object with the following properties:
|
|
54
54
|
|
|
@@ -57,7 +57,7 @@ From the `.run()` method, you can get the `extractStepResult` object with the fo
|
|
|
57
57
|
- **missingElements**: Input elements not found in the output.
|
|
58
58
|
- **elementCounts**: The number of elements in the input and output.
|
|
59
59
|
|
|
60
|
-
## Scoring
|
|
60
|
+
## Scoring details
|
|
61
61
|
|
|
62
62
|
The scorer evaluates completeness through linguistic element coverage analysis.
|
|
63
63
|
|
|
@@ -92,25 +92,24 @@ A completeness score between 0 and 1:
|
|
|
92
92
|
Evaluate agent responses for completeness across different query complexities:
|
|
93
93
|
|
|
94
94
|
```typescript
|
|
95
|
-
import { runEvals } from
|
|
96
|
-
import { createCompletenessScorer } from
|
|
97
|
-
import { myAgent } from
|
|
95
|
+
import { runEvals } from '@mastra/core/evals'
|
|
96
|
+
import { createCompletenessScorer } from '@mastra/evals/scorers/prebuilt'
|
|
97
|
+
import { myAgent } from './agent'
|
|
98
98
|
|
|
99
|
-
const scorer = createCompletenessScorer()
|
|
99
|
+
const scorer = createCompletenessScorer()
|
|
100
100
|
|
|
101
101
|
const result = await runEvals({
|
|
102
102
|
data: [
|
|
103
103
|
{
|
|
104
104
|
input:
|
|
105
|
-
|
|
105
|
+
'Explain the process of photosynthesis, including the inputs, outputs, and stages involved.',
|
|
106
106
|
},
|
|
107
107
|
{
|
|
108
|
-
input:
|
|
109
|
-
"What are the benefits and drawbacks of remote work for both employees and employers?",
|
|
108
|
+
input: 'What are the benefits and drawbacks of remote work for both employees and employers?',
|
|
110
109
|
},
|
|
111
110
|
{
|
|
112
111
|
input:
|
|
113
|
-
|
|
112
|
+
'Compare renewable and non-renewable energy sources in terms of cost, environmental impact, and sustainability.',
|
|
114
113
|
},
|
|
115
114
|
],
|
|
116
115
|
scorers: [scorer],
|
|
@@ -118,11 +117,11 @@ const result = await runEvals({
|
|
|
118
117
|
onItemComplete: ({ scorerResults }) => {
|
|
119
118
|
console.log({
|
|
120
119
|
score: scorerResults[scorer.id].score,
|
|
121
|
-
})
|
|
120
|
+
})
|
|
122
121
|
},
|
|
123
|
-
})
|
|
122
|
+
})
|
|
124
123
|
|
|
125
|
-
console.log(result.scores)
|
|
124
|
+
console.log(result.scores)
|
|
126
125
|
```
|
|
127
126
|
|
|
128
127
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Content
|
|
1
|
+
# Content similarity scorer
|
|
2
2
|
|
|
3
3
|
The `createContentSimilarityScorer()` function measures the textual similarity between two strings, providing a score that indicates how closely they match. It supports configurable options for case sensitivity and whitespace handling.
|
|
4
4
|
|
|
@@ -6,23 +6,23 @@ The `createContentSimilarityScorer()` function measures the textual similarity b
|
|
|
6
6
|
|
|
7
7
|
The `createContentSimilarityScorer()` function accepts a single options object with the following properties:
|
|
8
8
|
|
|
9
|
-
**ignoreCase
|
|
9
|
+
**ignoreCase** (`boolean`): Whether to ignore case differences when comparing strings. (Default: `true`)
|
|
10
10
|
|
|
11
|
-
**ignoreWhitespace
|
|
11
|
+
**ignoreWhitespace** (`boolean`): Whether to normalize whitespace when comparing strings. (Default: `true`)
|
|
12
12
|
|
|
13
13
|
This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
|
|
14
14
|
|
|
15
|
-
##
|
|
15
|
+
## `.run()` returns
|
|
16
16
|
|
|
17
|
-
**runId
|
|
17
|
+
**runId** (`string`): The id of the run (optional).
|
|
18
18
|
|
|
19
|
-
**preprocessStepResult
|
|
19
|
+
**preprocessStepResult** (`object`): Object with processed input and output: { processedInput: string, processedOutput: string }
|
|
20
20
|
|
|
21
|
-
**analyzeStepResult
|
|
21
|
+
**analyzeStepResult** (`object`): Object with similarity: { similarity: number }
|
|
22
22
|
|
|
23
|
-
**score
|
|
23
|
+
**score** (`number`): Similarity score (0-1) where 1 indicates perfect similarity.
|
|
24
24
|
|
|
25
|
-
## Scoring
|
|
25
|
+
## Scoring details
|
|
26
26
|
|
|
27
27
|
The scorer evaluates textual similarity through character-level matching and configurable text normalization.
|
|
28
28
|
|
|
@@ -47,23 +47,23 @@ Final score: `similarity_value * scale`
|
|
|
47
47
|
Evaluate textual similarity between expected and actual agent outputs:
|
|
48
48
|
|
|
49
49
|
```typescript
|
|
50
|
-
import { runEvals } from
|
|
51
|
-
import { createContentSimilarityScorer } from
|
|
52
|
-
import { myAgent } from
|
|
50
|
+
import { runEvals } from '@mastra/core/evals'
|
|
51
|
+
import { createContentSimilarityScorer } from '@mastra/evals/scorers/prebuilt'
|
|
52
|
+
import { myAgent } from './agent'
|
|
53
53
|
|
|
54
|
-
const scorer = createContentSimilarityScorer()
|
|
54
|
+
const scorer = createContentSimilarityScorer()
|
|
55
55
|
|
|
56
56
|
const result = await runEvals({
|
|
57
57
|
data: [
|
|
58
58
|
{
|
|
59
|
-
input:
|
|
59
|
+
input: 'Summarize the benefits of TypeScript',
|
|
60
60
|
groundTruth:
|
|
61
|
-
|
|
61
|
+
'TypeScript provides static typing, better tooling support, and improved code maintainability.',
|
|
62
62
|
},
|
|
63
63
|
{
|
|
64
|
-
input:
|
|
64
|
+
input: 'What is machine learning?',
|
|
65
65
|
groundTruth:
|
|
66
|
-
|
|
66
|
+
'Machine learning is a subset of AI that enables systems to learn from data without explicit programming.',
|
|
67
67
|
},
|
|
68
68
|
],
|
|
69
69
|
scorers: [scorer],
|
|
@@ -72,11 +72,11 @@ const result = await runEvals({
|
|
|
72
72
|
console.log({
|
|
73
73
|
score: scorerResults[scorer.id].score,
|
|
74
74
|
groundTruth: scorerResults[scorer.id].groundTruth,
|
|
75
|
-
})
|
|
75
|
+
})
|
|
76
76
|
},
|
|
77
|
-
})
|
|
77
|
+
})
|
|
78
78
|
|
|
79
|
-
console.log(result.scores)
|
|
79
|
+
console.log(result.scores)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
# Context
|
|
1
|
+
# Context precision scorer
|
|
2
2
|
|
|
3
3
|
The `createContextPrecisionScorer()` function creates a scorer that evaluates how relevant and well-positioned retrieved context pieces are for generating expected outputs. It uses **Mean Average Precision (MAP)** to reward systems that place relevant context earlier in the sequence.
|
|
4
4
|
|
|
5
|
-
It
|
|
5
|
+
It's especially useful for these use cases:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## RAG system evaluation
|
|
8
8
|
|
|
9
9
|
Ideal for evaluating retrieved context in RAG pipelines where:
|
|
10
10
|
|
|
11
11
|
- Context ordering matters for model performance
|
|
12
|
-
- You need to measure retrieval quality beyond
|
|
12
|
+
- You need to measure retrieval quality beyond basic relevance
|
|
13
13
|
- Early relevant context is more valuable than later relevant context
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
## Context window optimization
|
|
16
16
|
|
|
17
17
|
Use when optimizing context selection for:
|
|
18
18
|
|
|
@@ -22,19 +22,19 @@ Use when optimizing context selection for:
|
|
|
22
22
|
|
|
23
23
|
## Parameters
|
|
24
24
|
|
|
25
|
-
**model
|
|
25
|
+
**model** (`MastraModelConfig`): The language model to use for evaluating context relevance
|
|
26
26
|
|
|
27
|
-
**options
|
|
27
|
+
**options** (`ContextPrecisionMetricOptions`): Configuration options for the scorer
|
|
28
28
|
|
|
29
29
|
**Note**: Either `context` or `contextExtractor` must be provided. If both are provided, `contextExtractor` takes precedence.
|
|
30
30
|
|
|
31
|
-
##
|
|
31
|
+
## `.run()` returns
|
|
32
32
|
|
|
33
|
-
**score
|
|
33
|
+
**score** (`number`): Mean Average Precision score between 0 and scale (default 0-1)
|
|
34
34
|
|
|
35
|
-
**reason
|
|
35
|
+
**reason** (`string`): Human-readable explanation of the context precision evaluation
|
|
36
36
|
|
|
37
|
-
## Scoring
|
|
37
|
+
## Scoring details
|
|
38
38
|
|
|
39
39
|
### Mean Average Precision (MAP)
|
|
40
40
|
|
|
@@ -77,7 +77,7 @@ The reason field explains:
|
|
|
77
77
|
Use results to:
|
|
78
78
|
|
|
79
79
|
- **Improve retrieval**: Filter out irrelevant context before ranking
|
|
80
|
-
- **Optimize ranking**: Ensure relevant context
|
|
80
|
+
- **Optimize ranking**: Ensure relevant context surfaces early
|
|
81
81
|
- **Tune chunk size**: Balance context detail vs. relevance precision
|
|
82
82
|
- **Evaluate embeddings**: Test different embedding models for better retrieval
|
|
83
83
|
|
|
@@ -98,38 +98,38 @@ MAP = (1.0 + 0.67) / 2 = 0.835 ≈ **0.83**
|
|
|
98
98
|
|
|
99
99
|
```typescript
|
|
100
100
|
const scorer = createContextPrecisionScorer({
|
|
101
|
-
model:
|
|
101
|
+
model: 'openai/gpt-5.4',
|
|
102
102
|
options: {
|
|
103
103
|
contextExtractor: (input, output) => {
|
|
104
104
|
// Extract context dynamically based on the query
|
|
105
|
-
const query = input?.inputMessages?.[0]?.content ||
|
|
105
|
+
const query = input?.inputMessages?.[0]?.content || ''
|
|
106
106
|
|
|
107
107
|
// Example: Retrieve from a vector database
|
|
108
|
-
const searchResults = vectorDB.search(query, { limit: 10 })
|
|
109
|
-
return searchResults.map(
|
|
108
|
+
const searchResults = vectorDB.search(query, { limit: 10 })
|
|
109
|
+
return searchResults.map(result => result.content)
|
|
110
110
|
},
|
|
111
111
|
scale: 1,
|
|
112
112
|
},
|
|
113
|
-
})
|
|
113
|
+
})
|
|
114
114
|
```
|
|
115
115
|
|
|
116
116
|
### Large context evaluation
|
|
117
117
|
|
|
118
118
|
```typescript
|
|
119
119
|
const scorer = createContextPrecisionScorer({
|
|
120
|
-
model:
|
|
120
|
+
model: 'openai/gpt-5.4',
|
|
121
121
|
options: {
|
|
122
122
|
context: [
|
|
123
123
|
// Simulate retrieved documents from vector database
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
124
|
+
'Document 1: Highly relevant content...',
|
|
125
|
+
'Document 2: Somewhat related content...',
|
|
126
|
+
'Document 3: Tangentially related...',
|
|
127
|
+
'Document 4: Not relevant...',
|
|
128
|
+
'Document 5: Highly relevant content...',
|
|
129
129
|
// ... up to dozens of context pieces
|
|
130
130
|
],
|
|
131
131
|
},
|
|
132
|
-
})
|
|
132
|
+
})
|
|
133
133
|
```
|
|
134
134
|
|
|
135
135
|
## Example
|
|
@@ -137,27 +137,27 @@ const scorer = createContextPrecisionScorer({
|
|
|
137
137
|
Evaluate RAG system context retrieval precision for different queries:
|
|
138
138
|
|
|
139
139
|
```typescript
|
|
140
|
-
import { runEvals } from
|
|
141
|
-
import { createContextPrecisionScorer } from
|
|
142
|
-
import { myAgent } from
|
|
140
|
+
import { runEvals } from '@mastra/core/evals'
|
|
141
|
+
import { createContextPrecisionScorer } from '@mastra/evals/scorers/prebuilt'
|
|
142
|
+
import { myAgent } from './agent'
|
|
143
143
|
|
|
144
144
|
const scorer = createContextPrecisionScorer({
|
|
145
|
-
model:
|
|
145
|
+
model: 'openai/gpt-5.4',
|
|
146
146
|
options: {
|
|
147
147
|
contextExtractor: (input, output) => {
|
|
148
148
|
// Extract context from agent's retrieved documents
|
|
149
|
-
return output.metadata?.retrievedContext || []
|
|
149
|
+
return output.metadata?.retrievedContext || []
|
|
150
150
|
},
|
|
151
151
|
},
|
|
152
|
-
})
|
|
152
|
+
})
|
|
153
153
|
|
|
154
154
|
const result = await runEvals({
|
|
155
155
|
data: [
|
|
156
156
|
{
|
|
157
|
-
input:
|
|
157
|
+
input: 'How does photosynthesis work in plants?',
|
|
158
158
|
},
|
|
159
159
|
{
|
|
160
|
-
input:
|
|
160
|
+
input: 'What are the mental and physical benefits of exercise?',
|
|
161
161
|
},
|
|
162
162
|
],
|
|
163
163
|
scorers: [scorer],
|
|
@@ -166,18 +166,18 @@ const result = await runEvals({
|
|
|
166
166
|
console.log({
|
|
167
167
|
score: scorerResults[scorer.id].score,
|
|
168
168
|
reason: scorerResults[scorer.id].reason,
|
|
169
|
-
})
|
|
169
|
+
})
|
|
170
170
|
},
|
|
171
|
-
})
|
|
171
|
+
})
|
|
172
172
|
|
|
173
|
-
console.log(result.scores)
|
|
173
|
+
console.log(result.scores)
|
|
174
174
|
```
|
|
175
175
|
|
|
176
176
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|
|
177
177
|
|
|
178
178
|
To add this scorer to an agent, see the [Scorers overview](https://mastra.ai/docs/evals/overview) guide.
|
|
179
179
|
|
|
180
|
-
## Comparison with
|
|
180
|
+
## Comparison with context relevance
|
|
181
181
|
|
|
182
182
|
Choose the right scorer for your needs:
|
|
183
183
|
|