@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
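
The most significant addition in this release is the new trajectory scorer (see the `trajectory/` entries and the new `reference-evals-trajectory-accuracy.md` above). As rough orientation only, a minimal usage sketch: the export name below is an assumption based on the `create*Scorer` naming convention the package uses for its other prebuilt scorers and on the new doc's filename, and is not confirmed by this diff.

```typescript
// Orientation sketch only. This release adds trajectory scorers under
// dist/scorers/code/trajectory and dist/scorers/llm/trajectory, plus a new
// reference-evals-trajectory-accuracy.md doc. The export name below is an
// assumption based on the package's create*Scorer convention; consult the
// new reference doc for the real API.
import { createTrajectoryAccuracyScorer } from '@mastra/evals/scorers/prebuilt'

const trajectoryScorer = createTrajectoryAccuracyScorer({
  model: 'openai/gpt-5.4', // model id reused from the examples in this diff
})
```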

package/dist/docs/references/reference-evals-hallucination.md

@@ -1,4 +1,4 @@
-# Hallucination
+# Hallucination scorer
 
 The `createHallucinationScorer()` function evaluates whether an LLM generates factually correct information by comparing its output against the provided context. This scorer measures hallucination by identifying direct contradictions between the context and the output.
 
@@ -6,47 +6,37 @@ The `createHallucinationScorer()` function evaluates whether an LLM generates fa
 
 The `createHallucinationScorer()` function accepts a single options object with the following properties:
 
-**model
+**model** (`LanguageModel`): Configuration for the model used to evaluate hallucination.
 
-**options
+**options** (`Options`): Configuration options.
 
-**options.
+**options.scale** (`number`): Maximum score value.
 
-**options.
+**options.context** (`string[]`): Static context strings to use as ground truth for hallucination detection.
 
-
-
-### GetContextParams
-
-The `getContext` hook receives the following parameters:
-
-**run:** (`GetContextRun`): The scorer run containing input, output, runId, requestContext, and tracingContext.
+**options.getContext** (`(params: GetContextParams) => string[] | Promise<string[]>`): A hook to dynamically resolve context at runtime. Takes priority over static context. Useful for live scoring where context (like tool results) is only available when the scorer runs.
 
-
-
-**score:** (`number`): The computed score. Only present when called from the generateReason step.
-
-**step:** (`'analyze' | 'generateReason'`): Which step is calling the hook. Useful for caching context between calls.
+This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
 
-##
+## `.run()` returns
 
-**runId
+**runId** (`string`): The id of the run (optional).
 
-**preprocessStepResult
+**preprocessStepResult** (`object`): Object with extracted claims: { claims: string\[] }
 
-**preprocessPrompt
+**preprocessPrompt** (`string`): The prompt sent to the LLM for the preprocess step (optional).
 
-**analyzeStepResult
+**analyzeStepResult** (`object`): Object with verdicts: { verdicts: Array<{ statement: string, verdict: 'yes' | 'no', reason: string }> }
 
-**analyzePrompt
+**analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).
 
-**score
+**score** (`number`): Hallucination score (0 to scale, default 0-1).
 
-**reason
+**reason** (`string`): Detailed explanation of the score and identified contradictions.
 
-**generateReasonPrompt
+**generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).
 
-## Scoring
+## Scoring details
 
 The scorer evaluates hallucination through contradiction detection and unsupported claim analysis.
 
@@ -111,40 +101,38 @@ A hallucination score between 0 and 1:
 Use static context when you have known ground truth to compare against:
 
 ```typescript
-import { createHallucinationScorer } from
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
 
 const scorer = createHallucinationScorer({
-  model:
+  model: 'openai/gpt-5.4',
   options: {
     context: [
-
-
-
+      'The first iPhone was announced on January 9, 2007.',
+      'It was released on June 29, 2007.',
+      'Steve Jobs introduced it at Macworld.',
     ],
   },
-})
+})
 ```
 
-### Dynamic Context with getContext
+### Dynamic Context with `getContext`
 
 Use `getContext` for live scoring scenarios where context comes from tool results:
 
 ```typescript
-import { createHallucinationScorer } from
-import { extractToolResults } from
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { extractToolResults } from '@mastra/evals/scorers'
 
 const scorer = createHallucinationScorer({
-  model:
+  model: 'openai/gpt-5.4',
   options: {
     getContext: ({ run, step }) => {
       // Extract tool results as context
-      const toolResults = extractToolResults(run.output)
-      return toolResults.map((t)
-        JSON.stringify({ tool: t.toolName, result: t.result })
-      );
+      const toolResults = extractToolResults(run.output)
+      return toolResults.map(t => JSON.stringify({ tool: t.toolName, result: t.result }))
     },
   },
-})
+})
 ```
 
 ### Live Scoring with Agent
@@ -152,62 +140,57 @@ const scorer = createHallucinationScorer({
 Attach the scorer to an agent for live evaluation:
 
 ```typescript
-import { Agent } from
-import { createHallucinationScorer } from
-import { extractToolResults } from
+import { Agent } from '@mastra/core/agent'
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { extractToolResults } from '@mastra/evals/scorers'
 
 const hallucinationScorer = createHallucinationScorer({
-  model:
+  model: 'openai/gpt-5.4',
   options: {
     getContext: ({ run }) => {
-      const toolResults = extractToolResults(run.output)
-      return toolResults.map((t)
-        JSON.stringify({ tool: t.toolName, result: t.result })
-      );
+      const toolResults = extractToolResults(run.output)
+      return toolResults.map(t => JSON.stringify({ tool: t.toolName, result: t.result }))
    },
  },
-})
+})
 
 const agent = new Agent({
-  name:
-  model:
-  instructions:
+  name: 'my-agent',
+  model: 'openai/gpt-5.4',
+  instructions: 'You are a helpful assistant.',
   evals: {
     scorers: [hallucinationScorer],
   },
-})
+})
 ```
 
-### Batch Evaluation with runEvals
+### Batch Evaluation with `runEvals`
 
 ```typescript
-import { runEvals } from
-import { createHallucinationScorer } from
-import { myAgent } from
+import { runEvals } from '@mastra/core/evals'
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { myAgent } from './agent'
 
 const scorer = createHallucinationScorer({
-  model:
+  model: 'openai/gpt-5.4',
   options: {
-    context: [
+    context: ['Known fact 1', 'Known fact 2'],
   },
-})
+})
 
 const result = await runEvals({
-  data: [
-    { input: "Tell me about topic A" },
-    { input: "Tell me about topic B" },
-  ],
+  data: [{ input: 'Tell me about topic A' }, { input: 'Tell me about topic B' }],
   scorers: [scorer],
   target: myAgent,
   onItemComplete: ({ scorerResults }) => {
     console.log({
       score: scorerResults[scorer.id].score,
       reason: scorerResults[scorer.id].reason,
-    })
+    })
   },
-})
+})
 
-console.log(result.scores)
+console.log(result.scores)
 ```
 
 For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
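
The diff above removes the old `GetContextParams` section, which documented the hook's `run`, `score`, and `step` parameters and noted that `step` is useful for caching context between the analyze and generateReason calls. A minimal sketch of that caching pattern, assuming the parameter shapes described in the removed text (`run.runId`, `step: 'analyze' | 'generateReason'`); the retrieval helper is hypothetical.

```typescript
import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'

// Illustrative cache; not part of the package API.
const contextCache = new Map<string, string[]>()

const scorer = createHallucinationScorer({
  model: 'openai/gpt-5.4',
  options: {
    getContext: async ({ run, step }) => {
      // `step` is 'analyze' | 'generateReason' per the removed docs; reuse the
      // context resolved during analyze so generateReason explains the same facts.
      const key = run.runId ?? 'default'
      const cached = contextCache.get(key)
      if (step === 'generateReason' && cached) return cached
      const context = await fetchGroundTruth(run.input) // hypothetical retrieval helper
      contextCache.set(key, context)
      return context
    },
  },
})

// Hypothetical stand-in for whatever retrieval backs your ground truth.
async function fetchGroundTruth(input: unknown): Promise<string[]> {
  return ['The first iPhone was announced on January 9, 2007.']
}
```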

package/dist/docs/references/reference-evals-keyword-coverage.md

@@ -1,22 +1,22 @@
-# Keyword
+# Keyword coverage scorer
 
 The `createKeywordCoverageScorer()` function evaluates how well an LLM's output covers the important keywords from the input. It analyzes keyword presence and matches while ignoring common words and stop words.
 
 ## Parameters
 
-The `createKeywordCoverageScorer()` function
+The `createKeywordCoverageScorer()` function doesn't take any options.
 
 This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
 
-##
+## `.run()` returns
 
-**runId
+**runId** (`string`): The id of the run (optional).
 
-**preprocessStepResult
+**preprocessStepResult** (`object`): Object with extracted keywords: { referenceKeywords: Set\<string>, responseKeywords: Set\<string> }
 
-**analyzeStepResult
+**analyzeStepResult** (`object`): Object with keyword coverage: { totalKeywords: number, matchedKeywords: number }
 
-**score
+**score** (`number`): Coverage score (0-1) representing the proportion of matched keywords.
 
 `.run()` returns a result in the following shape:
 
@@ -35,7 +35,7 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
 }
 ```
 
-## Scoring
+## Scoring details
 
 The scorer evaluates keyword coverage by matching keywords with the following features:
 
@@ -85,23 +85,23 @@ The scorer handles several special cases:
 Evaluate keyword coverage between input queries and agent responses:
 
 ```typescript
-import { runEvals } from
-import { createKeywordCoverageScorer } from
-import { myAgent } from
+import { runEvals } from '@mastra/core/evals'
+import { createKeywordCoverageScorer } from '@mastra/evals/scorers/prebuilt'
+import { myAgent } from './agent'
 
-const scorer = createKeywordCoverageScorer()
+const scorer = createKeywordCoverageScorer()
 
 const result = await runEvals({
   data: [
     {
-      input:
+      input: 'JavaScript frameworks like React and Vue',
     },
     {
-      input:
+      input: 'TypeScript offers interfaces, generics, and type inference',
     },
     {
       input:
-
+        'Machine learning models require data preprocessing, feature engineering, and hyperparameter tuning',
     },
   ],
   scorers: [scorer],
@@ -109,11 +109,11 @@ const result = await runEvals({
   onItemComplete: ({ scorerResults }) => {
     console.log({
       score: scorerResults[scorer.id].score,
-    })
+    })
   },
-})
+})
 
-console.log(result.scores)
+console.log(result.scores)
 ```
 
 For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
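
Per the updated `.run()` returns above, the keyword-coverage score is the proportion of matched keywords. A small sketch of consuming that result, using only the fields this diff documents; the zero-keyword fallback is an assumption (the doc mentions special-case handling without showing it).

```typescript
// Interpreting a keyword-coverage result using only the fields the updated
// `.run()` returns section documents; the zero-keyword fallback is an
// assumption (the doc mentions special-case handling without showing it).
type KeywordCoverageResult = {
  runId?: string
  preprocessStepResult: { referenceKeywords: Set<string>; responseKeywords: Set<string> }
  analyzeStepResult: { totalKeywords: number; matchedKeywords: number }
  score: number // 0-1, proportion of matched keywords
}

function summarizeCoverage(result: KeywordCoverageResult): string {
  const { totalKeywords, matchedKeywords } = result.analyzeStepResult
  // The score should track matchedKeywords / totalKeywords.
  const ratio = totalKeywords === 0 ? 1 : matchedKeywords / totalKeywords
  return `covered ${matchedKeywords}/${totalKeywords} keywords (score ${result.score.toFixed(2)}, ratio ${ratio.toFixed(2)})`
}
```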