@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
# Tool
|
|
1
|
+
# Tool call accuracy scorers
|
|
2
2
|
|
|
3
3
|
Mastra provides two tool call accuracy scorers for evaluating whether an LLM selects the correct tools from available options:
|
|
4
4
|
|
|
5
5
|
1. **Code-based scorer** - Deterministic evaluation using exact tool matching
|
|
6
6
|
2. **LLM-based scorer** - Semantic evaluation using AI to assess appropriateness
|
|
7
7
|
|
|
8
|
-
## Choosing
|
|
8
|
+
## Choosing between scorers
|
|
9
9
|
|
|
10
10
|
### Use the Code-Based Scorer When:
|
|
11
11
|
|
|
@@ -23,17 +23,17 @@ Mastra provides two tool call accuracy scorers for evaluating whether an LLM sel
|
|
|
23
23
|
- You need **explanations** for scoring decisions
|
|
24
24
|
- You're evaluating **production agent behavior**
|
|
25
25
|
|
|
26
|
-
## Code-
|
|
26
|
+
## Code-based tool call accuracy scorer
|
|
27
27
|
|
|
28
28
|
The `createToolCallAccuracyScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides deterministic binary scoring based on exact tool matching and supports both strict and lenient evaluation modes, as well as tool calling order validation.
|
|
29
29
|
|
|
30
30
|
### Parameters
|
|
31
31
|
|
|
32
|
-
**expectedTool
|
|
32
|
+
**expectedTool** (`string`): The name of the tool that should be called for the given task. Ignored when `expectedToolOrder` is provided.
|
|
33
33
|
|
|
34
|
-
**strictMode
|
|
34
|
+
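**strictMode** (`boolean`): Controls evaluation strictness. When `true` in single tool mode, the scorer returns `1` only if exactly one tool is called and it matches the expected tool. When `true` in order checking mode, tools must be called in exactly the specified order with no extra tools. When `false`, extra tool calls are allowed in both modes.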
|
|
35
35
|
|
|
36
|
-
**expectedToolOrder
|
|
36
|
+
**expectedToolOrder** (`string[]`): Array of tool names in the expected calling order. When provided, enables order checking mode and ignores the `expectedTool` parameter.
|
|
37
37
|
|
|
38
38
|
This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
|
|
39
39
|
|
|
@@ -43,7 +43,7 @@ The code-based scorer operates in two distinct modes:
|
|
|
43
43
|
|
|
44
44
|
#### Single Tool Mode
|
|
45
45
|
|
|
46
|
-
When `expectedToolOrder`
|
|
46
|
+
When `expectedToolOrder` isn't provided, the scorer evaluates single tool selection:
|
|
47
47
|
|
|
48
48
|
- **Standard Mode (strictMode: false)**: Returns `1` if the expected tool is called, regardless of other tools
|
|
49
49
|
- **Strict Mode (strictMode: true)**: Returns `1` only if exactly one tool is called and it matches the expected tool
|
|
@@ -55,7 +55,7 @@ When `expectedToolOrder` is provided, the scorer validates tool calling sequence
|
|
|
55
55
|
- **Strict Order (strictMode: true)**: Tools must be called in exactly the specified order with no extra tools
|
|
56
56
|
- **Flexible Order (strictMode: false)**: Expected tools must appear in correct relative order (extra tools allowed)
|
|
57
57
|
|
|
58
|
-
## Code-
|
|
58
|
+
## Code-based scoring details
|
|
59
59
|
|
|
60
60
|
- **Binary scores**: Always returns 0 or 1
|
|
61
61
|
- **Deterministic**: Same input always produces same output
|
|
@@ -66,22 +66,22 @@ When `expectedToolOrder` is provided, the scorer validates tool calling sequence
|
|
|
66
66
|
```typescript
|
|
67
67
|
// Standard mode - passes if expected tool is called
|
|
68
68
|
const lenientScorer = createCodeScorer({
|
|
69
|
-
expectedTool:
|
|
69
|
+
expectedTool: 'search-tool',
|
|
70
70
|
strictMode: false,
|
|
71
|
-
})
|
|
71
|
+
})
|
|
72
72
|
|
|
73
73
|
// Strict mode - only passes if exactly one tool is called
|
|
74
74
|
const strictScorer = createCodeScorer({
|
|
75
|
-
expectedTool:
|
|
75
|
+
expectedTool: 'search-tool',
|
|
76
76
|
strictMode: true,
|
|
77
|
-
})
|
|
77
|
+
})
|
|
78
78
|
|
|
79
79
|
// Order checking with strict mode
|
|
80
80
|
const strictOrderScorer = createCodeScorer({
|
|
81
|
-
expectedTool:
|
|
82
|
-
expectedToolOrder: [
|
|
81
|
+
expectedTool: 'step1-tool',
|
|
82
|
+
expectedToolOrder: ['step1-tool', 'step2-tool', 'step3-tool'],
|
|
83
83
|
strictMode: true, // no extra tools allowed
|
|
84
|
-
})
|
|
84
|
+
})
|
|
85
85
|
```
|
|
86
86
|
|
|
87
87
|
### Code-Based Scorer Results
|
|
@@ -103,7 +103,7 @@ const strictOrderScorer = createCodeScorer({
|
|
|
103
103
|
}
|
|
104
104
|
```
|
|
105
105
|
|
|
106
|
-
## Code-
|
|
106
|
+
## Code-based scorer examples
|
|
107
107
|
|
|
108
108
|
The code-based scorer provides deterministic, binary scoring (0 or 1) based on exact tool matching.
|
|
109
109
|
|
|
@@ -111,40 +111,40 @@ The code-based scorer provides deterministic, binary scoring (0 or 1) based on e
|
|
|
111
111
|
|
|
112
112
|
```typescript
|
|
113
113
|
const scorer = createToolCallAccuracyScorerCode({
|
|
114
|
-
expectedTool:
|
|
115
|
-
})
|
|
114
|
+
expectedTool: 'weather-tool',
|
|
115
|
+
})
|
|
116
116
|
|
|
117
117
|
// Simulate LLM input and output with tool call
|
|
118
118
|
const inputMessages = [
|
|
119
119
|
createTestMessage({
|
|
120
|
-
content:
|
|
121
|
-
role:
|
|
122
|
-
id:
|
|
120
|
+
content: 'What is the weather like in New York today?',
|
|
121
|
+
role: 'user',
|
|
122
|
+
id: 'input-1',
|
|
123
123
|
}),
|
|
124
|
-
]
|
|
124
|
+
]
|
|
125
125
|
|
|
126
126
|
const output = [
|
|
127
127
|
createTestMessage({
|
|
128
|
-
content:
|
|
129
|
-
role:
|
|
130
|
-
id:
|
|
128
|
+
content: 'Let me check the weather for you.',
|
|
129
|
+
role: 'assistant',
|
|
130
|
+
id: 'output-1',
|
|
131
131
|
toolInvocations: [
|
|
132
132
|
createToolInvocation({
|
|
133
|
-
toolCallId:
|
|
134
|
-
toolName:
|
|
135
|
-
args: { location:
|
|
136
|
-
result: { temperature:
|
|
137
|
-
state:
|
|
133
|
+
toolCallId: 'call-123',
|
|
134
|
+
toolName: 'weather-tool',
|
|
135
|
+
args: { location: 'New York' },
|
|
136
|
+
result: { temperature: '72°F', condition: 'sunny' },
|
|
137
|
+
state: 'result',
|
|
138
138
|
}),
|
|
139
139
|
],
|
|
140
140
|
}),
|
|
141
|
-
]
|
|
141
|
+
]
|
|
142
142
|
|
|
143
|
-
const run = createAgentTestRun({ inputMessages, output })
|
|
144
|
-
const result = await scorer.run(run)
|
|
143
|
+
const run = createAgentTestRun({ inputMessages, output })
|
|
144
|
+
const result = await scorer.run(run)
|
|
145
145
|
|
|
146
|
-
console.log(result.score)
|
|
147
|
-
console.log(result.preprocessStepResult?.correctToolCalled)
|
|
146
|
+
console.log(result.score) // 1
|
|
147
|
+
console.log(result.preprocessStepResult?.correctToolCalled) // true
|
|
148
148
|
```
|
|
149
149
|
|
|
150
150
|
### Strict mode evaluation
|
|
@@ -153,37 +153,37 @@ Only passes if exactly one tool is called:
|
|
|
153
153
|
|
|
154
154
|
```typescript
|
|
155
155
|
const strictScorer = createToolCallAccuracyScorerCode({
|
|
156
|
-
expectedTool:
|
|
156
|
+
expectedTool: 'weather-tool',
|
|
157
157
|
strictMode: true,
|
|
158
|
-
})
|
|
158
|
+
})
|
|
159
159
|
|
|
160
160
|
// Multiple tools called - fails in strict mode
|
|
161
161
|
const output = [
|
|
162
162
|
createTestMessage({
|
|
163
|
-
content:
|
|
164
|
-
role:
|
|
165
|
-
id:
|
|
163
|
+
content: 'Let me help you with that.',
|
|
164
|
+
role: 'assistant',
|
|
165
|
+
id: 'output-1',
|
|
166
166
|
toolInvocations: [
|
|
167
167
|
createToolInvocation({
|
|
168
|
-
toolCallId:
|
|
169
|
-
toolName:
|
|
168
|
+
toolCallId: 'call-1',
|
|
169
|
+
toolName: 'search-tool',
|
|
170
170
|
args: {},
|
|
171
171
|
result: {},
|
|
172
|
-
state:
|
|
172
|
+
state: 'result',
|
|
173
173
|
}),
|
|
174
174
|
createToolInvocation({
|
|
175
|
-
toolCallId:
|
|
176
|
-
toolName:
|
|
177
|
-
args: { location:
|
|
178
|
-
result: { temperature:
|
|
179
|
-
state:
|
|
175
|
+
toolCallId: 'call-2',
|
|
176
|
+
toolName: 'weather-tool',
|
|
177
|
+
args: { location: 'New York' },
|
|
178
|
+
result: { temperature: '20°C' },
|
|
179
|
+
state: 'result',
|
|
180
180
|
}),
|
|
181
181
|
],
|
|
182
182
|
}),
|
|
183
|
-
]
|
|
183
|
+
]
|
|
184
184
|
|
|
185
|
-
const result = await strictScorer.run(run)
|
|
186
|
-
console.log(result.score)
|
|
185
|
+
const result = await strictScorer.run(run)
|
|
186
|
+
console.log(result.score) // 0 - fails because multiple tools were called
|
|
187
187
|
```
|
|
188
188
|
|
|
189
189
|
### Tool order validation
|
|
@@ -192,37 +192,37 @@ Validates that tools are called in a specific sequence:
|
|
|
192
192
|
|
|
193
193
|
```typescript
|
|
194
194
|
const orderScorer = createToolCallAccuracyScorerCode({
|
|
195
|
-
expectedTool:
|
|
196
|
-
expectedToolOrder: [
|
|
195
|
+
expectedTool: 'auth-tool', // ignored when order is specified
|
|
196
|
+
expectedToolOrder: ['auth-tool', 'fetch-tool'],
|
|
197
197
|
strictMode: true, // no extra tools allowed
|
|
198
|
-
})
|
|
198
|
+
})
|
|
199
199
|
|
|
200
200
|
const output = [
|
|
201
201
|
createTestMessage({
|
|
202
|
-
content:
|
|
203
|
-
role:
|
|
204
|
-
id:
|
|
202
|
+
content: 'I will authenticate and fetch the data.',
|
|
203
|
+
role: 'assistant',
|
|
204
|
+
id: 'output-1',
|
|
205
205
|
toolInvocations: [
|
|
206
206
|
createToolInvocation({
|
|
207
|
-
toolCallId:
|
|
208
|
-
toolName:
|
|
209
|
-
args: { token:
|
|
207
|
+
toolCallId: 'call-1',
|
|
208
|
+
toolName: 'auth-tool',
|
|
209
|
+
args: { token: 'abc123' },
|
|
210
210
|
result: { authenticated: true },
|
|
211
|
-
state:
|
|
211
|
+
state: 'result',
|
|
212
212
|
}),
|
|
213
213
|
createToolInvocation({
|
|
214
|
-
toolCallId:
|
|
215
|
-
toolName:
|
|
216
|
-
args: { endpoint:
|
|
217
|
-
result: { data: [
|
|
218
|
-
state:
|
|
214
|
+
toolCallId: 'call-2',
|
|
215
|
+
toolName: 'fetch-tool',
|
|
216
|
+
args: { endpoint: '/data' },
|
|
217
|
+
result: { data: ['item1'] },
|
|
218
|
+
state: 'result',
|
|
219
219
|
}),
|
|
220
220
|
],
|
|
221
221
|
}),
|
|
222
|
-
]
|
|
222
|
+
]
|
|
223
223
|
|
|
224
|
-
const result = await orderScorer.run(run)
|
|
225
|
-
console.log(result.score)
|
|
224
|
+
const result = await orderScorer.run(run)
|
|
225
|
+
console.log(result.score) // 1 - correct order
|
|
226
226
|
```
|
|
227
227
|
|
|
228
228
|
### Flexible order mode
|
|
@@ -231,55 +231,55 @@ Allows extra tools as long as expected tools maintain relative order:
|
|
|
231
231
|
|
|
232
232
|
```typescript
|
|
233
233
|
const flexibleOrderScorer = createToolCallAccuracyScorerCode({
|
|
234
|
-
expectedTool:
|
|
235
|
-
expectedToolOrder: [
|
|
234
|
+
expectedTool: 'auth-tool',
|
|
235
|
+
expectedToolOrder: ['auth-tool', 'fetch-tool'],
|
|
236
236
|
strictMode: false, // allows extra tools
|
|
237
|
-
})
|
|
237
|
+
})
|
|
238
238
|
|
|
239
239
|
const output = [
|
|
240
240
|
createTestMessage({
|
|
241
|
-
content:
|
|
242
|
-
role:
|
|
243
|
-
id:
|
|
241
|
+
content: 'Performing comprehensive operation.',
|
|
242
|
+
role: 'assistant',
|
|
243
|
+
id: 'output-1',
|
|
244
244
|
toolInvocations: [
|
|
245
245
|
createToolInvocation({
|
|
246
|
-
toolCallId:
|
|
247
|
-
toolName:
|
|
248
|
-
args: { token:
|
|
246
|
+
toolCallId: 'call-1',
|
|
247
|
+
toolName: 'auth-tool',
|
|
248
|
+
args: { token: 'abc123' },
|
|
249
249
|
result: { authenticated: true },
|
|
250
|
-
state:
|
|
250
|
+
state: 'result',
|
|
251
251
|
}),
|
|
252
252
|
createToolInvocation({
|
|
253
|
-
toolCallId:
|
|
254
|
-
toolName:
|
|
255
|
-
args: { message:
|
|
253
|
+
toolCallId: 'call-2',
|
|
254
|
+
toolName: 'log-tool', // Extra tool - OK in flexible mode
|
|
255
|
+
args: { message: 'Starting fetch' },
|
|
256
256
|
result: { logged: true },
|
|
257
|
-
state:
|
|
257
|
+
state: 'result',
|
|
258
258
|
}),
|
|
259
259
|
createToolInvocation({
|
|
260
|
-
toolCallId:
|
|
261
|
-
toolName:
|
|
262
|
-
args: { endpoint:
|
|
263
|
-
result: { data: [
|
|
264
|
-
state:
|
|
260
|
+
toolCallId: 'call-3',
|
|
261
|
+
toolName: 'fetch-tool',
|
|
262
|
+
args: { endpoint: '/data' },
|
|
263
|
+
result: { data: ['item1'] },
|
|
264
|
+
state: 'result',
|
|
265
265
|
}),
|
|
266
266
|
],
|
|
267
267
|
}),
|
|
268
|
-
]
|
|
268
|
+
]
|
|
269
269
|
|
|
270
|
-
const result = await flexibleOrderScorer.run(run)
|
|
271
|
-
console.log(result.score)
|
|
270
|
+
const result = await flexibleOrderScorer.run(run)
|
|
271
|
+
console.log(result.score) // 1 - auth-tool comes before fetch-tool
|
|
272
272
|
```
|
|
273
273
|
|
|
274
|
-
## LLM-
|
|
274
|
+
## LLM-based tool call accuracy scorer
|
|
275
275
|
|
|
276
276
|
The `createToolCallAccuracyScorerLLM()` function from `@mastra/evals/scorers/prebuilt` uses an LLM to evaluate whether the tools called by an agent are appropriate for the given user request, providing semantic evaluation rather than exact matching.
|
|
277
277
|
|
|
278
278
|
### Parameters
|
|
279
279
|
|
|
280
|
-
**model
|
|
280
|
+
**model** (`MastraModelConfig`): The LLM model to use for evaluating tool appropriateness
|
|
281
281
|
|
|
282
|
-
**availableTools
|
|
282
|
+
**availableTools** (`Array<{name: string, description: string}>`): List of available tools with their descriptions for context
|
|
283
283
|
|
|
284
284
|
### Features
|
|
285
285
|
|
|
@@ -298,7 +298,7 @@ The LLM-based scorer provides:
|
|
|
298
298
|
3. **Generate Score**: Calculates score based on appropriate vs total tool calls
|
|
299
299
|
4. **Generate Reasoning**: Provides human-readable explanation
|
|
300
300
|
|
|
301
|
-
## LLM-
|
|
301
|
+
## LLM-based scoring details
|
|
302
302
|
|
|
303
303
|
- **Fractional scores**: Returns values between 0.0 and 1.0
|
|
304
304
|
- **Context-aware**: Considers user intent and appropriateness
|
|
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
|
|
|
309
309
|
```typescript
|
|
310
310
|
// Basic configuration
|
|
311
311
|
const basicLLMScorer = createLLMScorer({
|
|
312
|
-
model: 'openai/gpt-5.
|
|
312
|
+
model: 'openai/gpt-5.4',
|
|
313
313
|
availableTools: [
|
|
314
314
|
{ name: 'tool1', description: 'Description 1' },
|
|
315
315
|
{ name: 'tool2', description: 'Description 2' }
|
|
@@ -341,7 +341,7 @@ const customModelScorer = createLLMScorer({
|
|
|
341
341
|
}
|
|
342
342
|
```
|
|
343
343
|
|
|
344
|
-
## LLM-
|
|
344
|
+
## LLM-based scorer examples
|
|
345
345
|
|
|
346
346
|
The LLM-based scorer uses AI to evaluate whether tool selections are appropriate for the user's request.
|
|
347
347
|
|
|
@@ -349,53 +349,53 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
|
|
|
349
349
|
|
|
350
350
|
```typescript
|
|
351
351
|
const llmScorer = createToolCallAccuracyScorerLLM({
|
|
352
|
-
model:
|
|
352
|
+
model: 'openai/gpt-5.4',
|
|
353
353
|
availableTools: [
|
|
354
354
|
{
|
|
355
|
-
name:
|
|
356
|
-
description:
|
|
355
|
+
name: 'weather-tool',
|
|
356
|
+
description: 'Get current weather information for any location',
|
|
357
357
|
},
|
|
358
358
|
{
|
|
359
|
-
name:
|
|
360
|
-
description:
|
|
359
|
+
name: 'calendar-tool',
|
|
360
|
+
description: 'Check calendar events and scheduling',
|
|
361
361
|
},
|
|
362
362
|
{
|
|
363
|
-
name:
|
|
364
|
-
description:
|
|
363
|
+
name: 'search-tool',
|
|
364
|
+
description: 'Search the web for general information',
|
|
365
365
|
},
|
|
366
366
|
],
|
|
367
|
-
})
|
|
367
|
+
})
|
|
368
368
|
|
|
369
369
|
const inputMessages = [
|
|
370
370
|
createTestMessage({
|
|
371
|
-
content:
|
|
372
|
-
role:
|
|
373
|
-
id:
|
|
371
|
+
content: 'What is the weather like in San Francisco today?',
|
|
372
|
+
role: 'user',
|
|
373
|
+
id: 'input-1',
|
|
374
374
|
}),
|
|
375
|
-
]
|
|
375
|
+
]
|
|
376
376
|
|
|
377
377
|
const output = [
|
|
378
378
|
createTestMessage({
|
|
379
|
-
content:
|
|
380
|
-
role:
|
|
381
|
-
id:
|
|
379
|
+
content: 'Let me check the current weather for you.',
|
|
380
|
+
role: 'assistant',
|
|
381
|
+
id: 'output-1',
|
|
382
382
|
toolInvocations: [
|
|
383
383
|
createToolInvocation({
|
|
384
|
-
toolCallId:
|
|
385
|
-
toolName:
|
|
386
|
-
args: { location:
|
|
387
|
-
result: { temperature:
|
|
388
|
-
state:
|
|
384
|
+
toolCallId: 'call-123',
|
|
385
|
+
toolName: 'weather-tool',
|
|
386
|
+
args: { location: 'San Francisco', date: 'today' },
|
|
387
|
+
result: { temperature: '68°F', condition: 'foggy' },
|
|
388
|
+
state: 'result',
|
|
389
389
|
}),
|
|
390
390
|
],
|
|
391
391
|
}),
|
|
392
|
-
]
|
|
392
|
+
]
|
|
393
393
|
|
|
394
|
-
const run = createAgentTestRun({ inputMessages, output })
|
|
395
|
-
const result = await llmScorer.run(run)
|
|
394
|
+
const run = createAgentTestRun({ inputMessages, output })
|
|
395
|
+
const result = await llmScorer.run(run)
|
|
396
396
|
|
|
397
|
-
console.log(result.score)
|
|
398
|
-
console.log(result.reason)
|
|
397
|
+
console.log(result.score) // 1.0 - appropriate tool usage
|
|
398
|
+
console.log(result.reason) // "The agent correctly used the weather-tool to address the user's request for weather information."
|
|
399
399
|
```
|
|
400
400
|
|
|
401
401
|
### Handling inappropriate tool usage
|
|
@@ -403,34 +403,34 @@ console.log(result.reason); // "The agent correctly used the weather-tool to add
|
|
|
403
403
|
```typescript
|
|
404
404
|
const inputMessages = [
|
|
405
405
|
createTestMessage({
|
|
406
|
-
content:
|
|
407
|
-
role:
|
|
408
|
-
id:
|
|
406
|
+
content: 'What is the weather in Tokyo?',
|
|
407
|
+
role: 'user',
|
|
408
|
+
id: 'input-1',
|
|
409
409
|
}),
|
|
410
|
-
]
|
|
410
|
+
]
|
|
411
411
|
|
|
412
412
|
const inappropriateOutput = [
|
|
413
413
|
createTestMessage({
|
|
414
|
-
content:
|
|
415
|
-
role:
|
|
416
|
-
id:
|
|
414
|
+
content: 'Let me search for that information.',
|
|
415
|
+
role: 'assistant',
|
|
416
|
+
id: 'output-1',
|
|
417
417
|
toolInvocations: [
|
|
418
418
|
createToolInvocation({
|
|
419
|
-
toolCallId:
|
|
420
|
-
toolName:
|
|
421
|
-
args: { query:
|
|
422
|
-
result: { results: [
|
|
423
|
-
state:
|
|
419
|
+
toolCallId: 'call-456',
|
|
420
|
+
toolName: 'search-tool', // Less appropriate than weather-tool
|
|
421
|
+
args: { query: 'Tokyo weather' },
|
|
422
|
+
result: { results: ['Tokyo weather data...'] },
|
|
423
|
+
state: 'result',
|
|
424
424
|
}),
|
|
425
425
|
],
|
|
426
426
|
}),
|
|
427
|
-
]
|
|
427
|
+
]
|
|
428
428
|
|
|
429
|
-
const run = createAgentTestRun({ inputMessages, output: inappropriateOutput })
|
|
430
|
-
const result = await llmScorer.run(run)
|
|
429
|
+
const run = createAgentTestRun({ inputMessages, output: inappropriateOutput })
|
|
430
|
+
const result = await llmScorer.run(run)
|
|
431
431
|
|
|
432
|
-
console.log(result.score)
|
|
433
|
-
console.log(result.reason)
|
|
432
|
+
console.log(result.score) // 0.5 - partially appropriate
|
|
433
|
+
console.log(result.reason) // "The agent used search-tool when weather-tool would have been more appropriate for a direct weather query."
|
|
434
434
|
```
|
|
435
435
|
|
|
436
436
|
### Evaluating clarification requests
|
|
@@ -465,64 +465,64 @@ console.log(result.score); // 1.0 - appropriate to ask for clarification
|
|
|
465
465
|
console.log(result.reason); // "The agent appropriately asked for clarification rather than calling tools with insufficient information."
|
|
466
466
|
```
|
|
467
467
|
|
|
468
|
-
## Comparing
|
|
468
|
+
## Comparing both scorers
|
|
469
469
|
|
|
470
470
|
Here's an example using both scorers on the same data:
|
|
471
471
|
|
|
472
472
|
```typescript
|
|
473
473
|
import {
|
|
474
474
|
createToolCallAccuracyScorerCode as createCodeScorer,
|
|
475
|
-
createToolCallAccuracyScorerLLM as createLLMScorer
|
|
476
|
-
} from
|
|
475
|
+
createToolCallAccuracyScorerLLM as createLLMScorer,
|
|
476
|
+
} from '@mastra/evals/scorers/prebuilt'
|
|
477
477
|
|
|
478
478
|
// Setup both scorers
|
|
479
479
|
const codeScorer = createCodeScorer({
|
|
480
|
-
expectedTool:
|
|
480
|
+
expectedTool: 'weather-tool',
|
|
481
481
|
strictMode: false,
|
|
482
|
-
})
|
|
482
|
+
})
|
|
483
483
|
|
|
484
484
|
const llmScorer = createLLMScorer({
|
|
485
|
-
model:
|
|
485
|
+
model: 'openai/gpt-5.4',
|
|
486
486
|
availableTools: [
|
|
487
|
-
{ name:
|
|
488
|
-
{ name:
|
|
487
|
+
{ name: 'weather-tool', description: 'Get weather information' },
|
|
488
|
+
{ name: 'search-tool', description: 'Search the web' },
|
|
489
489
|
],
|
|
490
|
-
})
|
|
490
|
+
})
|
|
491
491
|
|
|
492
492
|
// Test data
|
|
493
493
|
const run = createAgentTestRun({
|
|
494
494
|
inputMessages: [
|
|
495
495
|
createTestMessage({
|
|
496
|
-
content:
|
|
497
|
-
role:
|
|
498
|
-
id:
|
|
496
|
+
content: 'What is the weather?',
|
|
497
|
+
role: 'user',
|
|
498
|
+
id: 'input-1',
|
|
499
499
|
}),
|
|
500
500
|
],
|
|
501
501
|
output: [
|
|
502
502
|
createTestMessage({
|
|
503
|
-
content:
|
|
504
|
-
role:
|
|
505
|
-
id:
|
|
503
|
+
content: 'Let me find that information.',
|
|
504
|
+
role: 'assistant',
|
|
505
|
+
id: 'output-1',
|
|
506
506
|
toolInvocations: [
|
|
507
507
|
createToolInvocation({
|
|
508
|
-
toolCallId:
|
|
509
|
-
toolName:
|
|
510
|
-
args: { query:
|
|
511
|
-
result: { results: [
|
|
512
|
-
state:
|
|
508
|
+
toolCallId: 'call-1',
|
|
509
|
+
toolName: 'search-tool',
|
|
510
|
+
args: { query: 'weather' },
|
|
511
|
+
result: { results: ['weather data'] },
|
|
512
|
+
state: 'result',
|
|
513
513
|
}),
|
|
514
514
|
],
|
|
515
515
|
}),
|
|
516
516
|
],
|
|
517
|
-
})
|
|
517
|
+
})
|
|
518
518
|
|
|
519
519
|
// Run both scorers
|
|
520
|
-
const codeResult = await codeScorer.run(run)
|
|
521
|
-
const llmResult = await llmScorer.run(run)
|
|
520
|
+
const codeResult = await codeScorer.run(run)
|
|
521
|
+
const llmResult = await llmScorer.run(run)
|
|
522
522
|
|
|
523
|
-
console.log(
|
|
524
|
-
console.log(
|
|
525
|
-
console.log(
|
|
523
|
+
console.log('Code Scorer:', codeResult.score) // 0 - wrong tool
|
|
524
|
+
console.log('LLM Scorer:', llmResult.score) // 0.3 - partially appropriate
|
|
525
|
+
console.log('LLM Reason:', llmResult.reason) // Explains why search-tool is less appropriate
|
|
526
526
|
```
|
|
527
527
|
|
|
528
528
|
## Related
|