@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,11 +1,11 @@
1
- # Tool Call Accuracy Scorers
1
+ # Tool call accuracy scorers
2
2
 
3
3
  Mastra provides two tool call accuracy scorers for evaluating whether an LLM selects the correct tools from available options:
4
4
 
5
5
  1. **Code-based scorer** - Deterministic evaluation using exact tool matching
6
6
  2. **LLM-based scorer** - Semantic evaluation using AI to assess appropriateness
7
7
 
8
- ## Choosing Between Scorers
8
+ ## Choosing between scorers
9
9
 
10
10
  ### Use the Code-Based Scorer When:
11
11
 
@@ -23,17 +23,17 @@ Mastra provides two tool call accuracy scorers for evaluating whether an LLM sel
23
23
  - You need **explanations** for scoring decisions
24
24
  - You're evaluating **production agent behavior**
25
25
 
26
- ## Code-Based Tool Call Accuracy Scorer
26
+ ## Code-based tool call accuracy scorer
27
27
 
28
28
  The `createToolCallAccuracyScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides deterministic binary scoring based on exact tool matching and supports both strict and lenient evaluation modes, as well as tool calling order validation.
29
29
 
30
30
  ### Parameters
31
31
 
32
- **expectedTool:** (`string`): The name of the tool that should be called for the given task. Ignored when expectedToolOrder is provided.
32
+ **expectedTool** (`string`): The name of the tool that should be called for the given task. Ignored when expectedToolOrder is provided.
33
33
 
34
- **strictMode:** (`boolean`): Controls evaluation strictness. For single tool mode: only exact single tool calls accepted. For order checking mode: tools must match exactly with no extra tools allowed.
34
+ **strictMode** (`boolean`): Controls evaluation strictness. For single tool mode: only exact single tool calls accepted. For order checking mode: tools must match exactly with no extra tools allowed.
35
35
 
36
- **expectedToolOrder:** (`string[]`): Array of tool names in the expected calling order. When provided, enables order checking mode and ignores expectedTool parameter.
36
+ **expectedToolOrder** (`string[]`): Array of tool names in the expected calling order. When provided, enables order checking mode and ignores expectedTool parameter.
37
37
 
38
38
  This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
39
39
 
@@ -43,7 +43,7 @@ The code-based scorer operates in two distinct modes:
43
43
 
44
44
  #### Single Tool Mode
45
45
 
46
- When `expectedToolOrder` is not provided, the scorer evaluates single tool selection:
46
+ When `expectedToolOrder` isn't provided, the scorer evaluates single tool selection:
47
47
 
48
48
  - **Standard Mode (strictMode: false)**: Returns `1` if the expected tool is called, regardless of other tools
49
49
  - **Strict Mode (strictMode: true)**: Returns `1` only if exactly one tool is called and it matches the expected tool
@@ -55,7 +55,7 @@ When `expectedToolOrder` is provided, the scorer validates tool calling sequence
55
55
  - **Strict Order (strictMode: true)**: Tools must be called in exactly the specified order with no extra tools
56
56
  - **Flexible Order (strictMode: false)**: Expected tools must appear in correct relative order (extra tools allowed)
57
57
 
58
- ## Code-Based Scoring Details
58
+ ## Code-based scoring details
59
59
 
60
60
  - **Binary scores**: Always returns 0 or 1
61
61
  - **Deterministic**: Same input always produces same output
@@ -66,22 +66,22 @@ When `expectedToolOrder` is provided, the scorer validates tool calling sequence
66
66
  ```typescript
67
67
  // Standard mode - passes if expected tool is called
68
68
  const lenientScorer = createCodeScorer({
69
- expectedTool: "search-tool",
69
+ expectedTool: 'search-tool',
70
70
  strictMode: false,
71
- });
71
+ })
72
72
 
73
73
  // Strict mode - only passes if exactly one tool is called
74
74
  const strictScorer = createCodeScorer({
75
- expectedTool: "search-tool",
75
+ expectedTool: 'search-tool',
76
76
  strictMode: true,
77
- });
77
+ })
78
78
 
79
79
  // Order checking with strict mode
80
80
  const strictOrderScorer = createCodeScorer({
81
- expectedTool: "step1-tool",
82
- expectedToolOrder: ["step1-tool", "step2-tool", "step3-tool"],
81
+ expectedTool: 'step1-tool',
82
+ expectedToolOrder: ['step1-tool', 'step2-tool', 'step3-tool'],
83
83
  strictMode: true, // no extra tools allowed
84
- });
84
+ })
85
85
  ```
86
86
 
87
87
  ### Code-Based Scorer Results
@@ -103,7 +103,7 @@ const strictOrderScorer = createCodeScorer({
103
103
  }
104
104
  ```
105
105
 
106
- ## Code-Based Scorer Examples
106
+ ## Code-based scorer examples
107
107
 
108
108
  The code-based scorer provides deterministic, binary scoring (0 or 1) based on exact tool matching.
109
109
 
@@ -111,40 +111,40 @@ The code-based scorer provides deterministic, binary scoring (0 or 1) based on e
111
111
 
112
112
  ```typescript
113
113
  const scorer = createToolCallAccuracyScorerCode({
114
- expectedTool: "weather-tool",
115
- });
114
+ expectedTool: 'weather-tool',
115
+ })
116
116
 
117
117
  // Simulate LLM input and output with tool call
118
118
  const inputMessages = [
119
119
  createTestMessage({
120
- content: "What is the weather like in New York today?",
121
- role: "user",
122
- id: "input-1",
120
+ content: 'What is the weather like in New York today?',
121
+ role: 'user',
122
+ id: 'input-1',
123
123
  }),
124
- ];
124
+ ]
125
125
 
126
126
  const output = [
127
127
  createTestMessage({
128
- content: "Let me check the weather for you.",
129
- role: "assistant",
130
- id: "output-1",
128
+ content: 'Let me check the weather for you.',
129
+ role: 'assistant',
130
+ id: 'output-1',
131
131
  toolInvocations: [
132
132
  createToolInvocation({
133
- toolCallId: "call-123",
134
- toolName: "weather-tool",
135
- args: { location: "New York" },
136
- result: { temperature: "72°F", condition: "sunny" },
137
- state: "result",
133
+ toolCallId: 'call-123',
134
+ toolName: 'weather-tool',
135
+ args: { location: 'New York' },
136
+ result: { temperature: '72°F', condition: 'sunny' },
137
+ state: 'result',
138
138
  }),
139
139
  ],
140
140
  }),
141
- ];
141
+ ]
142
142
 
143
- const run = createAgentTestRun({ inputMessages, output });
144
- const result = await scorer.run(run);
143
+ const run = createAgentTestRun({ inputMessages, output })
144
+ const result = await scorer.run(run)
145
145
 
146
- console.log(result.score); // 1
147
- console.log(result.preprocessStepResult?.correctToolCalled); // true
146
+ console.log(result.score) // 1
147
+ console.log(result.preprocessStepResult?.correctToolCalled) // true
148
148
  ```
149
149
 
150
150
  ### Strict mode evaluation
@@ -153,37 +153,37 @@ Only passes if exactly one tool is called:
153
153
 
154
154
  ```typescript
155
155
  const strictScorer = createToolCallAccuracyScorerCode({
156
- expectedTool: "weather-tool",
156
+ expectedTool: 'weather-tool',
157
157
  strictMode: true,
158
- });
158
+ })
159
159
 
160
160
  // Multiple tools called - fails in strict mode
161
161
  const output = [
162
162
  createTestMessage({
163
- content: "Let me help you with that.",
164
- role: "assistant",
165
- id: "output-1",
163
+ content: 'Let me help you with that.',
164
+ role: 'assistant',
165
+ id: 'output-1',
166
166
  toolInvocations: [
167
167
  createToolInvocation({
168
- toolCallId: "call-1",
169
- toolName: "search-tool",
168
+ toolCallId: 'call-1',
169
+ toolName: 'search-tool',
170
170
  args: {},
171
171
  result: {},
172
- state: "result",
172
+ state: 'result',
173
173
  }),
174
174
  createToolInvocation({
175
- toolCallId: "call-2",
176
- toolName: "weather-tool",
177
- args: { location: "New York" },
178
- result: { temperature: "20°C" },
179
- state: "result",
175
+ toolCallId: 'call-2',
176
+ toolName: 'weather-tool',
177
+ args: { location: 'New York' },
178
+ result: { temperature: '20°C' },
179
+ state: 'result',
180
180
  }),
181
181
  ],
182
182
  }),
183
- ];
183
+ ]
184
184
 
185
- const result = await strictScorer.run(run);
186
- console.log(result.score); // 0 - fails because multiple tools were called
185
+ const result = await strictScorer.run(run)
186
+ console.log(result.score) // 0 - fails because multiple tools were called
187
187
  ```
188
188
 
189
189
  ### Tool order validation
@@ -192,37 +192,37 @@ Validates that tools are called in a specific sequence:
192
192
 
193
193
  ```typescript
194
194
  const orderScorer = createToolCallAccuracyScorerCode({
195
- expectedTool: "auth-tool", // ignored when order is specified
196
- expectedToolOrder: ["auth-tool", "fetch-tool"],
195
+ expectedTool: 'auth-tool', // ignored when order is specified
196
+ expectedToolOrder: ['auth-tool', 'fetch-tool'],
197
197
  strictMode: true, // no extra tools allowed
198
- });
198
+ })
199
199
 
200
200
  const output = [
201
201
  createTestMessage({
202
- content: "I will authenticate and fetch the data.",
203
- role: "assistant",
204
- id: "output-1",
202
+ content: 'I will authenticate and fetch the data.',
203
+ role: 'assistant',
204
+ id: 'output-1',
205
205
  toolInvocations: [
206
206
  createToolInvocation({
207
- toolCallId: "call-1",
208
- toolName: "auth-tool",
209
- args: { token: "abc123" },
207
+ toolCallId: 'call-1',
208
+ toolName: 'auth-tool',
209
+ args: { token: 'abc123' },
210
210
  result: { authenticated: true },
211
- state: "result",
211
+ state: 'result',
212
212
  }),
213
213
  createToolInvocation({
214
- toolCallId: "call-2",
215
- toolName: "fetch-tool",
216
- args: { endpoint: "/data" },
217
- result: { data: ["item1"] },
218
- state: "result",
214
+ toolCallId: 'call-2',
215
+ toolName: 'fetch-tool',
216
+ args: { endpoint: '/data' },
217
+ result: { data: ['item1'] },
218
+ state: 'result',
219
219
  }),
220
220
  ],
221
221
  }),
222
- ];
222
+ ]
223
223
 
224
- const result = await orderScorer.run(run);
225
- console.log(result.score); // 1 - correct order
224
+ const result = await orderScorer.run(run)
225
+ console.log(result.score) // 1 - correct order
226
226
  ```
227
227
 
228
228
  ### Flexible order mode
@@ -231,55 +231,55 @@ Allows extra tools as long as expected tools maintain relative order:
231
231
 
232
232
  ```typescript
233
233
  const flexibleOrderScorer = createToolCallAccuracyScorerCode({
234
- expectedTool: "auth-tool",
235
- expectedToolOrder: ["auth-tool", "fetch-tool"],
234
+ expectedTool: 'auth-tool',
235
+ expectedToolOrder: ['auth-tool', 'fetch-tool'],
236
236
  strictMode: false, // allows extra tools
237
- });
237
+ })
238
238
 
239
239
  const output = [
240
240
  createTestMessage({
241
- content: "Performing comprehensive operation.",
242
- role: "assistant",
243
- id: "output-1",
241
+ content: 'Performing comprehensive operation.',
242
+ role: 'assistant',
243
+ id: 'output-1',
244
244
  toolInvocations: [
245
245
  createToolInvocation({
246
- toolCallId: "call-1",
247
- toolName: "auth-tool",
248
- args: { token: "abc123" },
246
+ toolCallId: 'call-1',
247
+ toolName: 'auth-tool',
248
+ args: { token: 'abc123' },
249
249
  result: { authenticated: true },
250
- state: "result",
250
+ state: 'result',
251
251
  }),
252
252
  createToolInvocation({
253
- toolCallId: "call-2",
254
- toolName: "log-tool", // Extra tool - OK in flexible mode
255
- args: { message: "Starting fetch" },
253
+ toolCallId: 'call-2',
254
+ toolName: 'log-tool', // Extra tool - OK in flexible mode
255
+ args: { message: 'Starting fetch' },
256
256
  result: { logged: true },
257
- state: "result",
257
+ state: 'result',
258
258
  }),
259
259
  createToolInvocation({
260
- toolCallId: "call-3",
261
- toolName: "fetch-tool",
262
- args: { endpoint: "/data" },
263
- result: { data: ["item1"] },
264
- state: "result",
260
+ toolCallId: 'call-3',
261
+ toolName: 'fetch-tool',
262
+ args: { endpoint: '/data' },
263
+ result: { data: ['item1'] },
264
+ state: 'result',
265
265
  }),
266
266
  ],
267
267
  }),
268
- ];
268
+ ]
269
269
 
270
- const result = await flexibleOrderScorer.run(run);
271
- console.log(result.score); // 1 - auth-tool comes before fetch-tool
270
+ const result = await flexibleOrderScorer.run(run)
271
+ console.log(result.score) // 1 - auth-tool comes before fetch-tool
272
272
  ```
273
273
 
274
- ## LLM-Based Tool Call Accuracy Scorer
274
+ ## LLM-based tool call accuracy scorer
275
275
 
276
276
  The `createToolCallAccuracyScorerLLM()` function from `@mastra/evals/scorers/prebuilt` uses an LLM to evaluate whether the tools called by an agent are appropriate for the given user request, providing semantic evaluation rather than exact matching.
277
277
 
278
278
  ### Parameters
279
279
 
280
- **model:** (`MastraModelConfig`): The LLM model to use for evaluating tool appropriateness
280
+ **model** (`MastraModelConfig`): The LLM model to use for evaluating tool appropriateness
281
281
 
282
- **availableTools:** (`Array<{name: string, description: string}>`): List of available tools with their descriptions for context
282
+ **availableTools** (`Array<{name: string, description: string}>`): List of available tools with their descriptions for context
283
283
 
284
284
  ### Features
285
285
 
@@ -298,7 +298,7 @@ The LLM-based scorer provides:
298
298
  3. **Generate Score**: Calculates score based on appropriate vs total tool calls
299
299
  4. **Generate Reasoning**: Provides human-readable explanation
300
300
 
301
- ## LLM-Based Scoring Details
301
+ ## LLM-based scoring details
302
302
 
303
303
  - **Fractional scores**: Returns values between 0.0 and 1.0
304
304
  - **Context-aware**: Considers user intent and appropriateness
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
309
309
  ```typescript
310
310
  // Basic configuration
311
311
  const basicLLMScorer = createLLMScorer({
312
- model: 'openai/gpt-5.1',
312
+ model: 'openai/gpt-5.4',
313
313
  availableTools: [
314
314
  { name: 'tool1', description: 'Description 1' },
315
315
  { name: 'tool2', description: 'Description 2' }
@@ -341,7 +341,7 @@ const customModelScorer = createLLMScorer({
341
341
  }
342
342
  ```
343
343
 
344
- ## LLM-Based Scorer Examples
344
+ ## LLM-based scorer examples
345
345
 
346
346
  The LLM-based scorer uses AI to evaluate whether tool selections are appropriate for the user's request.
347
347
 
@@ -349,53 +349,53 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
349
349
 
350
350
  ```typescript
351
351
  const llmScorer = createToolCallAccuracyScorerLLM({
352
- model: "openai/gpt-5.1",
352
+ model: 'openai/gpt-5.4',
353
353
  availableTools: [
354
354
  {
355
- name: "weather-tool",
356
- description: "Get current weather information for any location",
355
+ name: 'weather-tool',
356
+ description: 'Get current weather information for any location',
357
357
  },
358
358
  {
359
- name: "calendar-tool",
360
- description: "Check calendar events and scheduling",
359
+ name: 'calendar-tool',
360
+ description: 'Check calendar events and scheduling',
361
361
  },
362
362
  {
363
- name: "search-tool",
364
- description: "Search the web for general information",
363
+ name: 'search-tool',
364
+ description: 'Search the web for general information',
365
365
  },
366
366
  ],
367
- });
367
+ })
368
368
 
369
369
  const inputMessages = [
370
370
  createTestMessage({
371
- content: "What is the weather like in San Francisco today?",
372
- role: "user",
373
- id: "input-1",
371
+ content: 'What is the weather like in San Francisco today?',
372
+ role: 'user',
373
+ id: 'input-1',
374
374
  }),
375
- ];
375
+ ]
376
376
 
377
377
  const output = [
378
378
  createTestMessage({
379
- content: "Let me check the current weather for you.",
380
- role: "assistant",
381
- id: "output-1",
379
+ content: 'Let me check the current weather for you.',
380
+ role: 'assistant',
381
+ id: 'output-1',
382
382
  toolInvocations: [
383
383
  createToolInvocation({
384
- toolCallId: "call-123",
385
- toolName: "weather-tool",
386
- args: { location: "San Francisco", date: "today" },
387
- result: { temperature: "68°F", condition: "foggy" },
388
- state: "result",
384
+ toolCallId: 'call-123',
385
+ toolName: 'weather-tool',
386
+ args: { location: 'San Francisco', date: 'today' },
387
+ result: { temperature: '68°F', condition: 'foggy' },
388
+ state: 'result',
389
389
  }),
390
390
  ],
391
391
  }),
392
- ];
392
+ ]
393
393
 
394
- const run = createAgentTestRun({ inputMessages, output });
395
- const result = await llmScorer.run(run);
394
+ const run = createAgentTestRun({ inputMessages, output })
395
+ const result = await llmScorer.run(run)
396
396
 
397
- console.log(result.score); // 1.0 - appropriate tool usage
398
- console.log(result.reason); // "The agent correctly used the weather-tool to address the user's request for weather information."
397
+ console.log(result.score) // 1.0 - appropriate tool usage
398
+ console.log(result.reason) // "The agent correctly used the weather-tool to address the user's request for weather information."
399
399
  ```
400
400
 
401
401
  ### Handling inappropriate tool usage
@@ -403,34 +403,34 @@ console.log(result.reason); // "The agent correctly used the weather-tool to add
403
403
  ```typescript
404
404
  const inputMessages = [
405
405
  createTestMessage({
406
- content: "What is the weather in Tokyo?",
407
- role: "user",
408
- id: "input-1",
406
+ content: 'What is the weather in Tokyo?',
407
+ role: 'user',
408
+ id: 'input-1',
409
409
  }),
410
- ];
410
+ ]
411
411
 
412
412
  const inappropriateOutput = [
413
413
  createTestMessage({
414
- content: "Let me search for that information.",
415
- role: "assistant",
416
- id: "output-1",
414
+ content: 'Let me search for that information.',
415
+ role: 'assistant',
416
+ id: 'output-1',
417
417
  toolInvocations: [
418
418
  createToolInvocation({
419
- toolCallId: "call-456",
420
- toolName: "search-tool", // Less appropriate than weather-tool
421
- args: { query: "Tokyo weather" },
422
- result: { results: ["Tokyo weather data..."] },
423
- state: "result",
419
+ toolCallId: 'call-456',
420
+ toolName: 'search-tool', // Less appropriate than weather-tool
421
+ args: { query: 'Tokyo weather' },
422
+ result: { results: ['Tokyo weather data...'] },
423
+ state: 'result',
424
424
  }),
425
425
  ],
426
426
  }),
427
- ];
427
+ ]
428
428
 
429
- const run = createAgentTestRun({ inputMessages, output: inappropriateOutput });
430
- const result = await llmScorer.run(run);
429
+ const run = createAgentTestRun({ inputMessages, output: inappropriateOutput })
430
+ const result = await llmScorer.run(run)
431
431
 
432
- console.log(result.score); // 0.5 - partially appropriate
433
- console.log(result.reason); // "The agent used search-tool when weather-tool would have been more appropriate for a direct weather query."
432
+ console.log(result.score) // 0.5 - partially appropriate
433
+ console.log(result.reason) // "The agent used search-tool when weather-tool would have been more appropriate for a direct weather query."
434
434
  ```
435
435
 
436
436
  ### Evaluating clarification requests
@@ -465,64 +465,64 @@ console.log(result.score); // 1.0 - appropriate to ask for clarification
465
465
  console.log(result.reason); // "The agent appropriately asked for clarification rather than calling tools with insufficient information."
466
466
  ```
467
467
 
468
- ## Comparing Both Scorers
468
+ ## Comparing both scorers
469
469
 
470
470
  Here's an example using both scorers on the same data:
471
471
 
472
472
  ```typescript
473
473
  import {
474
474
  createToolCallAccuracyScorerCode as createCodeScorer,
475
- createToolCallAccuracyScorerLLM as createLLMScorer
476
- } from "@mastra/evals/scorers/prebuilt";
475
+ createToolCallAccuracyScorerLLM as createLLMScorer,
476
+ } from '@mastra/evals/scorers/prebuilt'
477
477
 
478
478
  // Setup both scorers
479
479
  const codeScorer = createCodeScorer({
480
- expectedTool: "weather-tool",
480
+ expectedTool: 'weather-tool',
481
481
  strictMode: false,
482
- });
482
+ })
483
483
 
484
484
  const llmScorer = createLLMScorer({
485
- model: "openai/gpt-5.1",
485
+ model: 'openai/gpt-5.4',
486
486
  availableTools: [
487
- { name: "weather-tool", description: "Get weather information" },
488
- { name: "search-tool", description: "Search the web" },
487
+ { name: 'weather-tool', description: 'Get weather information' },
488
+ { name: 'search-tool', description: 'Search the web' },
489
489
  ],
490
- });
490
+ })
491
491
 
492
492
  // Test data
493
493
  const run = createAgentTestRun({
494
494
  inputMessages: [
495
495
  createTestMessage({
496
- content: "What is the weather?",
497
- role: "user",
498
- id: "input-1",
496
+ content: 'What is the weather?',
497
+ role: 'user',
498
+ id: 'input-1',
499
499
  }),
500
500
  ],
501
501
  output: [
502
502
  createTestMessage({
503
- content: "Let me find that information.",
504
- role: "assistant",
505
- id: "output-1",
503
+ content: 'Let me find that information.',
504
+ role: 'assistant',
505
+ id: 'output-1',
506
506
  toolInvocations: [
507
507
  createToolInvocation({
508
- toolCallId: "call-1",
509
- toolName: "search-tool",
510
- args: { query: "weather" },
511
- result: { results: ["weather data"] },
512
- state: "result",
508
+ toolCallId: 'call-1',
509
+ toolName: 'search-tool',
510
+ args: { query: 'weather' },
511
+ result: { results: ['weather data'] },
512
+ state: 'result',
513
513
  }),
514
514
  ],
515
515
  }),
516
516
  ],
517
- });
517
+ })
518
518
 
519
519
  // Run both scorers
520
- const codeResult = await codeScorer.run(run);
521
- const llmResult = await llmScorer.run(run);
520
+ const codeResult = await codeScorer.run(run)
521
+ const llmResult = await llmScorer.run(run)
522
522
 
523
- console.log("Code Scorer:", codeResult.score); // 0 - wrong tool
524
- console.log("LLM Scorer:", llmResult.score); // 0.3 - partially appropriate
525
- console.log("LLM Reason:", llmResult.reason); // Explains why search-tool is less appropriate
523
+ console.log('Code Scorer:', codeResult.score) // 0 - wrong tool
524
+ console.log('LLM Scorer:', llmResult.score) // 0.3 - partially appropriate
525
+ console.log('LLM Reason:', llmResult.reason) // Explains why search-tool is less appropriate
526
526
  ```
527
527
 
528
528
  ## Related