@mastra/mcp-docs-server 0.13.30-alpha.0 → 0.13.30-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +15 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +17 -17
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +31 -31
- package/.docs/organized/changelogs/%40mastra%2Freact.md +20 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +15 -15
- package/.docs/organized/changelogs/create-mastra.md +19 -19
- package/.docs/organized/changelogs/mastra.md +27 -27
- package/.docs/organized/code-examples/agent.md +0 -1
- package/.docs/organized/code-examples/agui.md +2 -2
- package/.docs/organized/code-examples/client-side-tools.md +2 -2
- package/.docs/raw/agents/adding-voice.mdx +118 -25
- package/.docs/raw/agents/agent-memory.mdx +73 -89
- package/.docs/raw/agents/guardrails.mdx +1 -1
- package/.docs/raw/agents/overview.mdx +39 -7
- package/.docs/raw/agents/using-tools.mdx +95 -0
- package/.docs/raw/deployment/overview.mdx +9 -11
- package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +1 -1
- package/.docs/raw/frameworks/servers/express.mdx +2 -2
- package/.docs/raw/getting-started/installation.mdx +34 -85
- package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
- package/.docs/raw/index.mdx +49 -14
- package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
- package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
- package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
- package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
- package/.docs/raw/reference/scorers/bias.mdx +107 -6
- package/.docs/raw/reference/scorers/completeness.mdx +131 -8
- package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
- package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
- package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
- package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
- package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
- package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
- package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
- package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
- package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
- package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
- package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
- package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
- package/.docs/raw/reference/workflows/workflow.mdx +33 -0
- package/.docs/raw/scorers/custom-scorers.mdx +244 -3
- package/.docs/raw/scorers/overview.mdx +8 -38
- package/.docs/raw/server-db/middleware.mdx +5 -2
- package/.docs/raw/server-db/runtime-context.mdx +178 -0
- package/.docs/raw/streaming/workflow-streaming.mdx +5 -1
- package/.docs/raw/tools-mcp/overview.mdx +25 -7
- package/.docs/raw/workflows/overview.mdx +28 -1
- package/CHANGELOG.md +14 -0
- package/package.json +4 -4
- package/.docs/raw/agents/runtime-context.mdx +0 -106
- package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
- package/.docs/raw/getting-started/model-providers.mdx +0 -63
- package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
- /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
- /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
package/.docs/raw/reference/scorers/tone-consistency.mdx

@@ -37,6 +37,22 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
 ]}
 />
 
+`.run()` returns a result in the following shape:
+
+```typescript
+{
+  runId: string,
+  analyzeStepResult: {
+    responseSentiment?: number,
+    referenceSentiment?: number,
+    difference?: number,
+    avgSentiment?: number,
+    sentimentVariance?: number,
+  },
+  score: number
+}
+```
+
 ## Scoring Details
 
 The scorer evaluates sentiment consistency through tone pattern analysis and mode-specific scoring.
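A note on the shape added above: every field of `analyzeStepResult` is optional because which metrics are present depends on the mode the scorer ran in. A minimal sketch of narrowing on them (the `ToneResult` alias is hypothetical, mirroring the documented shape; it is not exported by the package):

```typescript
// Hypothetical alias mirroring the documented `.run()` result shape.
type ToneResult = {
  runId: string;
  analyzeStepResult: {
    responseSentiment?: number;
    referenceSentiment?: number;
    difference?: number;
    avgSentiment?: number;
    sentimentVariance?: number;
  };
  score: number;
};

// Comparison mode populates responseSentiment/referenceSentiment/difference;
// stability mode populates avgSentiment/sentimentVariance.
function describeToneResult(result: ToneResult): string {
  const metrics = result.analyzeStepResult;
  if (metrics.difference !== undefined) {
    return `comparison mode, sentiment difference ${metrics.difference.toFixed(3)}`;
  }
  if (metrics.sentimentVariance !== undefined) {
    return `stability mode, sentiment variance ${metrics.sentimentVariance.toFixed(3)}`;
  }
  return 'no tone metrics available';
}
```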
@@ -69,6 +85,123 @@ Final score: `mode_specific_score * scale`
 - 0.1-0.3: Poor consistency with major tone changes
 - 0.0: No consistency - completely different tones
 
+### analyzeStepResult
+Object with tone metrics:
+- **responseSentiment**: Sentiment score for the response (comparison mode).
+- **referenceSentiment**: Sentiment score for the input/reference (comparison mode).
+- **difference**: Absolute difference between sentiment scores (comparison mode).
+- **avgSentiment**: Average sentiment across sentences (stability mode).
+- **sentimentVariance**: Variance of sentiment across sentences (stability mode).
+
+## Examples
+
+### Positive tone example
+
+In this example, the texts exhibit a similar positive sentiment. The scorer measures the consistency between the tones, resulting in a high score.
+
+```typescript filename="src/example-positive-tone.ts" showLineNumbers copy
+import { createToneScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createToneScorer();
+
+const input = 'This product is fantastic and amazing!';
+const output = 'The product is excellent and wonderful!';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### Positive tone output
+
+The scorer returns a high score reflecting strong sentiment alignment. The `analyzeStepResult` field provides sentiment values and the difference between them.
+
+```typescript
+{
+  score: 0.8333333333333335,
+  analyzeStepResult: {
+    responseSentiment: 1.3333333333333333,
+    referenceSentiment: 1.1666666666666667,
+    difference: 0.16666666666666652,
+  },
+}
+```
+
+### Stable tone example
+
+In this example, the text’s internal tone consistency is analyzed by passing an empty response. This signals the scorer to evaluate sentiment stability within the single input text, resulting in a score reflecting how uniform the tone is throughout.
+
+```typescript filename="src/example-stable-tone.ts" showLineNumbers copy
+import { createToneScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createToneScorer();
+
+const input = 'Great service! Friendly staff. Perfect atmosphere.';
+const output = '';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### Stable tone output
+
+The scorer returns a high score indicating consistent sentiment throughout the input text. The `analyzeStepResult` field includes the average sentiment and sentiment variance, reflecting tone stability.
+
+```typescript
+{
+  score: 0.9444444444444444,
+  analyzeStepResult: {
+    avgSentiment: 1.3333333333333333,
+    sentimentVariance: 0.05555555555555556,
+  },
+}
+```
+
+### Mixed tone example
+
+In this example, the input and response have different emotional tones. The scorer picks up on these variations and gives a lower consistency score.
+
+```typescript filename="src/example-mixed-tone.ts" showLineNumbers copy
+import { createToneScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createToneScorer();
+
+const input = 'The interface is frustrating and confusing, though it has potential.';
+const output = 'The design shows promise but needs significant improvements to be usable.';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### Mixed tone output
+
+The scorer returns a low score due to the noticeable differences in emotional tone. The `analyzeStepResult` field highlights the sentiment values and the degree of variation between them.
+
+```typescript
+{
+  score: 0.4181818181818182,
+  analyzeStepResult: {
+    responseSentiment: -0.4,
+    referenceSentiment: 0.18181818181818182,
+    difference: 0.5818181818181818,
+  },
+}
+```
+
 ## Related
 
 - [Content Similarity Scorer](./content-similarity)
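The three sample outputs added in this hunk are internally consistent with the `mode_specific_score * scale` formula from the hunk header: with the default scale, each comparison-mode score equals `1 - difference`, and the stability-mode score equals `1 - sentimentVariance`. A quick sketch checking that arithmetic (inferred from the sample numbers, not taken from the library's source):

```typescript
// Inferred from the documented sample outputs, not the library's source:
// comparison mode appears to score 1 - difference, stability mode
// 1 - sentimentVariance, each multiplied by the scale (default 1).
const comparisonScore = (difference: number, scale = 1) => (1 - difference) * scale;
const stabilityScore = (sentimentVariance: number, scale = 1) => (1 - sentimentVariance) * scale;

console.log(comparisonScore(0.16666666666666652)); // ≈ 0.8333 (positive tone example)
console.log(stabilityScore(0.05555555555555556));  // ≈ 0.9444 (stable tone example)
console.log(comparisonScore(0.5818181818181818));  // ≈ 0.4182 (mixed tone example)
```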
package/.docs/raw/reference/scorers/tool-call-accuracy.mdx

@@ -10,7 +10,23 @@ Mastra provides two tool call accuracy scorers for evaluating whether an LLM sel
 1. **Code-based scorer** - Deterministic evaluation using exact tool matching
 2. **LLM-based scorer** - Semantic evaluation using AI to assess appropriateness
 
-
+## Choosing Between Scorers
+
+### Use the Code-Based Scorer When:
+
+- You need **deterministic, reproducible** results
+- You want to test **exact tool matching**
+- You need to validate **specific tool sequences**
+- Speed and cost are priorities (no LLM calls)
+- You're running automated tests
+
+### Use the LLM-Based Scorer When:
+
+- You need **semantic understanding** of appropriateness
+- Tool selection depends on **context and intent**
+- You want to handle **edge cases** like clarification requests
+- You need **explanations** for scoring decisions
+- You're evaluating **production agent behavior**
 
 ## Code-Based Tool Call Accuracy Scorer
 
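A sketch of the "automated tests" bullet above, wiring the code-based scorer into a vitest test. The vitest usage is illustrative; the scorer import path appears verbatim later in this diff, while the path for the `createAgentTestRun`/`createUIMessage`/`createToolInvocation` helpers used throughout these examples is not shown in this diff and is an assumption:

```typescript
import { describe, expect, it } from 'vitest';
import { createToolCallAccuracyScorerCode } from '@mastra/evals/scorers/code';
// Helper import path is an assumption; the diff uses these helpers without
// showing where they come from.
import { createAgentTestRun, createToolInvocation, createUIMessage } from '@mastra/evals/scorers/utils';

describe('weather agent tool selection', () => {
  it('calls weather-tool for a weather question', async () => {
    const scorer = createToolCallAccuracyScorerCode({
      expectedTool: 'weather-tool',
      strictMode: false,
    });

    // Captured agent output with its tool invocations.
    const run = createAgentTestRun({
      inputMessages: [
        createUIMessage({ content: 'Weather in Oslo?', role: 'user', id: 'input-1' }),
      ],
      output: [
        createUIMessage({
          content: 'Checking the weather.',
          role: 'assistant',
          id: 'output-1',
          toolInvocations: [
            createToolInvocation({
              toolCallId: 'call-1',
              toolName: 'weather-tool',
              args: { location: 'Oslo' },
              result: { temperature: '8°C' },
              state: 'result',
            }),
          ],
        }),
      ],
    });

    // Deterministic binary score, cheap enough to gate CI.
    expect((await scorer.run(run)).score).toBe(1);
  });
});
```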
@@ -62,28 +78,220 @@ When `expectedToolOrder` is provided, the scorer validates tool calling sequence
 - **Strict Order (strictMode: true)**: Tools must be called in exactly the specified order with no extra tools
 - **Flexible Order (strictMode: false)**: Expected tools must appear in correct relative order (extra tools allowed)
 
-
+## Code-Based Scoring Details
+
+- **Binary scores**: Always returns 0 or 1
+- **Deterministic**: Same input always produces same output
+- **Fast**: No external API calls
+
+### Code-Based Scorer Options
+
+```typescript showLineNumbers copy
+// Standard mode - passes if expected tool is called
+const lenientScorer = createCodeScorer({
+  expectedTool: 'search-tool',
+  strictMode: false
+});
+
+// Strict mode - only passes if exactly one tool is called
+const strictScorer = createCodeScorer({
+  expectedTool: 'search-tool',
+  strictMode: true
+});
+
+// Order checking with strict mode
+const strictOrderScorer = createCodeScorer({
+  expectedTool: 'step1-tool',
+  expectedToolOrder: ['step1-tool', 'step2-tool', 'step3-tool'],
+  strictMode: true // no extra tools allowed
+});
+```
+
+### Code-Based Scorer Results
 
 ```typescript
-
+{
+  runId: string,
+  preprocessStepResult: {
+    expectedTool: string,
+    actualTools: string[],
+    strictMode: boolean,
+    expectedToolOrder?: string[],
+    hasToolCalls: boolean,
+    correctToolCalled: boolean,
+    correctOrderCalled: boolean | null,
+    toolCallInfos: ToolCallInfo[]
+  },
+  score: number // Always 0 or 1
+}
+```
+
+## Code-Based Scorer Examples
+
+The code-based scorer provides deterministic, binary scoring (0 or 1) based on exact tool matching.
 
-
-
-
+### Correct tool selection
+
+```typescript filename="src/example-correct-tool.ts" showLineNumbers copy
+const scorer = createToolCallAccuracyScorerCode({
+  expectedTool: 'weather-tool'
 });
 
-//
-const
-
+// Simulate LLM input and output with tool call
+const inputMessages = [
+  createUIMessage({
+    content: 'What is the weather like in New York today?',
+    role: 'user',
+    id: 'input-1'
+  })
+];
+
+const output = [
+  createUIMessage({
+    content: 'Let me check the weather for you.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-123',
+        toolName: 'weather-tool',
+        args: { location: 'New York' },
+        result: { temperature: '72°F', condition: 'sunny' },
+        state: 'result'
+      })
+    ]
+  })
+];
+
+const run = createAgentTestRun({ inputMessages, output });
+const result = await scorer.run(run);
+
+console.log(result.score); // 1
+console.log(result.preprocessStepResult?.correctToolCalled); // true
+```
+
+### Strict mode evaluation
+
+Only passes if exactly one tool is called:
+
+```typescript filename="src/example-strict-mode.ts" showLineNumbers copy
+const strictScorer = createToolCallAccuracyScorerCode({
+  expectedTool: 'weather-tool',
   strictMode: true
 });
 
-//
+// Multiple tools called - fails in strict mode
+const output = [
+  createUIMessage({
+    content: 'Let me help you with that.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-1',
+        toolName: 'search-tool',
+        args: {},
+        result: {},
+        state: 'result',
+      }),
+      createToolInvocation({
+        toolCallId: 'call-2',
+        toolName: 'weather-tool',
+        args: { location: 'New York' },
+        result: { temperature: '20°C' },
+        state: 'result',
+      })
+    ]
+  })
+];
+
+const result = await strictScorer.run(run);
+console.log(result.score); // 0 - fails because multiple tools were called
+```
+
+### Tool order validation
+
+Validates that tools are called in a specific sequence:
+
+```typescript filename="src/example-order-validation.ts" showLineNumbers copy
 const orderScorer = createToolCallAccuracyScorerCode({
-  expectedTool: '
-  expectedToolOrder: ['
-  strictMode: true //
+  expectedTool: 'auth-tool', // ignored when order is specified
+  expectedToolOrder: ['auth-tool', 'fetch-tool'],
+  strictMode: true // no extra tools allowed
 });
+
+const output = [
+  createUIMessage({
+    content: 'I will authenticate and fetch the data.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-1',
+        toolName: 'auth-tool',
+        args: { token: 'abc123' },
+        result: { authenticated: true },
+        state: 'result'
+      }),
+      createToolInvocation({
+        toolCallId: 'call-2',
+        toolName: 'fetch-tool',
+        args: { endpoint: '/data' },
+        result: { data: ['item1'] },
+        state: 'result'
+      })
+    ]
+  })
+];
+
+const result = await orderScorer.run(run);
+console.log(result.score); // 1 - correct order
+```
+
+### Flexible order mode
+
+Allows extra tools as long as expected tools maintain relative order:
+
+```typescript filename="src/example-flexible-order.ts" showLineNumbers copy
+const flexibleOrderScorer = createToolCallAccuracyScorerCode({
+  expectedTool: 'auth-tool',
+  expectedToolOrder: ['auth-tool', 'fetch-tool'],
+  strictMode: false // allows extra tools
+});
+
+const output = [
+  createUIMessage({
+    content: 'Performing comprehensive operation.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-1',
+        toolName: 'auth-tool',
+        args: { token: 'abc123' },
+        result: { authenticated: true },
+        state: 'result'
+      }),
+      createToolInvocation({
+        toolCallId: 'call-2',
+        toolName: 'log-tool', // Extra tool - OK in flexible mode
+        args: { message: 'Starting fetch' },
+        result: { logged: true },
+        state: 'result'
+      }),
+      createToolInvocation({
+        toolCallId: 'call-3',
+        toolName: 'fetch-tool',
+        args: { endpoint: '/data' },
+        result: { data: ['item1'] },
+        state: 'result'
+      })
+    ]
+  })
+];
+
+const result = await flexibleOrderScorer.run(run);
+console.log(result.score); // 1 - auth-tool comes before fetch-tool
 ```
 
 ## LLM-Based Tool Call Accuracy Scorer
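The flexible-order mode added in this hunk is, in effect, a subsequence check: the expected tools must appear in the actual call sequence in the same relative order, with unrelated tools allowed in between. An illustrative standalone version of that logic (a sketch of the described behavior, not the library's source):

```typescript
// Subsequence check: does `expected` appear in `actual` in order,
// possibly with other tools interleaved? Mirrors the flexible-order
// behavior described above.
function inRelativeOrder(expected: string[], actual: string[]): boolean {
  let next = 0;
  for (const tool of actual) {
    if (tool === expected[next]) next++;
    if (next === expected.length) return true;
  }
  return expected.length === 0;
}

// Matches the flexible-order example: log-tool in between is fine.
console.log(inRelativeOrder(['auth-tool', 'fetch-tool'],
  ['auth-tool', 'log-tool', 'fetch-tool'])); // true

// Strict order would additionally require `actual` to equal `expected` exactly.
console.log(inRelativeOrder(['auth-tool', 'fetch-tool'],
  ['fetch-tool', 'auth-tool'])); // false
```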
@@ -126,82 +334,231 @@ The LLM-based scorer provides:
 3. **Generate Score**: Calculates score based on appropriate vs total tool calls
 4. **Generate Reasoning**: Provides human-readable explanation
 
-
+## LLM-Based Scoring Details
+
+- **Fractional scores**: Returns values between 0.0 and 1.0
+- **Context-aware**: Considers user intent and appropriateness
+- **Explanatory**: Provides reasoning for scores
+
+### LLM-Based Scorer Options
+
+```typescript showLineNumbers copy
+// Basic configuration
+const basicLLMScorer = createLLMScorer({
+  model: openai('gpt-4o-mini'),
+  availableTools: [
+    { name: 'tool1', description: 'Description 1' },
+    { name: 'tool2', description: 'Description 2' }
+  ]
+});
+
+// With different model
+const customModelScorer = createLLMScorer({
+  model: openai('gpt-4'), // More powerful model for complex evaluations
+  availableTools: [...]
+});
+```
+
+### LLM-Based Scorer Results
 
 ```typescript
-
-
+{
+  runId: string,
+  score: number, // 0.0 to 1.0
+  reason: string, // Human-readable explanation
+  analyzeStepResult: {
+    evaluations: Array<{
+      toolCalled: string,
+      wasAppropriate: boolean,
+      reasoning: string
+    }>,
+    missingTools?: string[]
+  }
+}
+```
+
+## LLM-Based Scorer Examples
+
+The LLM-based scorer uses AI to evaluate whether tool selections are appropriate for the user's request.
 
+### Basic LLM evaluation
+
+```typescript filename="src/example-llm-basic.ts" showLineNumbers copy
 const llmScorer = createToolCallAccuracyScorerLLM({
   model: openai('gpt-4o-mini'),
   availableTools: [
-{
-name: 'weather-tool',
-description: 'Get current weather information for any location'
+    {
+      name: 'weather-tool',
+      description: 'Get current weather information for any location'
     },
-{
-name: '
-description: '
+    {
+      name: 'calendar-tool',
+      description: 'Check calendar events and scheduling'
     },
-{
-name: '
-description: '
+    {
+      name: 'search-tool',
+      description: 'Search the web for general information'
     }
   ]
 });
 
-const
-
-
+const inputMessages = [
+  createUIMessage({
+    content: 'What is the weather like in San Francisco today?',
+    role: 'user',
+    id: 'input-1'
+  })
+];
+
+const output = [
+  createUIMessage({
+    content: 'Let me check the current weather for you.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-123',
+        toolName: 'weather-tool',
+        args: { location: 'San Francisco', date: 'today' },
+        result: { temperature: '68°F', condition: 'foggy' },
+        state: 'result'
+      })
+    ]
+  })
+];
+
+const run = createAgentTestRun({ inputMessages, output });
+const result = await llmScorer.run(run);
+
+console.log(result.score); // 1.0 - appropriate tool usage
+console.log(result.reason); // "The agent correctly used the weather-tool to address the user's request for weather information."
 ```
 
-
-
-
-
-
-
-
--
-
-
-
-
-
-
-
--
-
+### Handling inappropriate tool usage
+
+```typescript filename="src/example-llm-inappropriate.ts" showLineNumbers copy
+const inputMessages = [
+  createUIMessage({
+    content: 'What is the weather in Tokyo?',
+    role: 'user',
+    id: 'input-1'
+  })
+];
+
+const inappropriateOutput = [
+  createUIMessage({
+    content: 'Let me search for that information.',
+    role: 'assistant',
+    id: 'output-1',
+    toolInvocations: [
+      createToolInvocation({
+        toolCallId: 'call-456',
+        toolName: 'search-tool', // Less appropriate than weather-tool
+        args: { query: 'Tokyo weather' },
+        result: { results: ['Tokyo weather data...'] },
+        state: 'result'
+      })
+    ]
+  })
+];
+
+const run = createAgentTestRun({ inputMessages, output: inappropriateOutput });
+const result = await llmScorer.run(run);
+
+console.log(result.score); // 0.5 - partially appropriate
+console.log(result.reason); // "The agent used search-tool when weather-tool would have been more appropriate for a direct weather query."
+```
 
-
+### Evaluating clarification requests
+
+The LLM scorer recognizes when agents appropriately ask for clarification:
+
+```typescript filename="src/example-llm-clarification.ts" showLineNumbers copy
+const vagueInput = [
+  createUIMessage({
+    content: 'I need help with something',
+    role: 'user',
+    id: 'input-1'
+  })
+];
+
+const clarificationOutput = [
+  createUIMessage({
+    content: 'I\'d be happy to help! Could you please provide more details about what you need assistance with?',
+    role: 'assistant',
+    id: 'output-1',
+    // No tools called - asking for clarification instead
+  })
+];
+
+const run = createAgentTestRun({
+  inputMessages: vagueInput,
+  output: clarificationOutput
+});
+const result = await llmScorer.run(run);
 
-
+console.log(result.score); // 1.0 - appropriate to ask for clarification
+console.log(result.reason); // "The agent appropriately asked for clarification rather than calling tools with insufficient information."
+```
 
-
-- **Deterministic**: Same input always produces same output
-- **Fast**: No external API calls
+## Comparing Both Scorers
 
-
+Here's an example using both scorers on the same data:
 
--
-
-
+```typescript filename="src/example-comparison.ts" showLineNumbers copy
+import { createToolCallAccuracyScorerCode as createCodeScorer } from '@mastra/evals/scorers/code';
+import { createToolCallAccuracyScorerLLM as createLLMScorer } from '@mastra/evals/scorers/llm';
+import { openai } from '@ai-sdk/openai';
 
-
+// Setup both scorers
+const codeScorer = createCodeScorer({
+  expectedTool: 'weather-tool',
+  strictMode: false
+});
 
-
+const llmScorer = createLLMScorer({
+  model: openai('gpt-4o-mini'),
+  availableTools: [
+    { name: 'weather-tool', description: 'Get weather information' },
+    { name: 'search-tool', description: 'Search the web' }
+  ]
+});
 
-
-
-
-
+// Test data
+const run = createAgentTestRun({
+  inputMessages: [
+    createUIMessage({
+      content: 'What is the weather?',
+      role: 'user',
+      id: 'input-1'
+    })
+  ],
+  output: [
+    createUIMessage({
+      content: 'Let me find that information.',
+      role: 'assistant',
+      id: 'output-1',
+      toolInvocations: [
+        createToolInvocation({
+          toolCallId: 'call-1',
+          toolName: 'search-tool',
+          args: { query: 'weather' },
+          result: { results: ['weather data'] },
+          state: 'result'
+        })
+      ]
+    })
+  ]
+});
 
-
+// Run both scorers
+const codeResult = await codeScorer.run(run);
+const llmResult = await llmScorer.run(run);
 
-
-
-
-
+console.log('Code Scorer:', codeResult.score); // 0 - wrong tool
+console.log('LLM Scorer:', llmResult.score); // 0.3 - partially appropriate
+console.log('LLM Reason:', llmResult.reason); // Explains why search-tool is less appropriate
+```
 
 ## Related
 