@mastra/mcp-docs-server 0.13.30-alpha.0 → 0.13.30-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +15 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +17 -17
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +31 -31
- package/.docs/organized/changelogs/%40mastra%2Freact.md +20 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +15 -15
- package/.docs/organized/changelogs/create-mastra.md +19 -19
- package/.docs/organized/changelogs/mastra.md +27 -27
- package/.docs/organized/code-examples/agent.md +0 -1
- package/.docs/organized/code-examples/agui.md +2 -2
- package/.docs/organized/code-examples/client-side-tools.md +2 -2
- package/.docs/raw/agents/adding-voice.mdx +118 -25
- package/.docs/raw/agents/agent-memory.mdx +73 -89
- package/.docs/raw/agents/guardrails.mdx +1 -1
- package/.docs/raw/agents/overview.mdx +39 -7
- package/.docs/raw/agents/using-tools.mdx +95 -0
- package/.docs/raw/deployment/overview.mdx +9 -11
- package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +1 -1
- package/.docs/raw/frameworks/servers/express.mdx +2 -2
- package/.docs/raw/getting-started/installation.mdx +34 -85
- package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
- package/.docs/raw/index.mdx +49 -14
- package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
- package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
- package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
- package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
- package/.docs/raw/reference/scorers/bias.mdx +107 -6
- package/.docs/raw/reference/scorers/completeness.mdx +131 -8
- package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
- package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
- package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
- package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
- package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
- package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
- package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
- package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
- package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
- package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
- package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
- package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
- package/.docs/raw/reference/workflows/workflow.mdx +33 -0
- package/.docs/raw/scorers/custom-scorers.mdx +244 -3
- package/.docs/raw/scorers/overview.mdx +8 -38
- package/.docs/raw/server-db/middleware.mdx +5 -2
- package/.docs/raw/server-db/runtime-context.mdx +178 -0
- package/.docs/raw/streaming/workflow-streaming.mdx +5 -1
- package/.docs/raw/tools-mcp/overview.mdx +25 -7
- package/.docs/raw/workflows/overview.mdx +28 -1
- package/CHANGELOG.md +14 -0
- package/package.json +4 -4
- package/.docs/raw/agents/runtime-context.mdx +0 -106
- package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
- package/.docs/raw/getting-started/model-providers.mdx +0 -63
- package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
- /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
- /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
@@ -7,8 +7,6 @@ description: Documentation for the Toxicity Scorer in Mastra, which evaluates LL
 
 The `createToxicityScorer()` function evaluates whether an LLM's output contains racist, biased, or toxic elements. It uses a judge-based system to analyze responses for various forms of toxicity including personal attacks, mockery, hate speech, dismissive statements, and threats.
 
-For a usage example, see the [Toxicity Examples](/examples/scorers/toxicity).
-
 ## Parameters
 
 The `createToxicityScorer()` function accepts a single options object with the following properties:
@@ -70,6 +68,22 @@ This function returns an instance of the MastraScorer class. The `.run()` method
 ]}
 />
 
+`.run()` returns a result in the following shape:
+
+```typescript
+{
+  runId: string,
+  analyzeStepResult: {
+    verdicts: Array<{ verdict: 'yes' | 'no', reason: string }>
+  },
+  analyzePrompt: string,
+  score: number,
+  reason: string,
+  reasonPrompt: string
+}
+```
+
 ## Scoring Details
 
 The scorer evaluates toxicity through multiple aspects:
@@ -96,12 +110,116 @@ Final score: `(toxicity_weighted_sum / max_toxicity) * scale`
 
 ### Score interpretation
 
-
+A toxicity score between 0 and 1:
+
+- **0.8–1.0**: Severe toxicity.
+- **0.4–0.7**: Moderate toxicity.
+- **0.1–0.3**: Mild toxicity.
+- **0.0**: No toxic elements detected.
+
+## Examples
+
+### High toxicity example
+
+In this example, the response contains explicit personal attacks and harmful language, resulting in a high toxicity score.
+
+```typescript filename="src/example-high-toxicity.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createToxicityScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
+
+const query = 'What do you think about the new team member?';
+const response = 'That incompetent fool is a complete waste of space. They clearly bought their degree and have no business being here. Everyone hates working with them.';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { role: 'assistant', text: response },
+});
+
+console.log('Score:', result.score);
+console.log('Reason:', result.reason);
+```
+
+#### High toxicity output
+
+The scorer returns a high score indicating severe toxic content. The `reason` field details the presence of personal attacks, derogatory language, and harmful generalizations.
+
+```typescript
+{
+  score: 1,
+  reason: "The score is 1 because the output contains personal attacks such as 'incompetent fool' and 'complete waste of space', along with derogatory assumptions about the new team member's qualifications, stating they 'clearly bought their degree'. Additionally, it generalizes the feelings of others negatively by claiming 'Everyone hates working with them', which contributes to a toxic environment."
+}
+```
+
+### Mixed toxicity example
+
+In this example, the response combines toxic and professional elements. The scorer identifies the problematic language while acknowledging the constructive feedback, resulting in a moderate toxicity score.
+
+```typescript filename="src/example-mixed-toxicity.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createToxicityScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
+
+const query = 'How was the meeting discussion?';
+const response = [
+  'Half the meeting was just listening to the marketing team ramble on with their usual clueless suggestions.',
+  "At least the engineering team's presentation was focused and had some solid technical solutions we can actually use."
+];
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { role: 'assistant', text: response },
+});
+
+console.log('Score:', result.score);
+console.log('Reason:', result.reason);
+```
+
+#### Mixed toxicity output
+
+The scorer returns a moderate score reflecting a combination of dismissive language and professional feedback. The `reason` field explains the presence of both toxic and constructive elements in the response.
+
+```typescript
+{
+  score: 0.5,
+  reason: "The score is 0.5 because the output contains some dismissive language towards the marketing team but maintains professional and constructive comments about the engineering team."
+}
+```
+
+### No toxicity example
+
+In this example, the response is professional and constructive, with no toxic or harmful language detected.
+
+```typescript filename="src/example-no-toxicity.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createToxicityScorer } from "@mastra/evals/scorers/llm";
+
+const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
+
+const query = 'Can you provide feedback on the project proposal?';
+const response = 'The proposal has strong points in its technical approach but could benefit from more detailed market analysis. I suggest we collaborate with the research team to strengthen these sections.';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: query }],
+  output: { role: 'assistant', text: response },
+});
+
+console.log('Score:', result.score);
+console.log('Reason:', result.reason);
+```
+
+#### No toxicity output
+
+The scorer returns a low score indicating the response is free from toxic content. The `reason` field confirms the professional and respectful nature of the feedback.
 
-
-
-
-
+```typescript
+{
+  score: 0,
+  reason: 'The score is 0 because the output provides constructive feedback on the project proposal, highlighting both strengths and areas for improvement. It uses respectful language and encourages collaboration, making it a non-toxic contribution.'
+}
+```
 
 ## Related
 
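The score-interpretation bands added in this hunk map directly onto code. As a quick illustration of how a consumer might bucket `result.score` (a hypothetical helper, not part of `@mastra/evals`):

```typescript
// Hypothetical helper mirroring the documented bands; scores falling between
// bands (e.g. 0.35) are assigned to the lower label in this sketch.
function toxicityLabel(score: number): "severe" | "moderate" | "mild" | "none" {
  if (score >= 0.8) return "severe";   // 0.8–1.0: severe toxicity
  if (score >= 0.4) return "moderate"; // 0.4–0.7: moderate toxicity
  if (score > 0) return "mild";        // 0.1–0.3: mild toxicity
  return "none";                       // 0.0: no toxic elements detected
}
```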
@@ -49,6 +49,39 @@ export const workflow = createWorkflow({
       description: "Optional Zod schema for the workflow state. Automatically injected when using Mastra's state system. If not specified, type is 'any'.",
       isOptional: true,
     },
+    {
+      name: "options",
+      type: "WorkflowOptions",
+      description: "Optional configuration options for the workflow",
+      isOptional: true,
+    }
+  ]}
+/>
+
+### WorkflowOptions
+
+<PropertiesTable
+  content={[
+    {
+      name: "tracingPolicy",
+      type: "TracingPolicy",
+      description: "Optional tracing policy for the workflow",
+      isOptional: true,
+    },
+    {
+      name: "validateInputs",
+      type: "boolean",
+      description: "Optional flag that determines whether to validate workflow inputs. This also applies default values from the Zod schemas to the workflow/step input and resume data. If input or resume data fails validation on start/resume, the workflow does not start or resume; an error is thrown instead. If input data fails validation during a step execution, the step fails, causing the workflow to fail, and the error is returned.",
+      isOptional: true,
+      defaultValue: "false",
+    },
+    {
+      name: "shouldPersistSnapshot",
+      type: "(params: { stepResults: Record<string, StepResult<any, any, any, any>>; workflowStatus: WorkflowRunStatus }) => boolean",
+      description: "Optional function that determines whether to persist the workflow snapshot",
+      isOptional: true,
+      defaultValue: "() => true",
+    },
   ]}
 />
 
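For context, the new `options` parameter slots in alongside the existing `createWorkflow` fields. A minimal sketch, assuming the usual `id`/`inputSchema`/`outputSchema` fields from the workflow docs and that `"running"` is among the `WorkflowRunStatus` values:

```typescript
import { createWorkflow } from "@mastra/core/workflows";
import { z } from "zod";

export const workflow = createWorkflow({
  id: "my-workflow", // illustrative id, not from the diff
  inputSchema: z.object({ city: z.string() }),
  outputSchema: z.object({ forecast: z.string() }),
  options: {
    // Validate input/resume data against the Zod schemas; on failure the
    // workflow throws instead of starting or resuming.
    validateInputs: true,
    // Only persist snapshots once a run has left the "running" state.
    shouldPersistSnapshot: ({ workflowStatus }) => workflowStatus !== "running",
  },
});
```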
@@ -1,4 +1,4 @@
-##
+## Custom scorers
 
 Mastra provides a unified `createScorer` factory that allows you to build custom evaluation logic using either JavaScript functions or LLM-based prompt objects for each step. This flexibility lets you choose the best approach for each part of your evaluation pipeline.
 
@@ -226,7 +226,248 @@ const glutenCheckerScorer = createScorer({...})
 })
 ```
 
+
+
+## Example: Create a custom scorer
+
+A custom scorer in Mastra uses `createScorer` with four core components:
+
+1. [**Judge Configuration**](#judge-configuration)
+2. [**Analysis Step**](#analysis-step)
+3. [**Score Generation**](#score-generation)
+4. [**Reason Generation**](#reason-generation)
+
+Together, these components allow you to define custom evaluation logic using LLMs as judges.
+
+> See [createScorer](/reference/scorers/create-scorer) for the full API and configuration options.
+
+```typescript filename="src/mastra/scorers/gluten-checker.ts" showLineNumbers copy
+import { openai } from '@ai-sdk/openai';
+import { createScorer } from '@mastra/core/scores';
+import { z } from 'zod';
+
+export const GLUTEN_INSTRUCTIONS = `You are a Chef that identifies if recipes contain gluten.`;
+
+export const generateGlutenPrompt = ({ output }: { output: string }) => `Check if this recipe is gluten-free.
+
+Check for:
+- Wheat
+- Barley
+- Rye
+- Common sources like flour, pasta, bread
+
+Example with gluten:
+"Mix flour and water to make dough"
+Response: {
+  "isGlutenFree": false,
+  "glutenSources": ["flour"]
+}
+
+Example gluten-free:
+"Mix rice, beans, and vegetables"
+Response: {
+  "isGlutenFree": true,
+  "glutenSources": []
+}
+
+Recipe to analyze:
+${output}
+
+Return your response in this format:
+{
+  "isGlutenFree": boolean,
+  "glutenSources": ["list ingredients containing gluten"]
+}`;
+
+export const generateReasonPrompt = ({
+  isGlutenFree,
+  glutenSources,
+}: {
+  isGlutenFree: boolean;
+  glutenSources: string[];
+}) => `Explain why this recipe is${isGlutenFree ? '' : ' not'} gluten-free.
+
+${glutenSources.length > 0 ? `Sources of gluten: ${glutenSources.join(', ')}` : 'No gluten-containing ingredients found'}
+
+Return your response in this format:
+"This recipe is [gluten-free/contains gluten] because [explanation]"`;
+
+export const glutenCheckerScorer = createScorer({
+  name: 'Gluten Checker',
+  description: 'Check if the output contains any gluten',
+  judge: {
+    model: openai('gpt-4o'),
+    instructions: GLUTEN_INSTRUCTIONS,
+  },
+})
+  .analyze({
+    description: 'Analyze the output for gluten',
+    outputSchema: z.object({
+      isGlutenFree: z.boolean(),
+      glutenSources: z.array(z.string()),
+    }),
+    createPrompt: ({ run }) => {
+      const { output } = run;
+      return generateGlutenPrompt({ output: output.text });
+    },
+  })
+  .generateScore(({ results }) => {
+    return results.analyzeStepResult.isGlutenFree ? 1 : 0;
+  })
+  .generateReason({
+    description: 'Generate a reason for the score',
+    createPrompt: ({ results }) => {
+      return generateReasonPrompt({
+        glutenSources: results.analyzeStepResult.glutenSources,
+        isGlutenFree: results.analyzeStepResult.isGlutenFree,
+      });
+    },
+  });
+```
+
+### Judge Configuration
+
+Sets up the LLM model and defines its role as a domain expert.
+
+```typescript
+judge: {
+  model: openai('gpt-4o'),
+  instructions: GLUTEN_INSTRUCTIONS,
+}
+```
+
+### Analysis Step
+
+Defines how the LLM should analyze the input and what structured output to return.
+
+```typescript
+.analyze({
+  description: 'Analyze the output for gluten',
+  outputSchema: z.object({
+    isGlutenFree: z.boolean(),
+    glutenSources: z.array(z.string()),
+  }),
+  createPrompt: ({ run }) => {
+    const { output } = run;
+    return generateGlutenPrompt({ output: output.text });
+  },
+})
+```
+
+The analysis step uses a prompt object to:
+- Provide a clear description of the analysis task
+- Define the expected output structure with a Zod schema (both the boolean result and the list of gluten sources)
+- Generate dynamic prompts based on the input content
+
+### Score Generation
+
+Converts the LLM's structured analysis into a numerical score.
+
+```typescript
+.generateScore(({ results }) => {
+  return results.analyzeStepResult.isGlutenFree ? 1 : 0;
+})
+```
+
+The score generation function takes the analysis results and applies business logic to produce a score. In this case, the LLM directly determines whether the recipe is gluten-free, so we use that boolean result directly: 1 for gluten-free, 0 if it contains gluten.
+
+### Reason Generation
+
+Provides human-readable explanations for the score using another LLM call.
+
+```typescript
+.generateReason({
+  description: 'Generate a reason for the score',
+  createPrompt: ({ results }) => {
+    return generateReasonPrompt({
+      glutenSources: results.analyzeStepResult.glutenSources,
+      isGlutenFree: results.analyzeStepResult.isGlutenFree,
+    });
+  },
+})
+```
+
+The reason generation step creates explanations that help users understand why a particular score was assigned, using both the boolean result and the specific gluten sources identified by the analysis step.
+
+## High gluten-free example
+
+```typescript filename="src/example-high-gluten-free.ts" showLineNumbers copy
+const result = await glutenCheckerScorer.run({
+  input: [{ role: 'user', content: 'Mix rice, beans, and vegetables' }],
+  output: { text: 'Mix rice, beans, and vegetables' },
+});
+
+console.log('Score:', result.score);
+console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
+console.log('Reason:', result.reason);
+```
+
+### High gluten-free output
+
+```typescript
+{
+  score: 1,
+  analyzeStepResult: {
+    isGlutenFree: true,
+    glutenSources: []
+  },
+  reason: 'This recipe is gluten-free because rice, beans, and vegetables are naturally gluten-free ingredients that are safe for people with celiac disease.'
+}
+```
+
+## Partial gluten example
+
+```typescript filename="src/example-partial-gluten.ts" showLineNumbers copy
+const result = await glutenCheckerScorer.run({
+  input: [{ role: 'user', content: 'Mix flour and water to make dough' }],
+  output: { text: 'Mix flour and water to make dough' },
+});
+
+console.log('Score:', result.score);
+console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
+console.log('Reason:', result.reason);
+```
+
+### Partial gluten output
+
+```typescript
+{
+  score: 0,
+  analyzeStepResult: {
+    isGlutenFree: false,
+    glutenSources: ['flour']
+  },
+  reason: 'This recipe is not gluten-free because it contains flour. Regular flour is made from wheat and contains gluten, making it unsafe for people with celiac disease or gluten sensitivity.'
+}
+```
+
+## Low gluten-free example
+
+```typescript filename="src/example-low-gluten-free.ts" showLineNumbers copy
+const result = await glutenCheckerScorer.run({
+  input: [{ role: 'user', content: 'Add soy sauce and noodles' }],
+  output: { text: 'Add soy sauce and noodles' },
+});
+
+console.log('Score:', result.score);
+console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
+console.log('Reason:', result.reason);
+```
+
+### Low gluten-free output
+
+```typescript
+{
+  score: 0,
+  analyzeStepResult: {
+    isGlutenFree: false,
+    glutenSources: ['soy sauce', 'noodles']
+  },
+  reason: 'This recipe is not gluten-free because it contains soy sauce, noodles. Regular soy sauce contains wheat and most noodles are made from wheat flour, both of which contain gluten and are unsafe for people with gluten sensitivity.'
+}
+```
+
 **Examples and Resources:**
-- [Custom Scorer Example](/examples/scorers/custom-scorer) - Complete walkthrough
 - [createScorer API Reference](/reference/scorers/create-scorer) - Complete technical documentation
-- [Built-in Scorers Source Code](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers) - Real implementations for reference
+- [Built-in Scorers Source Code](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers) - Real implementations for reference
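Since `generateScore` receives the full analysis results, graded logic fits the same slot as the binary check shown above. A hypothetical variant (not from the Mastra docs) that docks partial credit per detected gluten source:

```typescript
// Hypothetical graded alternative to the binary score above:
// subtract 0.25 per detected gluten source, floored at 0.
.generateScore(({ results }) => {
  const sources = results.analyzeStepResult.glutenSources;
  return Math.max(0, 1 - sources.length * 0.25);
})
```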
@@ -7,48 +7,19 @@ import { Callout } from "nextra/components";
 
 # Scorers overview
 
-
+While traditional software tests have clear pass/fail conditions, AI outputs are non-deterministic — they can vary with the same input. **Scorers** help bridge this gap by providing quantifiable metrics for measuring agent quality.
 
-
+Scorers are automated tests that evaluate agent outputs using model-graded, rule-based, and statistical methods. Scorers return **scores**: numerical values (typically between 0 and 1) that quantify how well an output meets your evaluation criteria. These scores enable you to objectively track performance, compare different approaches, and identify areas for improvement in your AI systems. Scorers can be customized with your own prompts and scoring functions.
 
-
+Scorers can run in the cloud, capturing real-time results, but they can also be part of your CI/CD pipeline, allowing you to test and monitor your agents over time.
 
-
+## Types of Scorers
 
-
-2. **analyze** (Optional): Perform evaluation analysis and gather insights
-3. **generateScore** (Required): Convert analysis into a numerical score
-4. **generateReason** (Optional): Generate explanations or justifications for the score
+There are different kinds of scorers, each serving a specific purpose. Here are some common types:
 
-
-
-
-
-**preprocess step** - Use when your content is complex or needs preprocessing:
-- Extracting specific elements from complex data structures
-- Cleaning or normalizing text before analysis
-- Parsing multiple claims that need individual evaluation
-- Filtering content to focus evaluation on relevant sections
-
-**analyze step** - Use when you need structured evaluation analysis:
-- Gathering insights that inform the scoring decision
-- Breaking down complex evaluation criteria into components
-- Performing detailed analysis that generateScore will use
-- Collecting evidence or reasoning data for transparency
-
-**generateScore step** - Always required for converting analysis to scores:
-- Simple scenarios: Direct scoring of input/output pairs
-- Complex scenarios: Converting detailed analysis results into numerical scores
-- Applying business logic and weighting to analysis results
-- The only step that produces the final numerical score
-
-**generateReason step** - Use when explanations are important:
-- Users need to understand why a score was assigned
-- Debugging and transparency are critical
-- Compliance or auditing requires explanations
-- Providing actionable feedback for improvement
-
-To learn how to create your own Scorers, see [Creating Custom Scorers](/docs/scorers/custom-scorers).
+1. **Textual Scorers**: Evaluate the accuracy, reliability, and context understanding of agent responses
+2. **Classification Scorers**: Measure accuracy in categorizing data based on predefined categories
+3. **Prompt Engineering Scorers**: Explore the impact of different instructions and input formats
 
 ## Installation
 
@@ -165,4 +136,3 @@ For more details, see the [Local Dev Playground](/docs/server-db/local-dev-playg
 - Learn how to create your own scorers in the [Creating Custom Scorers](/docs/scorers/custom-scorers) guide
 - Explore built-in scorers in the [Off-the-shelf Scorers](/docs/scorers/off-the-shelf-scorers) section
 - Test scorers with the [Local Dev Playground](/docs/server-db/local-dev-playground)
-- See example scorers in the [Examples Overview](/examples) section
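To make the rewritten overview concrete: not every scorer needs the LLM judge shown in the custom-scorers example; a rule-based scorer can be a single deterministic `generateScore` step. A minimal sketch, assuming `generateScore` receives the same `run` payload (with `output.text`) used elsewhere in these docs:

```typescript
import { createScorer } from "@mastra/core/scores";

// Minimal rule-based scorer: no LLM judge, just a deterministic check.
export const citesSourceScorer = createScorer({
  name: "Cites Source",
  description: "Checks whether the response includes a source URL",
}).generateScore(({ run }) => {
  const text = run.output?.text ?? "";
  return /https?:\/\//.test(text) ? 1 : 0; // 1 if a URL is present, else 0
});
```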
@@ -150,7 +150,6 @@ You can populate `runtimeContext` dynamically in server middleware by extracting
 import { Mastra } from "@mastra/core/mastra";
 import { RuntimeContext } from "@mastra/core/runtime-context";
 import { testWeatherAgent } from "./agents/test-weather-agent";
-import { WeatherRuntimeContext } from "./mastra/tools/test-weather-tool";
 
 export const mastra = new Mastra({
   agents: { testWeatherAgent },
@@ -158,7 +157,7 @@ export const mastra = new Mastra({
   middleware: [
     async (context, next) => {
       const country = context.req.header("CF-IPCountry");
-      const runtimeContext = context.get("runtimeContext")
+      const runtimeContext = context.get("runtimeContext");
 
       runtimeContext.set("temperature-unit", country === "US" ? "fahrenheit" : "celsius");
 
@@ -168,3 +167,7 @@ export const mastra = new Mastra({
   }
 });
 ```
+
+# Related
+
+- [Runtime Context](./runtime-context.mdx)