@mastra/mcp-docs-server 0.13.10 → 0.13.11-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40internal%2Fstorage-test-utils.md +9 -9
- package/.docs/organized/changelogs/%40internal%2Ftypes-builder.md +2 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +18 -18
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +23 -23
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +20 -20
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-netlify.md +20 -20
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-vercel.md +20 -20
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +20 -20
- package/.docs/organized/changelogs/%40mastra%2Ffirecrawl.md +13 -13
- package/.docs/organized/changelogs/%40mastra%2Flibsql.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +18 -18
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +12 -12
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +21 -21
- package/.docs/organized/changelogs/%40mastra%2Frag.md +12 -12
- package/.docs/organized/changelogs/%40mastra%2Fschema-compat.md +7 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +18 -18
- package/.docs/organized/changelogs/create-mastra.md +9 -9
- package/.docs/organized/changelogs/mastra.md +22 -22
- package/.docs/organized/code-examples/agent-network.md +4 -3
- package/.docs/organized/code-examples/agent.md +33 -2
- package/.docs/raw/agents/overview.mdx +21 -1
- package/.docs/raw/getting-started/mcp-docs-server.mdx +2 -2
- package/.docs/raw/rag/chunking-and-embedding.mdx +11 -0
- package/.docs/raw/reference/agents/agent.mdx +64 -38
- package/.docs/raw/reference/agents/generate.mdx +206 -202
- package/.docs/raw/reference/agents/getAgent.mdx +23 -38
- package/.docs/raw/reference/agents/getDefaultGenerateOptions.mdx +62 -0
- package/.docs/raw/reference/agents/getDefaultStreamOptions.mdx +62 -0
- package/.docs/raw/reference/agents/getDefaultVNextStreamOptions.mdx +62 -0
- package/.docs/raw/reference/agents/getDescription.mdx +30 -0
- package/.docs/raw/reference/agents/getInstructions.mdx +36 -73
- package/.docs/raw/reference/agents/getLLM.mdx +69 -0
- package/.docs/raw/reference/agents/getMemory.mdx +42 -119
- package/.docs/raw/reference/agents/getModel.mdx +36 -75
- package/.docs/raw/reference/agents/getScorers.mdx +62 -0
- package/.docs/raw/reference/agents/getTools.mdx +36 -128
- package/.docs/raw/reference/agents/getVoice.mdx +36 -83
- package/.docs/raw/reference/agents/getWorkflows.mdx +37 -74
- package/.docs/raw/reference/agents/stream.mdx +263 -226
- package/.docs/raw/reference/agents/streamVNext.mdx +208 -402
- package/.docs/raw/reference/rag/chunk.mdx +51 -2
- package/.docs/raw/reference/scorers/answer-relevancy.mdx +6 -6
- package/.docs/raw/reference/scorers/bias.mdx +6 -6
- package/.docs/raw/reference/scorers/completeness.mdx +2 -2
- package/.docs/raw/reference/scorers/content-similarity.mdx +1 -1
- package/.docs/raw/reference/scorers/create-scorer.mdx +445 -0
- package/.docs/raw/reference/scorers/faithfulness.mdx +6 -6
- package/.docs/raw/reference/scorers/hallucination.mdx +6 -6
- package/.docs/raw/reference/scorers/keyword-coverage.mdx +2 -2
- package/.docs/raw/reference/scorers/mastra-scorer.mdx +116 -158
- package/.docs/raw/reference/scorers/toxicity.mdx +2 -2
- package/.docs/raw/scorers/custom-scorers.mdx +166 -268
- package/.docs/raw/scorers/overview.mdx +21 -13
- package/.docs/raw/server-db/local-dev-playground.mdx +3 -3
- package/package.json +3 -3
- package/.docs/raw/reference/agents/createTool.mdx +0 -241
- package/.docs/raw/reference/scorers/custom-code-scorer.mdx +0 -155
- package/.docs/raw/reference/scorers/llm-scorer.mdx +0 -210
|
@@ -1,319 +1,217 @@
|
|
|
1
1
|
## Creating scorers
|
|
2
2
|
|
|
3
|
-
Mastra provides
|
|
3
|
+
Mastra provides a unified `createScorer` factory that allows you to build custom evaluation logic using either JavaScript functions or LLM-based prompt objects for each step. This flexibility lets you choose the best approach for each part of your evaluation pipeline.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
### The Four-Step Pipeline
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
All scorers in Mastra follow a consistent four-step evaluation pipeline:
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
1. **preprocess** (optional): Prepare or transform input/output data
|
|
10
|
+
2. **analyze** (optional): Perform evaluation analysis and gather insights
|
|
11
|
+
3. **generateScore** (required): Convert analysis into a numerical score
|
|
12
|
+
4. **generateReason** (optional): Generate human-readable explanations
|
|
10
13
|
|
|
11
|
-
|
|
14
|
+
Each step can use either **functions** or **prompt objects** (LLM-based evaluation), giving you the flexibility to combine deterministic algorithms with AI judgment as needed.
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
- an optional **extract** step for preprocessing complex data
|
|
15
|
-
- a required **analyze** step for core evaluation and scoring
|
|
16
|
-
- and an optional **reason** step for generating explanations.
|
|
16
|
+
### Functions vs Prompt Objects
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
**Functions** use JavaScript for deterministic logic. They're ideal for:
|
|
19
|
+
- Algorithmic evaluations with clear criteria
|
|
20
|
+
- Performance-critical scenarios
|
|
21
|
+
- Integration with existing libraries
|
|
22
|
+
- Consistent, reproducible results
|
|
19
23
|
|
|
20
|
-
|
|
24
|
+
**Prompt Objects** use LLMs as judges for evaluation. They're perfect for:
|
|
25
|
+
- Subjective evaluations requiring human-like judgment
|
|
26
|
+
- Complex criteria difficult to code algorithmically
|
|
27
|
+
- Natural language understanding tasks
|
|
28
|
+
- Nuanced context evaluation
|
|
21
29
|
|
|
22
|
-
|
|
30
|
+
You can mix and match approaches within a single scorer - for example, use a function for preprocessing data and an LLM for analyzing quality.
|
|
23
31
|
|
|
24
|
-
|
|
25
|
-
- `input`: User messages (when used with agents) or workflow step input (when used with workflow steps)
|
|
26
|
-
- `output`: Agent's response (when used with agents) or workflow step output (when used with workflow steps)
|
|
27
|
-
- `runtimeContext`: Runtime context from the agent or workflow step being evaluated
|
|
28
|
-
- **Must return:** `{ results: any }`
|
|
29
|
-
- **Data flow:** The `results` value is passed to the analyze step as `extractStepResult`
|
|
32
|
+
### Initializing a Scorer
|
|
30
33
|
|
|
31
|
-
|
|
32
|
-
import { createScorer } from "@mastra/core/scores";
|
|
33
|
-
import keywordExtractor from "keyword-extractor";
|
|
34
|
+
Every scorer starts with the `createScorer` factory function, which requires a name and description, and optionally accepts a judge configuration for LLM-based steps.
|
|
34
35
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
```typescript
|
|
37
|
+
import { createScorer } from '@mastra/core/scores';
|
|
38
|
+
import { openai } from '@ai-sdk/openai';
|
|
38
39
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
results: {
|
|
53
|
-
inputKeywords,
|
|
54
|
-
outputKeywords,
|
|
55
|
-
},
|
|
56
|
-
};
|
|
57
|
-
},
|
|
58
|
-
|
|
59
|
-
// ... analyze and reason steps
|
|
60
|
-
});
|
|
40
|
+
const glutenCheckerScorer = createScorer({
|
|
41
|
+
name: 'Gluten Checker',
|
|
42
|
+
description: 'Check if recipes contain gluten ingredients',
|
|
43
|
+
judge: { // Optional: for prompt object steps
|
|
44
|
+
model: openai('gpt-4o'),
|
|
45
|
+
instructions: 'You are a Chef that identifies if recipes contain gluten.'
|
|
46
|
+
}
|
|
47
|
+
})
|
|
48
|
+
// Chain step methods here
|
|
49
|
+
.preprocess(...)
|
|
50
|
+
.analyze(...)
|
|
51
|
+
.generateScore(...)
|
|
52
|
+
.generateReason(...)
|
|
61
53
|
```
|
|
62
54
|
|
|
63
|
-
|
|
55
|
+
The judge configuration is only needed if you plan to use prompt objects in any step. Individual steps can override this default configuration with their own judge settings.
|
|
64
56
|
|
|
65
|
-
|
|
57
|
+
### Step-by-Step Breakdown
|
|
66
58
|
|
|
67
|
-
|
|
68
|
-
- `extractStepResult`: Results from the extract step (if extract step was defined)
|
|
69
|
-
- **Must return:** `{ score: number, results?: any }`
|
|
70
|
-
- **Data flow:** The `score` and optional `results` are passed to the reason step
|
|
59
|
+
#### preprocess Step (Optional)
|
|
71
60
|
|
|
72
|
-
|
|
73
|
-
export const keywordCoverageScorer = createScorer({
|
|
74
|
-
// ... name, description, extract step
|
|
61
|
+
Prepares input/output data when you need to extract specific elements, filter content, or transform complex data structures.
|
|
75
62
|
|
|
76
|
-
|
|
77
|
-
analyze: async ({ input, output, extractStepResult }) => {
|
|
78
|
-
const { inputKeywords, outputKeywords } = extractStepResult.results;
|
|
79
|
-
|
|
80
|
-
if (inputKeywords.size === 0) {
|
|
81
|
-
return { score: 1, results: { coverage: 1, matched: 0, total: 0 } };
|
|
82
|
-
}
|
|
63
|
+
**Functions:** `({ run, results }) => any`
|
|
83
64
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
},
|
|
98
|
-
};
|
|
99
|
-
},
|
|
100
|
-
|
|
101
|
-
// ... reason step
|
|
102
|
-
});
|
|
65
|
+
```typescript
|
|
66
|
+
const glutenCheckerScorer = createScorer(...)
|
|
67
|
+
.preprocess(({ run }) => {
|
|
68
|
+
// Extract and clean recipe text
|
|
69
|
+
const recipeText = run.output.text.toLowerCase();
|
|
70
|
+
const wordCount = recipeText.split(' ').length;
|
|
71
|
+
|
|
72
|
+
return {
|
|
73
|
+
recipeText,
|
|
74
|
+
wordCount,
|
|
75
|
+
hasCommonGlutenWords: /flour|wheat|bread|pasta/.test(recipeText)
|
|
76
|
+
};
|
|
77
|
+
})
|
|
103
78
|
```
|
|
104
79
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
// Step 3: Generate explanation for the score
|
|
119
|
-
reason: async ({ score, analyzeStepResult, extractStepResult }) => {
|
|
120
|
-
const { matched, total, matchedKeywords } = analyzeStepResult.results;
|
|
121
|
-
const { inputKeywords } = extractStepResult.results;
|
|
80
|
+
**Prompt Objects:** Use `description`, `outputSchema`, and `createPrompt` to structure LLM-based preprocessing.
|
|
81
|
+
|
|
82
|
+
```typescript
|
|
83
|
+
const glutenCheckerScorer = createScorer(...)
|
|
84
|
+
.preprocess({
|
|
85
|
+
description: 'Extract ingredients from the recipe',
|
|
86
|
+
outputSchema: z.object({
|
|
87
|
+
ingredients: z.array(z.string()),
|
|
88
|
+
cookingMethods: z.array(z.string())
|
|
89
|
+
}),
|
|
90
|
+
createPrompt: ({ run }) => `
|
|
91
|
+
Extract all ingredients and cooking methods from this recipe:
|
|
92
|
+
${run.output.text}
|
|
122
93
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
);
|
|
127
|
-
|
|
128
|
-
let reason = `The output achieved ${percentage}% keyword coverage (${matched}/${total} keywords).`;
|
|
129
|
-
|
|
130
|
-
if (matchedKeywords.length > 0) {
|
|
131
|
-
reason += ` Covered keywords: ${matchedKeywords.join(", ")}.`;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
if (missedKeywords.length > 0) {
|
|
135
|
-
reason += ` Missing keywords: ${missedKeywords.join(", ")}.`;
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
return { reason };
|
|
139
|
-
},
|
|
140
|
-
});
|
|
94
|
+
Return JSON with ingredients and cookingMethods arrays.
|
|
95
|
+
`
|
|
96
|
+
})
|
|
141
97
|
```
|
|
142
98
|
|
|
143
|
-
**
|
|
144
|
-
- [Custom Native JavaScript Scorer Example](/examples/scorers/custom-native-javascript-eval) - Example walkthrough.
|
|
145
|
-
- [Built-in Code Scorers](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers/code) - Real implementations for reference
|
|
146
|
-
|
|
147
|
-
### LLM-based scorers
|
|
148
|
-
|
|
149
|
-
LLM scorers use `createLLMScorer` to build evaluations that leverage language models as judges. They're perfect for subjective evaluations that require understanding context, complex criteria that are difficult to code algorithmically, natural language understanding tasks, and cases where human-like judgment is needed.
|
|
150
|
-
|
|
151
|
-
LLM scorers follow the same evaluation pipeline as code scorers with an additional `calculateScore` function:
|
|
152
|
-
- an optional **extract** step where the LLM processes input/output and returns structured data
|
|
153
|
-
- a required **analyze** step where the LLM performs evaluation and returns structured analysis
|
|
154
|
-
- a required **calculateScore** function that converts LLM analysis into numerical score
|
|
155
|
-
- and an optional **reason** step where the LLM generates human-readable explanations
|
|
156
|
-
|
|
157
|
-
The `calculateScore` function leverages the best of both approaches: LLMs excel at qualitative analysis and understanding, while deterministic functions ensure precise and consistent numerical scoring.
|
|
158
|
-
|
|
159
|
-
For the complete API reference, see [`createLLMScorer`](/reference/scorers/llm-scorer), and for a detailed explanation of the pipeline, see [evaluation process](/docs/scorers/overview#evaluation-pipeline).
|
|
99
|
+
**Data Flow:** Results are available to subsequent steps as `results.preprocessStepResult`
|
|
160
100
|
|
|
161
|
-
####
|
|
101
|
+
#### analyze Step (Optional)
|
|
162
102
|
|
|
163
|
-
|
|
103
|
+
Performs core evaluation analysis, gathering insights that will inform the scoring decision.
|
|
164
104
|
|
|
165
|
-
|
|
166
|
-
- **model:** The LLM model instance for evaluation
|
|
167
|
-
- **instructions:** System prompt that guides the LLM's behavior
|
|
105
|
+
**Functions:** `({ run, results }) => any`
|
|
168
106
|
|
|
169
|
-
```typescript
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
name: 'Tone Scorer',
|
|
175
|
-
description: 'Evaluates the tone of the output',
|
|
107
|
+
```typescript
|
|
108
|
+
const glutenCheckerScorer = createScorer({...})
|
|
109
|
+
.preprocess(...)
|
|
110
|
+
.analyze(({ run, results }) => {
|
|
111
|
+
const { recipeText, hasCommonGlutenWords } = results.preprocessStepResult;
|
|
176
112
|
|
|
177
|
-
//
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
113
|
+
// Simple gluten detection algorithm
|
|
114
|
+
const glutenKeywords = ['wheat', 'flour', 'barley', 'rye', 'bread'];
|
|
115
|
+
const foundGlutenWords = glutenKeywords.filter(word =>
|
|
116
|
+
recipeText.includes(word)
|
|
117
|
+
);
|
|
182
118
|
|
|
183
|
-
|
|
184
|
-
|
|
119
|
+
return {
|
|
120
|
+
isGlutenFree: foundGlutenWords.length === 0,
|
|
121
|
+
detectedGlutenSources: foundGlutenWords,
|
|
122
|
+
confidence: hasCommonGlutenWords ? 0.9 : 0.7
|
|
123
|
+
};
|
|
124
|
+
})
|
|
185
125
|
```
|
|
186
126
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
1. Main themes (3-5 high-level concepts)
|
|
208
|
-
2. Specific topics mentioned
|
|
209
|
-
3. Key phrases that capture the essence
|
|
210
|
-
|
|
211
|
-
Content: ${run.output.text}
|
|
212
|
-
|
|
213
|
-
Return a JSON object with themes, topics, and keyPhrases arrays.
|
|
214
|
-
`,
|
|
215
|
-
},
|
|
216
|
-
|
|
217
|
-
// ... other steps
|
|
218
|
-
});
|
|
127
|
+
**Prompt Objects:** Use `description`, `outputSchema`, and `createPrompt` for LLM-based analysis.
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
const glutenCheckerScorer = createScorer({...})
|
|
131
|
+
.preprocess(...)
|
|
132
|
+
.analyze({
|
|
133
|
+
description: 'Analyze recipe for gluten content',
|
|
134
|
+
outputSchema: z.object({
|
|
135
|
+
isGlutenFree: z.boolean(),
|
|
136
|
+
glutenSources: z.array(z.string()),
|
|
137
|
+
confidence: z.number().min(0).max(1)
|
|
138
|
+
}),
|
|
139
|
+
createPrompt: ({ run, results }) => `
|
|
140
|
+
Analyze this recipe for gluten content:
|
|
141
|
+
"${results.preprocessStepResult.recipeText}"
|
|
142
|
+
|
|
143
|
+
Look for wheat, barley, rye, and hidden sources like soy sauce.
|
|
144
|
+
Return JSON with isGlutenFree, glutenSources array, and confidence (0-1).
|
|
145
|
+
`
|
|
146
|
+
})
|
|
219
147
|
```
|
|
220
148
|
|
|
221
|
-
|
|
149
|
+
**Data Flow:** Results are available to subsequent steps as `results.analyzeStepResult`
|
|
222
150
|
|
|
223
|
-
|
|
151
|
+
#### generateScore Step (Required)
|
|
224
152
|
|
|
225
|
-
|
|
226
|
-
- **Data flow:** The structured output is passed to the calculateScore function and then to the reason step
|
|
153
|
+
Converts analysis results into a numerical score. This is the only required step in the pipeline.
|
|
227
154
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
accuracy: z.number().min(1).max(5),
|
|
237
|
-
completeness: z.number().min(1).max(5),
|
|
238
|
-
relevance: z.number().min(1).max(5)
|
|
239
|
-
}),
|
|
240
|
-
createPrompt: ({ run }) => `
|
|
241
|
-
Evaluate this content on a scale of 1-5 for:
|
|
242
|
-
- Clarity: How clear and understandable is it?
|
|
243
|
-
- Accuracy: How factually correct does it appear?
|
|
244
|
-
- Completeness: How thorough is the response?
|
|
245
|
-
- Relevance: How well does it address the input?
|
|
246
|
-
|
|
247
|
-
Input: ${run.input.map(i => i.content).join(', ')}
|
|
248
|
-
Output: ${run.output.text}
|
|
249
|
-
|
|
250
|
-
Return a JSON object with numeric scores for each dimension.
|
|
251
|
-
`,
|
|
252
|
-
},
|
|
155
|
+
**Functions:** `({ run, results }) => number`
|
|
156
|
+
|
|
157
|
+
```typescript
|
|
158
|
+
const glutenCheckerScorer = createScorer({...})
|
|
159
|
+
.preprocess(...)
|
|
160
|
+
.analyze(...)
|
|
161
|
+
.generateScore(({ results }) => {
|
|
162
|
+
const { isGlutenFree, confidence } = results.analyzeStepResult;
|
|
253
163
|
|
|
254
|
-
//
|
|
255
|
-
|
|
164
|
+
// Return 1 for gluten-free, 0 for contains gluten
|
|
165
|
+
// Weight by confidence level
|
|
166
|
+
return isGlutenFree ? confidence : 0;
|
|
167
|
+
})
|
|
256
168
|
```
|
|
257
169
|
|
|
258
|
-
|
|
170
|
+
**Prompt Objects:** See the [`createScorer`](/reference/scorers/create-scorer) API reference for details on using prompt objects with generateScore, including the required `calculateScore` function.
|
|
259
171
|
|
|
260
|
-
|
|
172
|
+
**Data Flow:** The score is available to generateReason as the `score` parameter
|
|
261
173
|
|
|
262
|
-
|
|
263
|
-
- **Data flow:** Converts the analyze step's structured output into a numerical score (0-1 range)
|
|
174
|
+
#### generateReason Step (Optional)
|
|
264
175
|
|
|
265
|
-
|
|
266
|
-
export const qualityScorer = createLLMScorer({
|
|
267
|
-
// ... previous steps
|
|
268
|
-
|
|
269
|
-
calculateScore: ({ run }) => {
|
|
270
|
-
const { clarity, accuracy, completeness, relevance } = run.analyzeStepResult;
|
|
271
|
-
|
|
272
|
-
// Calculate weighted average (scale of 1-5 to 0-1)
|
|
273
|
-
const weights = { clarity: 0.3, accuracy: 0.3, completeness: 0.2, relevance: 0.2 };
|
|
274
|
-
const weightedSum = (clarity * weights.clarity) +
|
|
275
|
-
(accuracy * weights.accuracy) +
|
|
276
|
-
(completeness * weights.completeness) +
|
|
277
|
-
(relevance * weights.relevance);
|
|
278
|
-
|
|
279
|
-
// Convert from 1-5 scale to 0-1 scale
|
|
280
|
-
return (weightedSum - 1) / 4;
|
|
281
|
-
},
|
|
282
|
-
|
|
283
|
-
// ... other steps
|
|
284
|
-
});
|
|
285
|
-
```
|
|
176
|
+
Generates human-readable explanations for the score, useful for debugging, transparency, or user feedback.
|
|
286
177
|
|
|
287
|
-
|
|
178
|
+
**Functions:** `({ run, results, score }) => string`
|
|
288
179
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
// ... previous steps
|
|
180
|
+
```typescript
|
|
181
|
+
const glutenCheckerScorer = createScorer({...})
|
|
182
|
+
.preprocess(...)
|
|
183
|
+
.analyze(...)
|
|
184
|
+
.generateScore(...)
|
|
185
|
+
.generateReason(({ results, score }) => {
|
|
186
|
+
const { isGlutenFree, glutenSources } = results.analyzeStepResult;
|
|
297
187
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
188
|
+
if (isGlutenFree) {
|
|
189
|
+
return `Score: ${score}. This recipe is gluten-free with no harmful ingredients detected.`;
|
|
190
|
+
} else {
|
|
191
|
+
return `Score: ${score}. Contains gluten from: ${glutenSources.join(', ')}`;
|
|
192
|
+
}
|
|
193
|
+
})
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
**Prompt Objects:** Use `description` and `createPrompt` for LLM-generated explanations.
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
const glutenCheckerScorer = createScorer({...})
|
|
200
|
+
.preprocess(...)
|
|
201
|
+
.analyze(...)
|
|
202
|
+
.generateScore(...)
|
|
203
|
+
.generateReason({
|
|
204
|
+
description: 'Explain the gluten assessment',
|
|
205
|
+
createPrompt: ({ results, score }) => `
|
|
206
|
+
Explain why this recipe received a score of ${score}.
|
|
207
|
+
Analysis: ${JSON.stringify(results.analyzeStepResult)}
|
|
208
|
+
|
|
209
|
+
Provide a clear explanation for someone with dietary restrictions.
|
|
210
|
+
`
|
|
211
|
+
})
|
|
315
212
|
```
|
|
316
213
|
|
|
317
214
|
**Examples and Resources:**
|
|
318
|
-
- [Custom
|
|
319
|
-
- [
|
|
215
|
+
- [Custom Scorer Example](/examples/scorers/custom-scorer) - Complete walkthrough
|
|
216
|
+
- [createScorer API Reference](/reference/scorers/create-scorer) - Complete technical documentation
|
|
217
|
+
- [Built-in Scorers Source Code](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers) - Real implementations for reference
|
|
@@ -11,28 +11,36 @@ description: Overview of scorers in Mastra, detailing their capabilities for eva
|
|
|
11
11
|
|
|
12
12
|
## Evaluation pipeline
|
|
13
13
|
|
|
14
|
-
Mastra scorers follow
|
|
14
|
+
Mastra scorers follow a flexible four-step pipeline that supports evaluation workflows ranging from simple to complex:
|
|
15
15
|
|
|
16
|
-
1. **
|
|
17
|
-
2. **
|
|
18
|
-
3. **
|
|
16
|
+
1. **preprocess** (Optional): Prepare or transform input/output data for evaluation
|
|
17
|
+
2. **analyze** (Optional): Perform evaluation analysis and gather insights
|
|
18
|
+
3. **generateScore** (Required): Convert analysis into a numerical score
|
|
19
|
+
4. **generateReason** (Optional): Generate explanations or justifications for the score
|
|
19
20
|
|
|
20
21
|
This modular structure enables both simple single-step evaluations and complex multi-stage analysis workflows, allowing you to build evaluations that match your specific needs.
|
|
21
22
|
|
|
22
23
|
### When to use each step
|
|
23
24
|
|
|
24
|
-
**
|
|
25
|
-
-
|
|
26
|
-
-
|
|
25
|
+
**preprocess step** - Use when your content is complex or must be transformed before evaluation:
|
|
26
|
+
- Extracting specific elements from complex data structures
|
|
27
|
+
- Cleaning or normalizing text before analysis
|
|
27
28
|
- Parsing multiple claims that need individual evaluation
|
|
28
|
-
-
|
|
29
|
+
- Filtering content to focus evaluation on relevant sections
|
|
29
30
|
|
|
30
|
-
**
|
|
31
|
-
-
|
|
32
|
-
-
|
|
33
|
-
-
|
|
31
|
+
**analyze step** - Use when you need structured evaluation analysis:
|
|
32
|
+
- Gathering insights that inform the scoring decision
|
|
33
|
+
- Breaking down complex evaluation criteria into components
|
|
34
|
+
- Performing detailed analysis that generateScore will use
|
|
35
|
+
- Collecting evidence or reasoning data for transparency
|
|
34
36
|
|
|
35
|
-
**
|
|
37
|
+
**generateScore step** - Always required for converting analysis to scores:
|
|
38
|
+
- Simple scenarios: Direct scoring of input/output pairs
|
|
39
|
+
- Complex scenarios: Converting detailed analysis results into numerical scores
|
|
40
|
+
- Applying business logic and weighting to analysis results
|
|
41
|
+
- The only step that produces the final numerical score
|
|
42
|
+
|
|
43
|
+
**generateReason step** - Use when explanations are important:
|
|
36
44
|
- Users need to understand why a score was assigned
|
|
37
45
|
- Debugging and transparency are critical
|
|
38
46
|
- Compliance or auditing requires explanations
|
|
@@ -44,7 +44,7 @@ The Playground lets you interact with your agents, workflows, and tools. It prov
|
|
|
44
44
|
Quickly test and debug your agents during development using the interactive chat interface in the Agent Playground.
|
|
45
45
|
|
|
46
46
|
<VideoPlayer
|
|
47
|
-
src="https://res.cloudinary.com/
|
|
47
|
+
src="https://res.cloudinary.com/mastra-assets/video/upload/v1751406022/local-dev-agents-playground_100_m3begx.mp4"
|
|
48
48
|
/>
|
|
49
49
|
|
|
50
50
|
Key features:
|
|
@@ -60,7 +60,7 @@ Key features:
|
|
|
60
60
|
Validate workflows by supplying defined inputs and visualizing each step within the Workflow Playground.
|
|
61
61
|
|
|
62
62
|
<VideoPlayer
|
|
63
|
-
src="https://res.cloudinary.com/
|
|
63
|
+
src="https://res.cloudinary.com/mastra-assets/video/upload/v1751406027/local-dev-workflows-playground_100_rbc466.mp4"
|
|
64
64
|
/>
|
|
65
65
|
|
|
66
66
|
Key features:
|
|
@@ -76,7 +76,7 @@ Key features:
|
|
|
76
76
|
Quickly test and debug custom tools in isolation using the Tools Playground, without running a full agent or workflow.
|
|
77
77
|
|
|
78
78
|
<VideoPlayer
|
|
79
|
-
src="https://res.cloudinary.com/
|
|
79
|
+
src="https://res.cloudinary.com/mastra-assets/video/upload/v1751406316/local-dev-agents-tools_100_fe1jdt.mp4"
|
|
80
80
|
/>
|
|
81
81
|
|
|
82
82
|
Key features:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/mcp-docs-server",
|
|
3
|
-
"version": "0.13.
|
|
3
|
+
"version": "0.13.11-alpha.0",
|
|
4
4
|
"description": "MCP server for accessing Mastra.ai documentation, changelogs, and news.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
"uuid": "^11.1.0",
|
|
33
33
|
"zod": "^3.25.67",
|
|
34
34
|
"zod-to-json-schema": "^3.24.5",
|
|
35
|
-
"@mastra/core": "0.13.
|
|
35
|
+
"@mastra/core": "0.13.2-alpha.0",
|
|
36
36
|
"@mastra/mcp": "^0.10.10"
|
|
37
37
|
},
|
|
38
38
|
"devDependencies": {
|
|
@@ -49,7 +49,7 @@
|
|
|
49
49
|
"typescript": "^5.8.3",
|
|
50
50
|
"vitest": "^3.2.4",
|
|
51
51
|
"@internal/lint": "0.0.28",
|
|
52
|
-
"@mastra/core": "0.13.
|
|
52
|
+
"@mastra/core": "0.13.2-alpha.0"
|
|
53
53
|
},
|
|
54
54
|
"scripts": {
|
|
55
55
|
"prepare-docs": "cross-env PREPARE=true node dist/prepare-docs/prepare.js",
|