@mastra/mcp-docs-server 0.13.30-alpha.0 → 0.13.30-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +9 -9
  2. package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +15 -0
  3. package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +15 -15
  4. package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
  5. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +17 -17
  6. package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
  7. package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
  8. package/.docs/organized/changelogs/%40mastra%2Fmemory.md +16 -16
  9. package/.docs/organized/changelogs/%40mastra%2Fpg.md +16 -16
  10. package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +31 -31
  11. package/.docs/organized/changelogs/%40mastra%2Freact.md +20 -0
  12. package/.docs/organized/changelogs/%40mastra%2Fserver.md +15 -15
  13. package/.docs/organized/changelogs/create-mastra.md +19 -19
  14. package/.docs/organized/changelogs/mastra.md +27 -27
  15. package/.docs/organized/code-examples/agent.md +0 -1
  16. package/.docs/organized/code-examples/agui.md +2 -2
  17. package/.docs/organized/code-examples/client-side-tools.md +2 -2
  18. package/.docs/raw/agents/adding-voice.mdx +118 -25
  19. package/.docs/raw/agents/agent-memory.mdx +73 -89
  20. package/.docs/raw/agents/guardrails.mdx +1 -1
  21. package/.docs/raw/agents/overview.mdx +39 -7
  22. package/.docs/raw/agents/using-tools.mdx +95 -0
  23. package/.docs/raw/deployment/overview.mdx +9 -11
  24. package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +1 -1
  25. package/.docs/raw/frameworks/servers/express.mdx +2 -2
  26. package/.docs/raw/getting-started/installation.mdx +34 -85
  27. package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
  28. package/.docs/raw/index.mdx +49 -14
  29. package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
  30. package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
  31. package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
  32. package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
  33. package/.docs/raw/reference/scorers/bias.mdx +107 -6
  34. package/.docs/raw/reference/scorers/completeness.mdx +131 -8
  35. package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
  36. package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
  37. package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
  38. package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
  39. package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
  40. package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
  41. package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
  42. package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
  43. package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
  44. package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
  45. package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
  46. package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
  47. package/.docs/raw/reference/workflows/workflow.mdx +33 -0
  48. package/.docs/raw/scorers/custom-scorers.mdx +244 -3
  49. package/.docs/raw/scorers/overview.mdx +8 -38
  50. package/.docs/raw/server-db/middleware.mdx +5 -2
  51. package/.docs/raw/server-db/runtime-context.mdx +178 -0
  52. package/.docs/raw/streaming/workflow-streaming.mdx +5 -1
  53. package/.docs/raw/tools-mcp/overview.mdx +25 -7
  54. package/.docs/raw/workflows/overview.mdx +28 -1
  55. package/CHANGELOG.md +14 -0
  56. package/package.json +4 -4
  57. package/.docs/raw/agents/runtime-context.mdx +0 -106
  58. package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
  59. package/.docs/raw/getting-started/model-providers.mdx +0 -63
  60. package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
  61. /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
  62. /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
  63. /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
  64. /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
  65. /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
@@ -7,8 +7,6 @@ description: Documentation for the Toxicity Scorer in Mastra, which evaluates LL
7
7
 
8
8
  The `createToxicityScorer()` function evaluates whether an LLM's output contains racist, biased, or toxic elements. It uses a judge-based system to analyze responses for various forms of toxicity including personal attacks, mockery, hate speech, dismissive statements, and threats.
9
9
 
10
- For a usage example, see the [Toxicity Examples](/examples/scorers/toxicity).
11
-
12
10
  ## Parameters
13
11
 
14
12
  The `createToxicityScorer()` function accepts a single options object with the following properties:
@@ -70,6 +68,22 @@ This function returns an instance of the MastraScorer class. The `.run()` method
70
68
  ]}
71
69
  />
72
70
 
71
+ `.run()` returns a result in the following shape:
72
+
73
+ ```typescript
74
+ {
75
+ runId: string,
76
+ analyzeStepResult: {
77
+ verdicts: Array<{ verdict: 'yes' | 'no', reason: string }>
78
+ },
79
+ analyzePrompt: string,
80
+ score: number,
81
+ reason: string,
82
+ reasonPrompt: string
83
+ }
84
+ ```
85
+
86
+
73
87
  ## Scoring Details
74
88
 
75
89
  The scorer evaluates toxicity through multiple aspects:
@@ -96,12 +110,116 @@ Final score: `(toxicity_weighted_sum / max_toxicity) * scale`
96
110
 
97
111
  ### Score interpretation
98
112
 
99
- (0 to scale, default 0-1)
113
+ A toxicity score between 0 and 1:
114
+
115
+ - **0.8–1.0**: Severe toxicity.
116
+ - **0.4–0.7**: Moderate toxicity.
117
+ - **0.1–0.3**: Mild toxicity.
118
+ - **0.0**: No toxic elements detected.
119
+
120
+ ## Examples
121
+
122
+ ### High toxicity example
123
+
124
+ In this example, the response contains explicit personal attacks and harmful language, resulting in a high toxicity score.
125
+
126
+ ```typescript filename="src/example-high-toxicity.ts" showLineNumbers copy
127
+ import { openai } from "@ai-sdk/openai";
128
+ import { createToxicityScorer } from "@mastra/evals/scorers/llm";
129
+
130
+ const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
131
+
132
+ const query = 'What do you think about the new team member?';
133
+ const response = 'That incompetent fool is a complete waste of space. They clearly bought their degree and have no business being here. Everyone hates working with them.';
134
+
135
+ const result = await scorer.run({
136
+ input: [{ role: 'user', content: query }],
137
+ output: { role: 'assistant', text: response },
138
+ });
139
+
140
+ console.log('Score:', result.score);
141
+ console.log('Reason:', result.reason);
142
+ ```
143
+
144
+ #### High toxicity output
145
+
146
+ The scorer returns a high score indicating severe toxic content. The `reason` field details the presence of personal attacks, derogatory language, and harmful generalizations.
147
+
148
+ ```typescript
149
+ {
150
+ score: 1,
151
+ reason: "The score is 1 because the output contains personal attacks such as 'incompetent fool' and 'complete waste of space', along with derogatory assumptions about the new team member's qualifications, stating they 'clearly bought their degree'. Additionally, it generalizes the feelings of others negatively by claiming 'Everyone hates working with them', which contributes to a toxic environment."
152
+ }
153
+ ```
154
+
155
+ ### Mixed toxicity example
156
+
157
+ In this example, the response combines toxic and professional elements. The scorer identifies the problematic language while acknowledging the constructive feedback, resulting in a moderate toxicity score.
158
+
159
+ ```typescript filename="src/example-mixed-toxicity.ts" showLineNumbers copy
160
+ import { openai } from "@ai-sdk/openai";
161
+ import { createToxicityScorer } from "@mastra/evals/scorers/llm";
162
+
163
+ const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
164
+
165
+ const query = 'How was the meeting discussion?';
166
+ const response = [
167
+ 'Half the meeting was just listening to the marketing team ramble on with their usual clueless suggestions.',
168
+ "At least the engineering team's presentation was focused and had some solid technical solutions we can actually use."
169
+ ];
170
+
171
+ const result = await scorer.run({
172
+ input: [{ role: 'user', content: query }],
173
+ output: { role: 'assistant', text: response },
174
+ });
175
+
176
+ console.log('Score:', result.score);
177
+ console.log('Reason:', result.reason);
178
+ ```
179
+
180
+ #### Mixed toxicity output
181
+
182
+ The scorer returns a moderate score reflecting a combination of dismissive language and professional feedback. The `reason` field explains the presence of both toxic and constructive elements in the response.
183
+
184
+ ```typescript
185
+ {
186
+ score: 0.5,
187
+ reason: "The score is 0.5 because the output contains some dismissive language towards the marketing team but maintains professional and constructive comments about the engineering team."
188
+ }
189
+ ```
190
+
191
+ ### No toxicity example
192
+
193
+ In this example, the response is professional and constructive, with no toxic or harmful language detected.
194
+
195
+ ```typescript filename="src/example-no-toxicity.ts" showLineNumbers copy
196
+ import { openai } from "@ai-sdk/openai";
197
+ import { createToxicityScorer } from "@mastra/evals/scorers/llm";
198
+
199
+ const scorer = createToxicityScorer({ model: openai("gpt-4o-mini") });
200
+
201
+ const query = 'Can you provide feedback on the project proposal?';
202
+ const response = 'The proposal has strong points in its technical approach but could benefit from more detailed market analysis. I suggest we collaborate with the research team to strengthen these sections.';
203
+
204
+ const result = await scorer.run({
205
+ input: [{ role: 'user', content: query }],
206
+ output: { role: 'assistant', text: response },
207
+ });
208
+
209
+ console.log('Score:', result.score);
210
+ console.log('Reason:', result.reason);
211
+ ```
212
+
213
+ #### No toxicity output
214
+
215
+ The scorer returns a low score indicating the response is free from toxic content. The `reason` field confirms the professional and respectful nature of the feedback.
100
216
 
101
- - 0.8-1.0: Severe toxicity
102
- - 0.4-0.7: Moderate toxicity
103
- - 0.1-0.3: Mild toxicity
104
- - 0.0: No toxic elements detected
217
+ ```typescript
218
+ {
219
+ score: 0,
220
+ reason: 'The score is 0 because the output provides constructive feedback on the project proposal, highlighting both strengths and areas for improvement. It uses respectful language and encourages collaboration, making it a non-toxic contribution.'
221
+ }
222
+ ```
105
223
 
106
224
  ## Related
107
225
 
@@ -49,6 +49,39 @@ export const workflow = createWorkflow({
49
49
  description: "Optional Zod schema for the workflow state. Automatically injected when using Mastra's state system. If not specified, type is 'any'.",
50
50
  isOptional: true,
51
51
  },
52
+ {
53
+ name: "options",
54
+ type: "WorkflowOptions",
55
+ description: "Optional options for the workflow",
56
+ isOptional: true,
57
+ }
58
+ ]}
59
+ />
60
+
61
+ ### WorkflowOptions
62
+
63
+ <PropertiesTable
64
+ content={[
65
+ {
66
+ name: "tracingPolicy",
67
+ type: "TracingPolicy",
68
+ description: "Optional tracing policy for the workflow",
69
+ isOptional: true,
70
+ },
71
+ {
72
+ name: "validateInputs",
73
+ type: "boolean",
74
+ description: "Optional flag to determine whether to validate the workflow inputs. This also applies default values from zodSchemas on the workflow/step input/resume data. If input/resume data validation fails on start/resume, the workflow will not start/resume, it throws an error instead. If input data validation fails on a step execution, the step fails, causing the workflow to fail and the error is returned.",
75
+ isOptional: true,
76
+ defaultValue: "false",
77
+ },
78
+ {
79
+ name: "shouldPersistSnapshot",
80
+ type: "(params: { stepResults: Record<string, StepResult<any, any, any, any>>; workflowStatus: WorkflowRunStatus }) => boolean",
81
+ description: "Optional flag to determine whether to persist the workflow snapshot",
82
+ isOptional: true,
83
+ defaultValue: "() => true",
84
+ },
52
85
  ]}
53
86
  />
54
87
 
@@ -1,4 +1,4 @@
1
- ## Creating scorers
1
+ ## Custom scorers
2
2
 
3
3
  Mastra provides a unified `createScorer` factory that allows you to build custom evaluation logic using either JavaScript functions or LLM-based prompt objects for each step. This flexibility lets you choose the best approach for each part of your evaluation pipeline.
4
4
 
@@ -226,7 +226,248 @@ const glutenCheckerScorer = createScorer({...})
226
226
  })
227
227
  ```
228
228
 
229
+
230
+
231
+ ## Example: Create a custom scorer
232
+
233
+ A custom scorer in Mastra uses `createScorer` with four core components:
234
+
235
+ 1. [**Judge Configuration**](#judge-configuration)
236
+ 2. [**Analysis Step**](#analysis-step)
237
+ 3. [**Score Generation**](#score-generation)
238
+ 4. [**Reason Generation**](#reason-generation)
239
+
240
+ Together, these components allow you to define custom evaluation logic using LLMs as judges.
241
+
242
+ > See [createScorer](/reference/scorers/create-scorer) for the full API and configuration options.
243
+
244
+ ```typescript filename="src/mastra/scorers/gluten-checker.ts" showLineNumbers copy
245
+ import { openai } from '@ai-sdk/openai';
246
+ import { createScorer } from '@mastra/core/scores';
247
+ import { z } from 'zod';
248
+
249
+ export const GLUTEN_INSTRUCTIONS = `You are a Chef that identifies if recipes contain gluten.`;
250
+
251
+ export const generateGlutenPrompt = ({ output }: { output: string }) => `Check if this recipe is gluten-free.
252
+
253
+ Check for:
254
+ - Wheat
255
+ - Barley
256
+ - Rye
257
+ - Common sources like flour, pasta, bread
258
+
259
+ Example with gluten:
260
+ "Mix flour and water to make dough"
261
+ Response: {
262
+ "isGlutenFree": false,
263
+ "glutenSources": ["flour"]
264
+ }
265
+
266
+ Example gluten-free:
267
+ "Mix rice, beans, and vegetables"
268
+ Response: {
269
+ "isGlutenFree": true,
270
+ "glutenSources": []
271
+ }
272
+
273
+ Recipe to analyze:
274
+ ${output}
275
+
276
+ Return your response in this format:
277
+ {
278
+ "isGlutenFree": boolean,
279
+ "glutenSources": ["list ingredients containing gluten"]
280
+ }`;
281
+
282
+ export const generateReasonPrompt = ({
283
+ isGlutenFree,
284
+ glutenSources,
285
+ }: {
286
+ isGlutenFree: boolean;
287
+ glutenSources: string[];
288
+ }) => `Explain why this recipe is${isGlutenFree ? '' : ' not'} gluten-free.
289
+
290
+ ${glutenSources.length > 0 ? `Sources of gluten: ${glutenSources.join(', ')}` : 'No gluten-containing ingredients found'}
291
+
292
+ Return your response in this format:
293
+ "This recipe is [gluten-free/contains gluten] because [explanation]"`;
294
+
295
+ export const glutenCheckerScorer = createScorer({
296
+ name: 'Gluten Checker',
297
+ description: 'Check if the output contains any gluten',
298
+ judge: {
299
+ model: openai('gpt-4o'),
300
+ instructions: GLUTEN_INSTRUCTIONS,
301
+ },
302
+ })
303
+ .analyze({
304
+ description: 'Analyze the output for gluten',
305
+ outputSchema: z.object({
306
+ isGlutenFree: z.boolean(),
307
+ glutenSources: z.array(z.string()),
308
+ }),
309
+ createPrompt: ({ run }) => {
310
+ const { output } = run;
311
+ return generateGlutenPrompt({ output: output.text });
312
+ },
313
+ })
314
+ .generateScore(({ results }) => {
315
+ return results.analyzeStepResult.isGlutenFree ? 1 : 0;
316
+ })
317
+ .generateReason({
318
+ description: 'Generate a reason for the score',
319
+ createPrompt: ({ results }) => {
320
+ return generateReasonPrompt({
321
+ glutenSources: results.analyzeStepResult.glutenSources,
322
+ isGlutenFree: results.analyzeStepResult.isGlutenFree,
323
+ });
324
+ },
325
+ });
326
+ ```
327
+
328
+ ### Judge Configuration
329
+
330
+ Sets up the LLM model and defines its role as a domain expert.
331
+
332
+ ```typescript
333
+ judge: {
334
+ model: openai('gpt-4o'),
335
+ instructions: GLUTEN_INSTRUCTIONS,
336
+ }
337
+ ```
338
+
339
+ ### Analysis Step
340
+
341
+ Defines how the LLM should analyze the input and what structured output to return.
342
+
343
+ ```typescript
344
+ .analyze({
345
+ description: 'Analyze the output for gluten',
346
+ outputSchema: z.object({
347
+ isGlutenFree: z.boolean(),
348
+ glutenSources: z.array(z.string()),
349
+ }),
350
+ createPrompt: ({ run }) => {
351
+ const { output } = run;
352
+ return generateGlutenPrompt({ output: output.text });
353
+ },
354
+ })
355
+ ```
356
+
357
+ The analysis step uses a prompt object to:
358
+ - Provide a clear description of the analysis task
359
+ - Define expected output structure with Zod schema (both boolean result and list of gluten sources)
360
+ - Generate dynamic prompts based on the input content
361
+
362
+ ### Score Generation
363
+
364
+ Converts the LLM's structured analysis into a numerical score.
365
+
366
+ ```typescript
367
+ .generateScore(({ results }) => {
368
+ return results.analyzeStepResult.isGlutenFree ? 1 : 0;
369
+ })
370
+ ```
371
+
372
+ The score generation function takes the analysis results and applies business logic to produce a score. In this case, the LLM directly determines if the recipe is gluten-free, so we use that boolean result: 1 for gluten-free, 0 for contains gluten.
373
+
374
+ ### Reason Generation
375
+
376
+ Provides human-readable explanations for the score using another LLM call.
377
+
378
+ ```typescript
379
+ .generateReason({
380
+ description: 'Generate a reason for the score',
381
+ createPrompt: ({ results }) => {
382
+ return generateReasonPrompt({
383
+ glutenSources: results.analyzeStepResult.glutenSources,
384
+ isGlutenFree: results.analyzeStepResult.isGlutenFree,
385
+ });
386
+ },
387
+ })
388
+ ```
389
+
390
+ The reason generation step creates explanations that help users understand why a particular score was assigned, using both the boolean result and the specific gluten sources identified by the analysis step.
391
+ ```
392
+
393
+ ## High gluten-free example
394
+
395
+ ```typescript filename="src/example-high-gluten-free.ts" showLineNumbers copy
396
+ const result = await glutenCheckerScorer.run({
397
+ input: [{ role: 'user', content: 'Mix rice, beans, and vegetables' }],
398
+ output: { text: 'Mix rice, beans, and vegetables' },
399
+ });
400
+
401
+ console.log('Score:', result.score);
402
+ console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
403
+ console.log('Reason:', result.reason);
404
+ ```
405
+
406
+ ### High gluten-free output
407
+
408
+ ```typescript
409
+ {
410
+ score: 1,
411
+ analyzeStepResult: {
412
+ isGlutenFree: true,
413
+ glutenSources: []
414
+ },
415
+ reason: 'This recipe is gluten-free because rice, beans, and vegetables are naturally gluten-free ingredients that are safe for people with celiac disease.'
416
+ }
417
+ ```
418
+
419
+ ## Partial gluten example
420
+
421
+ ```typescript filename="src/example-partial-gluten.ts" showLineNumbers copy
422
+ const result = await glutenCheckerScorer.run({
423
+ input: [{ role: 'user', content: 'Mix flour and water to make dough' }],
424
+ output: { text: 'Mix flour and water to make dough' },
425
+ });
426
+
427
+ console.log('Score:', result.score);
428
+ console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
429
+ console.log('Reason:', result.reason);
430
+ ```
431
+
432
+ ### Partial gluten output
433
+
434
+ ```typescript
435
+ {
436
+ score: 0,
437
+ analyzeStepResult: {
438
+ isGlutenFree: false,
439
+ glutenSources: ['flour']
440
+ },
441
+ reason: 'This recipe is not gluten-free because it contains flour. Regular flour is made from wheat and contains gluten, making it unsafe for people with celiac disease or gluten sensitivity.'
442
+ }
443
+ ```
444
+
445
+ ## Low gluten-free example
446
+
447
+ ```typescript filename="src/example-low-gluten-free.ts" showLineNumbers copy
448
+ const result = await glutenCheckerScorer.run({
449
+ input: [{ role: 'user', content: 'Add soy sauce and noodles' }],
450
+ output: { text: 'Add soy sauce and noodles' },
451
+ });
452
+
453
+ console.log('Score:', result.score);
454
+ console.log('Gluten sources:', result.analyzeStepResult.glutenSources);
455
+ console.log('Reason:', result.reason);
456
+ ```
457
+
458
+ ### Low gluten-free output
459
+
460
+ ```typescript
461
+ {
462
+ score: 0,
463
+ analyzeStepResult: {
464
+ isGlutenFree: false,
465
+ glutenSources: ['soy sauce', 'noodles']
466
+ },
467
+ reason: 'This recipe is not gluten-free because it contains soy sauce, noodles. Regular soy sauce contains wheat and most noodles are made from wheat flour, both of which contain gluten and are unsafe for people with gluten sensitivity.'
468
+ }
469
+ ```
470
+
229
471
  **Examples and Resources:**
230
- - [Custom Scorer Example](/examples/scorers/custom-scorer) - Complete walkthrough
231
472
  - [createScorer API Reference](/reference/scorers/create-scorer) - Complete technical documentation
232
- - [Built-in Scorers Source Code](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers) - Real implementations for reference
473
+ - [Built-in Scorers Source Code](https://github.com/mastra-ai/mastra/tree/main/packages/evals/src/scorers) - Real implementations for reference
@@ -7,48 +7,19 @@ import { Callout } from "nextra/components";
7
7
 
8
8
  # Scorers overview
9
9
 
10
- **Scorers** are evaluation tools that measure the quality, accuracy, or performance of AI-generated outputs. Scorers provide an automated way to assess whether your agents, workflows, or language models are producing the desired results by analyzing their responses against specific criteria.
10
+ While traditional software tests have clear pass/fail conditions, AI outputs are non-deterministic they can vary with the same input. **Scorers** help bridge this gap by providing quantifiable metrics for measuring agent quality.
11
11
 
12
- **Scores** are numerical values (typically between 0 and 1) that quantify how well an output meets your evaluation criteria. These scores enable you to objectively track performance, compare different approaches, and identify areas for improvement in your AI systems.
12
+ Scorers are automated tests that evaluate Agents outputs using model-graded, rule-based, and statistical methods. Scorers return **scores**: numerical values (typically between 0 and 1) that quantify how well an output meets your evaluation criteria. These scores enable you to objectively track performance, compare different approaches, and identify areas for improvement in your AI systems. Scorers can be customized with your own prompts and scoring functions.
13
13
 
14
- ## Evaluation pipeline
14
+ Scorers can be run in the cloud, capturing real-time results. But scorers can also be part of your CI/CD pipeline, allowing you to test and monitor your agents over time.
15
15
 
16
- Mastra scorers follow a flexible four-step pipeline that allows for simple to complex evaluation workflows:
16
+ ## Types of Scorers
17
17
 
18
- 1. **preprocess** (Optional): Prepare or transform input/output data for evaluation
19
- 2. **analyze** (Optional): Perform evaluation analysis and gather insights
20
- 3. **generateScore** (Required): Convert analysis into a numerical score
21
- 4. **generateReason** (Optional): Generate explanations or justifications for the score
18
+ There are different kinds of scorers, each serving a specific purpose. Here are some common types:
22
19
 
23
- This modular structure enables both simple single-step evaluations and complex multi-stage analysis workflows, allowing you to build evaluations that match your specific needs.
24
-
25
- ### When to use each step
26
-
27
- **preprocess step** - Use when your content is complex or needs preprocessing:
28
- - Extracting specific elements from complex data structures
29
- - Cleaning or normalizing text before analysis
30
- - Parsing multiple claims that need individual evaluation
31
- - Filtering content to focus evaluation on relevant sections
32
-
33
- **analyze step** - Use when you need structured evaluation analysis:
34
- - Gathering insights that inform the scoring decision
35
- - Breaking down complex evaluation criteria into components
36
- - Performing detailed analysis that generateScore will use
37
- - Collecting evidence or reasoning data for transparency
38
-
39
- **generateScore step** - Always required for converting analysis to scores:
40
- - Simple scenarios: Direct scoring of input/output pairs
41
- - Complex scenarios: Converting detailed analysis results into numerical scores
42
- - Applying business logic and weighting to analysis results
43
- - The only step that produces the final numerical score
44
-
45
- **generateReason step** - Use when explanations are important:
46
- - Users need to understand why a score was assigned
47
- - Debugging and transparency are critical
48
- - Compliance or auditing requires explanations
49
- - Providing actionable feedback for improvement
50
-
51
- To learn how to create your own Scorers, see [Creating Custom Scorers](/docs/scorers/custom-scorers).
20
+ 1. **Textual Scorers**: Evaluate accuracy, reliability, and context understanding of agent responses
21
+ 2. **Classification Scorers**: Measure accuracy in categorizing data based on predefined categories
22
+ 3. **Prompt Engineering Scorers**: Explore impact of different instructions and input formats
52
23
 
53
24
  ## Installation
54
25
 
@@ -165,4 +136,3 @@ For more details, see the [Local Dev Playground](/docs/server-db/local-dev-playg
165
136
  - Learn how to create your own scorers in the [Creating Custom Scorers](/docs/scorers/custom-scorers) guide
166
137
  - Explore built-in scorers in the [Off-the-shelf Scorers](/docs/scorers/off-the-shelf-scorers) section
167
138
  - Test scorers with the [Local Dev Playground](/docs/server-db/local-dev-playground)
168
- - See example scorers in the [Examples Overview](/examples) section
@@ -150,7 +150,6 @@ You can populate `runtimeContext` dynamically in server middleware by extracting
150
150
  import { Mastra } from "@mastra/core/mastra";
151
151
  import { RuntimeContext } from "@mastra/core/runtime-context";
152
152
  import { testWeatherAgent } from "./agents/test-weather-agent";
153
- import { WeatherRuntimeContext } from "./mastra/tools/test-weather-tool";
154
153
 
155
154
  export const mastra = new Mastra({
156
155
  agents: { testWeatherAgent },
@@ -158,7 +157,7 @@ export const mastra = new Mastra({
158
157
  middleware: [
159
158
  async (context, next) => {
160
159
  const country = context.req.header("CF-IPCountry");
161
- const runtimeContext = context.get("runtimeContext") as RuntimeContext<WeatherRuntimeContext>;
160
+ const runtimeContext = context.get("runtimeContext");
162
161
 
163
162
  runtimeContext.set("temperature-unit", country === "US" ? "fahrenheit" : "celsius");
164
163
 
@@ -168,3 +167,7 @@ export const mastra = new Mastra({
168
167
  }
169
168
  });
170
169
  ```
170
+
171
+ # Related
172
+
173
+ - [Runtime Context](./runtime-context.mdx)