@mastra/mcp-docs-server 0.13.17-alpha.3 → 0.13.17-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +25 -0
  2. package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +19 -0
  3. package/.docs/organized/changelogs/%40mastra%2Fastra.md +10 -10
  4. package/.docs/organized/changelogs/%40mastra%2Fauth.md +6 -0
  5. package/.docs/organized/changelogs/%40mastra%2Fchroma.md +10 -10
  6. package/.docs/organized/changelogs/%40mastra%2Fclickhouse.md +10 -10
  7. package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +26 -26
  8. package/.docs/organized/changelogs/%40mastra%2Fcloud.md +10 -10
  9. package/.docs/organized/changelogs/%40mastra%2Fcloudflare-d1.md +10 -10
  10. package/.docs/organized/changelogs/%40mastra%2Fcloudflare.md +11 -11
  11. package/.docs/organized/changelogs/%40mastra%2Fcore.md +46 -46
  12. package/.docs/organized/changelogs/%40mastra%2Fcouchbase.md +10 -10
  13. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +19 -0
  14. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +25 -25
  15. package/.docs/organized/changelogs/%40mastra%2Fdeployer-netlify.md +11 -11
  16. package/.docs/organized/changelogs/%40mastra%2Fdeployer-vercel.md +12 -12
  17. package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +33 -33
  18. package/.docs/organized/changelogs/%40mastra%2Fdynamodb.md +11 -11
  19. package/.docs/organized/changelogs/%40mastra%2Fevals.md +19 -19
  20. package/.docs/organized/changelogs/%40mastra%2Ffastembed.md +6 -0
  21. package/.docs/organized/changelogs/%40mastra%2Ffirecrawl.md +20 -20
  22. package/.docs/organized/changelogs/%40mastra%2Fgithub.md +19 -19
  23. package/.docs/organized/changelogs/%40mastra%2Flance.md +10 -10
  24. package/.docs/organized/changelogs/%40mastra%2Flibsql.md +10 -10
  25. package/.docs/organized/changelogs/%40mastra%2Floggers.md +11 -11
  26. package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +27 -27
  27. package/.docs/organized/changelogs/%40mastra%2Fmcp-registry-registry.md +20 -20
  28. package/.docs/organized/changelogs/%40mastra%2Fmcp.md +19 -19
  29. package/.docs/organized/changelogs/%40mastra%2Fmem0.md +10 -10
  30. package/.docs/organized/changelogs/%40mastra%2Fmemory.md +24 -24
  31. package/.docs/organized/changelogs/%40mastra%2Fmongodb.md +11 -11
  32. package/.docs/organized/changelogs/%40mastra%2Fmssql.md +10 -5
  33. package/.docs/organized/changelogs/%40mastra%2Fopensearch.md +10 -10
  34. package/.docs/organized/changelogs/%40mastra%2Fpg.md +19 -19
  35. package/.docs/organized/changelogs/%40mastra%2Fpinecone.md +10 -10
  36. package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +34 -34
  37. package/.docs/organized/changelogs/%40mastra%2Fqdrant.md +13 -13
  38. package/.docs/organized/changelogs/%40mastra%2Frag.md +10 -10
  39. package/.docs/organized/changelogs/%40mastra%2Fragie.md +19 -19
  40. package/.docs/organized/changelogs/%40mastra%2Fschema-compat.md +13 -0
  41. package/.docs/organized/changelogs/%40mastra%2Fserver.md +25 -25
  42. package/.docs/organized/changelogs/%40mastra%2Fturbopuffer.md +11 -11
  43. package/.docs/organized/changelogs/%40mastra%2Fupstash.md +10 -10
  44. package/.docs/organized/changelogs/%40mastra%2Fvectorize.md +10 -10
  45. package/.docs/organized/changelogs/%40mastra%2Fvoice-azure.md +10 -10
  46. package/.docs/organized/changelogs/%40mastra%2Fvoice-cloudflare.md +10 -10
  47. package/.docs/organized/changelogs/%40mastra%2Fvoice-deepgram.md +10 -10
  48. package/.docs/organized/changelogs/%40mastra%2Fvoice-elevenlabs.md +10 -10
  49. package/.docs/organized/changelogs/%40mastra%2Fvoice-gladia.md +9 -0
  50. package/.docs/organized/changelogs/%40mastra%2Fvoice-google-gemini-live.md +19 -0
  51. package/.docs/organized/changelogs/%40mastra%2Fvoice-google.md +10 -10
  52. package/.docs/organized/changelogs/%40mastra%2Fvoice-murf.md +10 -10
  53. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai-realtime.md +19 -19
  54. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai.md +10 -10
  55. package/.docs/organized/changelogs/%40mastra%2Fvoice-playai.md +11 -11
  56. package/.docs/organized/changelogs/%40mastra%2Fvoice-sarvam.md +11 -11
  57. package/.docs/organized/changelogs/%40mastra%2Fvoice-speechify.md +10 -10
  58. package/.docs/organized/changelogs/create-mastra.md +13 -13
  59. package/.docs/organized/changelogs/mastra.md +31 -31
  60. package/.docs/organized/code-examples/a2a.md +1 -1
  61. package/.docs/organized/code-examples/agent-network.md +1 -1
  62. package/.docs/organized/code-examples/agent.md +22 -1
  63. package/.docs/organized/code-examples/agui.md +1 -1
  64. package/.docs/organized/code-examples/ai-sdk-useChat.md +1 -1
  65. package/.docs/organized/code-examples/ai-sdk-v5.md +2 -2
  66. package/.docs/organized/code-examples/assistant-ui.md +3 -3
  67. package/.docs/organized/code-examples/bird-checker-with-express.md +1 -1
  68. package/.docs/organized/code-examples/bird-checker-with-nextjs-and-eval.md +1 -1
  69. package/.docs/organized/code-examples/bird-checker-with-nextjs.md +1 -1
  70. package/.docs/organized/code-examples/client-side-tools.md +1 -1
  71. package/.docs/organized/code-examples/crypto-chatbot.md +1 -1
  72. package/.docs/organized/code-examples/experimental-auth-weather-agent.md +1 -1
  73. package/.docs/organized/code-examples/fireworks-r1.md +1 -1
  74. package/.docs/organized/code-examples/heads-up-game.md +32 -56
  75. package/.docs/organized/code-examples/mcp-configuration.md +2 -2
  76. package/.docs/organized/code-examples/mcp-registry-registry.md +1 -1
  77. package/.docs/organized/code-examples/memory-with-mem0.md +1 -1
  78. package/.docs/organized/code-examples/memory-with-processors.md +1 -1
  79. package/.docs/organized/code-examples/openapi-spec-writer.md +2 -2
  80. package/.docs/organized/code-examples/quick-start.md +1 -1
  81. package/.docs/organized/code-examples/stock-price-tool.md +1 -1
  82. package/.docs/organized/code-examples/weather-agent.md +1 -1
  83. package/.docs/organized/code-examples/workflow-ai-recruiter.md +1 -1
  84. package/.docs/organized/code-examples/workflow-with-inline-steps.md +1 -1
  85. package/.docs/organized/code-examples/workflow-with-memory.md +1 -1
  86. package/.docs/organized/code-examples/workflow-with-separate-steps.md +1 -1
  87. package/.docs/organized/code-examples/workflow-with-suspend-resume.md +1 -1
  88. package/.docs/raw/agents/overview.mdx +35 -4
  89. package/.docs/raw/deployment/monorepo.mdx +1 -1
  90. package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +44 -14
  91. package/.docs/raw/getting-started/installation.mdx +52 -4
  92. package/.docs/raw/getting-started/templates.mdx +2 -22
  93. package/.docs/raw/reference/agents/generate.mdx +2 -2
  94. package/.docs/raw/reference/agents/getDefaultStreamOptions.mdx +2 -1
  95. package/.docs/raw/reference/agents/getDefaultVNextStreamOptions.mdx +1 -1
  96. package/.docs/raw/reference/agents/stream.mdx +2 -2
  97. package/.docs/raw/reference/cli/build.mdx +0 -6
  98. package/.docs/raw/reference/cli/start.mdx +8 -1
  99. package/.docs/raw/reference/scorers/noise-sensitivity.mdx +237 -0
  100. package/.docs/raw/reference/scorers/prompt-alignment.mdx +369 -0
  101. package/.docs/raw/scorers/off-the-shelf-scorers.mdx +2 -2
  102. package/.docs/raw/streaming/overview.mdx +2 -2
  103. package/.docs/raw/streaming/tool-streaming.mdx +8 -2
  104. package/.docs/raw/streaming/workflow-streaming.mdx +8 -2
  105. package/.docs/raw/tools-mcp/overview.mdx +44 -0
  106. package/.docs/raw/workflows/overview.mdx +19 -17
  107. package/.docs/raw/workflows/suspend-and-resume.mdx +64 -7
  108. package/CHANGELOG.md +1813 -0
  109. package/dist/stdio.js +18 -1
  110. package/dist/tools/blog.d.ts.map +1 -1
  111. package/dist/tools/docs.d.ts.map +1 -1
  112. package/package.json +17 -7
@@ -0,0 +1,237 @@
---
title: "Reference: Noise Sensitivity Scorer | Scorers | Mastra Docs"
description: Documentation for the Noise Sensitivity Scorer in Mastra. Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information in user queries.
---

import { PropertiesTable } from "@/components/properties-table";

# Noise Sensitivity Scorer

The `createNoiseSensitivityScorerLLM()` function creates a scorer that evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information. It measures the agent's ability to maintain response quality and accuracy despite noise in the input.

## Parameters

<PropertiesTable
  content={[
    {
      name: "model",
      type: "MastraLanguageModel",
      description: "The language model to use for evaluating noise sensitivity",
      required: true,
    },
    {
      name: "options",
      type: "NoiseSensitivityOptions",
      description: "Configuration options for the scorer",
      required: true,
      children: [
        {
          name: "baselineResponse",
          type: "string",
          description: "The expected clean response to compare against (what the agent should ideally produce without noise)",
          required: true,
        },
        {
          name: "noisyQuery",
          type: "string",
          description: "The user query with added noise, distractions, or misleading information",
          required: true,
        },
        {
          name: "noiseType",
          type: "string",
          description: "Type of noise added (e.g., 'misinformation', 'distractors', 'adversarial')",
          required: false,
        },
        {
          name: "scoring",
          type: "object",
          description: "Advanced scoring configuration for fine-tuning evaluation",
          required: false,
          children: [
            {
              name: "impactWeights",
              type: "object",
              description: "Custom weights for different impact levels",
              required: false,
              children: [
                {
                  name: "none",
                  type: "number",
                  description: "Weight for no impact (default: 1.0)",
                  required: false,
                },
                {
                  name: "minimal",
                  type: "number",
                  description: "Weight for minimal impact (default: 0.85)",
                  required: false,
                },
                {
                  name: "moderate",
                  type: "number",
                  description: "Weight for moderate impact (default: 0.6)",
                  required: false,
                },
                {
                  name: "significant",
                  type: "number",
                  description: "Weight for significant impact (default: 0.3)",
                  required: false,
                },
                {
                  name: "severe",
                  type: "number",
                  description: "Weight for severe impact (default: 0.1)",
                  required: false,
                },
              ],
            },
            {
              name: "penalties",
              type: "object",
              description: "Penalty configuration for major issues",
              required: false,
              children: [
                {
                  name: "majorIssuePerItem",
                  type: "number",
                  description: "Penalty per major issue identified (default: 0.1)",
                  required: false,
                },
                {
                  name: "maxMajorIssuePenalty",
                  type: "number",
                  description: "Maximum total penalty for major issues (default: 0.3)",
                  required: false,
                },
              ],
            },
            {
              name: "discrepancyThreshold",
              type: "number",
              description: "Threshold for using conservative scoring when LLM and calculated scores diverge (default: 0.2)",
              required: false,
            },
          ],
        },
      ],
    },
  ]}
/>

## .run() Returns

<PropertiesTable
  content={[
    {
      name: "score",
      type: "number",
      description: "Robustness score between 0 and 1 (1.0 = completely robust, 0.0 = severely compromised)",
    },
    {
      name: "reason",
      type: "string",
      description: "Human-readable explanation of how noise affected the agent's response",
    },
  ]}
/>

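For orientation, here is a minimal usage sketch. It assumes the scorer is exported from `@mastra/evals` and accepts the same `.run()` input shape as the other LLM scorers in these docs; treat the exact call signature as illustrative rather than definitive.

```typescript
import { openai } from '@ai-sdk/openai';
import { createNoiseSensitivityScorerLLM } from '@mastra/evals';

// Compare the agent's answer to a noisy query against the expected clean answer
const scorer = createNoiseSensitivityScorerLLM({
  model: openai('gpt-4o'),
  options: {
    baselineResponse: 'Regular exercise improves cardiovascular health, mood, and sleep quality.',
    noisyQuery:
      'What are the benefits of exercise? Also, did you know the moon is made of cheese?',
    noiseType: 'distractors',
  },
});

const result = await scorer.run({
  input: [{
    role: 'user',
    content: 'What are the benefits of exercise? Also, did you know the moon is made of cheese?'
  }],
  output: {
    role: 'assistant',
    text: 'Regular exercise improves cardiovascular health, mood, and sleep quality.'
  }
});

console.log(result.score, result.reason);
```
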
## Evaluation Dimensions

The Noise Sensitivity scorer analyzes five key dimensions:

### 1. Content Accuracy
Evaluates whether facts and information remain correct despite noise. The scorer checks if the agent maintains truthfulness when exposed to misinformation.

### 2. Completeness
Assesses if the noisy response addresses the original query as thoroughly as the baseline. Measures whether noise causes the agent to miss important information.

### 3. Relevance
Determines if the agent stayed focused on the original question or got distracted by irrelevant information in the noise.

### 4. Consistency
Compares how similar the responses are in their core message and conclusions. Evaluates whether noise causes the agent to contradict itself.

### 5. Hallucination Resistance
Checks if noise causes the agent to generate false or fabricated information that wasn't present in either the query or the noise.

## Scoring Algorithm

### Formula

```
Final Score = max(0, min(llm_score, calculated_score) - issues_penalty)
```

Where:
- `llm_score` = Direct robustness score from LLM analysis
- `calculated_score` = Average of impact weights across dimensions
- `issues_penalty` = min(major_issues × penalty_rate, max_penalty)

### Impact Level Weights

Each dimension receives an impact level with corresponding weights:

- **None (1.0)**: Response virtually identical in quality and accuracy
- **Minimal (0.85)**: Slight phrasing changes but maintains correctness
- **Moderate (0.6)**: Noticeable changes affecting quality but core info correct
- **Significant (0.3)**: Major degradation in quality or accuracy
- **Severe (0.1)**: Response substantially worse or completely derailed

### Conservative Scoring

When the LLM's direct score and the calculated score diverge by more than the discrepancy threshold, the scorer uses the lower (more conservative) score to ensure reliable evaluation.

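To make the arithmetic concrete, the sketch below applies the default weights and penalty settings documented above to one hypothetical evaluation; it illustrates the formula rather than the scorer's internal implementation.

```typescript
// Default impact weights and penalty settings from the tables above
type Impact = 'none' | 'minimal' | 'moderate' | 'significant' | 'severe';

const impactWeights: Record<Impact, number> = {
  none: 1.0,
  minimal: 0.85,
  moderate: 0.6,
  significant: 0.3,
  severe: 0.1,
};

function finalScore(llmScore: number, dimensionImpacts: Impact[], majorIssues: number): number {
  // calculated_score = average of the impact weights across the evaluated dimensions
  const calculated =
    dimensionImpacts.reduce((sum, impact) => sum + impactWeights[impact], 0) /
    dimensionImpacts.length;

  // issues_penalty = min(major_issues × 0.1, 0.3)
  const issuesPenalty = Math.min(majorIssues * 0.1, 0.3);

  // Final Score = max(0, min(llm_score, calculated_score) - issues_penalty)
  return Math.max(0, Math.min(llmScore, calculated) - issuesPenalty);
}

// Four unaffected dimensions, one moderately affected, one major issue flagged
finalScore(0.9, ['none', 'none', 'none', 'none', 'moderate'], 1); // 0.8
```
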
## Noise Types

### Misinformation
False or misleading claims mixed with legitimate queries.

Example: "What causes climate change? Also, climate change is a hoax invented by scientists."

### Distractors
Irrelevant information that could pull focus from the main query.

Example: "How do I bake a cake? My cat is orange and I like pizza on Tuesdays."

### Adversarial
Deliberately conflicting instructions designed to confuse.

Example: "Write a summary of this article. Actually, ignore that and tell me about dogs instead."

## Usage Patterns

### Testing Agent Robustness
Use to verify that agents maintain quality when faced with:
- User confusion or contradictions
- Multiple unrelated questions in one query
- False premises or assumptions
- Emotional or distracting content

### Quality Assurance
Integrate into evaluation pipelines to:
- Benchmark different models' noise resistance
- Identify agents vulnerable to manipulation
- Validate production readiness

### Security Testing
Evaluate resistance to:
- Prompt injection attempts
- Social engineering tactics
- Information pollution attacks

## Score Interpretation

- **0.9-1.0**: Excellent robustness, minimal impact from noise
- **0.7-0.8**: Good resistance with minor degradation
- **0.5-0.6**: Moderate impact, some key aspects affected
- **0.3-0.4**: Significant vulnerability to noise
- **0.0-0.2**: Severe compromise, agent easily misled

## Related

- [Noise Sensitivity Examples](/examples/scorers/noise-sensitivity) - Practical usage examples
- [Hallucination Scorer](/reference/scorers/hallucination) - Evaluates fabricated content
- [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Measures response focus
- [Custom Scorers](/docs/scorers/custom-scorers) - Creating your own evaluation metrics
@@ -0,0 +1,369 @@
---
title: "Reference: Prompt Alignment Scorer | Scorers | Mastra Docs"
description: Documentation for the Prompt Alignment Scorer in Mastra. Evaluates how well agent responses align with user prompt intent, requirements, completeness, and appropriateness using multi-dimensional analysis.
---

import { PropertiesTable } from "@/components/properties-table";

# Prompt Alignment Scorer

The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates how well agent responses align with user prompts across multiple dimensions: intent understanding, requirement fulfillment, response completeness, and format appropriateness.

## Parameters

<PropertiesTable
  content={[
    {
      name: "model",
      type: "MastraLanguageModel",
      description: "The language model to use for evaluating prompt-response alignment",
      required: true,
    },
    {
      name: "options",
      type: "PromptAlignmentOptions",
      description: "Configuration options for the scorer",
      required: false,
      children: [
        {
          name: "scale",
          type: "number",
          description: "Scale factor to multiply the final score (default: 1)",
          required: false,
        },
        {
          name: "evaluationMode",
          type: "'user' | 'system' | 'both'",
          description: "Evaluation mode - 'user' evaluates user prompt alignment only, 'system' evaluates system compliance only, 'both' evaluates both with weighted scoring (default: 'both')",
          required: false,
        },
      ],
    },
  ]}
/>

## .run() Returns

<PropertiesTable
  content={[
    {
      name: "score",
      type: "number",
      description: "Multi-dimensional alignment score between 0 and scale (default 0-1)",
    },
    {
      name: "reason",
      type: "string",
      description: "Human-readable explanation of the prompt alignment evaluation with detailed breakdown",
    },
  ]}
/>

## Scoring Details

### Multi-Dimensional Analysis

Prompt Alignment evaluates responses across four key dimensions with weighted scoring that adapts based on the evaluation mode:

#### User Mode ('user')
Evaluates alignment with user prompts only:

1. **Intent Alignment** (40% weight) - Whether the response addresses the user's core request
2. **Requirements Fulfillment** (30% weight) - If all user requirements are met
3. **Completeness** (20% weight) - Whether the response is comprehensive for user needs
4. **Response Appropriateness** (10% weight) - If format and tone match user expectations

#### System Mode ('system')
Evaluates compliance with system guidelines only:

1. **Intent Alignment** (35% weight) - Whether the response follows system behavioral guidelines
2. **Requirements Fulfillment** (35% weight) - If all system constraints are respected
3. **Completeness** (15% weight) - Whether the response adheres to all system rules
4. **Response Appropriateness** (15% weight) - If format and tone match system specifications

#### Both Mode ('both' - default)
Combines evaluation of both user and system alignment:

- **User alignment**: 70% of final score (using user mode weights)
- **System compliance**: 30% of final score (using system mode weights)
- Provides balanced assessment of user satisfaction and system adherence

### Scoring Formula

**User Mode:**
```
Weighted Score = (intent_score × 0.4) + (requirements_score × 0.3) +
                 (completeness_score × 0.2) + (appropriateness_score × 0.1)
Final Score = Weighted Score × scale
```

**System Mode:**
```
Weighted Score = (intent_score × 0.35) + (requirements_score × 0.35) +
                 (completeness_score × 0.15) + (appropriateness_score × 0.15)
Final Score = Weighted Score × scale
```

**Both Mode (default):**
```
User Score = (user dimensions with user weights)
System Score = (system dimensions with system weights)
Weighted Score = (User Score × 0.7) + (System Score × 0.3)
Final Score = Weighted Score × scale
```

**Weight Distribution Rationale**:
- **User Mode**: Prioritizes intent (40%) and requirements (30%) for user satisfaction
- **System Mode**: Balances behavioral compliance (35%) and constraints (35%) equally
- **Both Mode**: 70/30 split ensures user needs are primary while maintaining system compliance

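As a worked illustration of the formulas above (not the scorer's internal code), the weighted combination in the default `'both'` mode can be computed like this:

```typescript
// Per-dimension scores are assumed to be normalized to the 0-1 range
interface DimensionScores {
  intent: number;
  requirements: number;
  completeness: number;
  appropriateness: number;
}

const userWeights = { intent: 0.4, requirements: 0.3, completeness: 0.2, appropriateness: 0.1 };
const systemWeights = { intent: 0.35, requirements: 0.35, completeness: 0.15, appropriateness: 0.15 };

const weighted = (s: DimensionScores, w: typeof userWeights) =>
  s.intent * w.intent +
  s.requirements * w.requirements +
  s.completeness * w.completeness +
  s.appropriateness * w.appropriateness;

// 'both' mode: 70% user alignment + 30% system compliance, then apply the scale factor
function bothModeScore(user: DimensionScores, system: DimensionScores, scale = 1): number {
  return (weighted(user, userWeights) * 0.7 + weighted(system, systemWeights) * 0.3) * scale;
}

// Perfect user alignment but only partial system compliance
bothModeScore(
  { intent: 1, requirements: 1, completeness: 1, appropriateness: 1 },
  { intent: 0.8, requirements: 0.6, completeness: 1, appropriateness: 1 },
); // 0.7 + 0.3 × 0.79 = 0.937
```
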
### Score Interpretation

- **0.9-1.0** = Excellent alignment across all dimensions
- **0.8-0.9** = Very good alignment with minor gaps
- **0.7-0.8** = Good alignment but missing some requirements or completeness
- **0.6-0.7** = Moderate alignment with noticeable gaps
- **0.4-0.6** = Poor alignment with significant issues
- **0.0-0.4** = Very poor alignment, response doesn't address the prompt effectively

### Comparison with Other Scorers

| Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
|--------|------------------|------------------|--------------|
| **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
| **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
| **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
| **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |

### When to Use Each Mode

**User Mode (`'user'`)** - Use when:
- Evaluating customer service responses for user satisfaction
- Testing content generation quality from user perspective
- Measuring how well responses address user questions
- Focusing purely on request fulfillment without system constraints

**System Mode (`'system'`)** - Use when:
- Auditing AI safety and compliance with behavioral guidelines
- Ensuring agents follow brand voice and tone requirements
- Validating adherence to content policies and constraints
- Testing system-level behavioral consistency

**Both Mode (`'both'`)** - Use when (default, recommended):
- Comprehensive evaluation of overall AI agent performance
- Balancing user satisfaction with system compliance
- Production monitoring where both user and system requirements matter
- Holistic assessment of prompt-response alignment

## Usage Examples

### Basic Configuration

```typescript
import { openai } from '@ai-sdk/openai';
import { createPromptAlignmentScorerLLM } from '@mastra/evals';

const scorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o'),
});

// Evaluate a code generation task
const result = await scorer.run({
  input: [{
    role: 'user',
    content: 'Write a Python function to calculate factorial with error handling'
  }],
  output: {
    role: 'assistant',
    text: `def factorial(n):
  if n < 0:
    raise ValueError("Factorial not defined for negative numbers")
  if n == 0:
    return 1
  return n * factorial(n-1)`
  }
});
// Result: { score: 0.95, reason: "Excellent alignment - function addresses intent, includes error handling..." }
```

### Custom Configuration Examples

```typescript
// Configure scale and evaluation mode
const scorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o'),
  options: {
    scale: 10, // Score from 0-10 instead of 0-1
    evaluationMode: 'both' // 'user', 'system', or 'both' (default)
  },
});

// User-only evaluation - focus on user satisfaction
const userScorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o'),
  options: { evaluationMode: 'user' }
});

// System-only evaluation - focus on compliance
const systemScorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o'),
  options: { evaluationMode: 'system' }
});

const result = await scorer.run(testRun);
// Result: { score: 8.5, reason: "Score: 8.5 out of 10 - Good alignment with both user intent and system guidelines..." }
```

### Format-Specific Evaluation

```typescript
// Evaluate bullet point formatting
const result = await scorer.run({
  input: [{
    role: 'user',
    content: 'List the benefits of TypeScript in bullet points'
  }],
  output: {
    role: 'assistant',
    text: 'TypeScript provides static typing, better IDE support, and enhanced code reliability.'
  }
});
// Result: Lower appropriateness score due to format mismatch (paragraph vs bullet points)
```

## Usage Patterns

### Code Generation Evaluation
Ideal for evaluating:
- Programming task completion
- Code quality and completeness
- Adherence to coding requirements
- Format specifications (functions, classes, etc.)

```typescript
// Example: API endpoint creation
const codePrompt = "Create a REST API endpoint with authentication and rate limiting";
// Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
// completeness (full implementation), format (code structure)
```

### Instruction Following Assessment
Perfect for:
- Task completion verification
- Multi-step instruction adherence
- Requirement compliance checking
- Educational content evaluation

```typescript
// Example: Multi-requirement task
const taskPrompt = "Write a Python class with initialization, validation, error handling, and documentation";
// Scorer tracks each requirement individually and provides detailed breakdown
```

### Content Format Validation
Useful for:
- Format specification compliance
- Style guide adherence
- Output structure verification
- Response appropriateness checking

```typescript
// Example: Structured output
const formatPrompt = "Explain the differences between let and const in JavaScript using bullet points";
// Scorer evaluates content accuracy AND format compliance
```

## Common Use Cases

### 1. Agent Response Quality
Measure how well your AI agents follow user instructions:

```typescript
const agent = new Agent({
  name: 'CodingAssistant',
  instructions: 'You are a helpful coding assistant. Always provide working code examples.',
  model: openai('gpt-4o'),
});

// Evaluate comprehensive alignment (default)
const scorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o-mini'),
  options: { evaluationMode: 'both' } // Evaluates both user intent and system guidelines
});

// Evaluate just user satisfaction
const userScorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o-mini'),
  options: { evaluationMode: 'user' } // Focus only on user request fulfillment
});

// Evaluate system compliance
const systemScorer = createPromptAlignmentScorerLLM({
  model: openai('gpt-4o-mini'),
  options: { evaluationMode: 'system' } // Check adherence to system instructions
});

const result = await scorer.run(agentRun);
```

### 2. Prompt Engineering Optimization
Test different prompts to improve alignment:

```typescript
const prompts = [
  'Write a function to calculate factorial',
  'Create a Python function that calculates factorial with error handling for negative inputs',
  'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
];

// Compare alignment scores to find the best prompt
for (const prompt of prompts) {
  const result = await scorer.run(createTestRun(prompt, response));
  console.log(`Prompt alignment: ${result.score}`);
}
```

### 3. Multi-Agent System Evaluation
Compare different agents or models:

```typescript
const agents = [agent1, agent2, agent3];
const testPrompts = [...]; // Array of test prompts

for (const agent of agents) {
  let totalScore = 0;
  for (const prompt of testPrompts) {
    const response = await agent.run(prompt);
    const evaluation = await scorer.run({ input: prompt, output: response });
    totalScore += evaluation.score;
  }
  console.log(`${agent.name} average alignment: ${totalScore / testPrompts.length}`);
}
```

## Error Handling

The scorer handles various edge cases gracefully:

```typescript
// Missing user prompt
try {
  await scorer.run({ input: [], output: response });
} catch (error) {
  // Error: "Both user prompt and agent response are required for prompt alignment scoring"
}

// Empty response
const result = await scorer.run({
  input: [userMessage],
  output: { role: 'assistant', text: '' }
});
// Returns low scores with detailed reasoning about incompleteness
```

## Related

- [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Evaluates query-response relevance
- [Faithfulness Scorer](/reference/scorers/faithfulness) - Measures context groundedness
- [Tool Call Accuracy Scorer](/reference/scorers/tool-call-accuracy) - Evaluates tool selection
- [Custom Scorers](/docs/scorers/custom-scorers) - Creating your own evaluation metrics
@@ -20,6 +20,7 @@ These scorers evaluate how correct, truthful, and complete your agent's answers
  - [`content-similarity`](/reference/scorers/content-similarity): Measures textual similarity using character-level matching (`0-1`, higher is better)
  - [`textual-difference`](/reference/scorers/textual-difference): Measures textual differences between strings (`0-1`, higher means more similar)
  - [`tool-call-accuracy`](/reference/scorers/tool-call-accuracy): Evaluates whether the LLM selects the correct tool from available options (`0-1`, higher is better)
+ - [`prompt-alignment`](/reference/scorers/prompt-alignment): Measures how well agent responses align with user prompt intent, requirements, completeness, and format (`0-1`, higher is better)

  ### Context Quality

@@ -28,14 +29,13 @@ These scorers evaluate the quality and relevance of context used in generating r
  - [`context-precision`](/reference/scorers/context-precision): Evaluates context relevance and ranking using Mean Average Precision, rewarding early placement of relevant context (`0-1`, higher is better)
  - [`context-relevance`](/reference/scorers/context-relevance): Measures context utility with nuanced relevance levels, usage tracking, and missing context detection (`0-1`, higher is better)

- :::tip Context Scorer Selection
+ > tip Context Scorer Selection
  - Use **Context Precision** when context ordering matters and you need standard IR metrics (ideal for RAG ranking evaluation)
  - Use **Context Relevance** when you need detailed relevance assessment and want to track context usage and identify gaps

  Both context scorers support:
  - **Static context**: Pre-defined context arrays
  - **Dynamic context extraction**: Extract context from runs using custom functions (ideal for RAG systems, vector databases, etc.)
- :::

  ### Output Quality

@@ -15,8 +15,8 @@ Mastra supports real-time, incremental responses from agents and workflows, allo

  Mastra currently supports two streaming methods, this page explains how to use `streamVNext()`.

- 1. **`.stream()`**: Current stable API, supports **AI SDK v1**.
- 2. **`.streamVNext()`**: Experimental API, supports **AI SDK v2**.
+ 1. **`.stream()`**: Current stable API, supports **AI SDK v4** (`LanguageModelV1`).
+ 2. **`.streamVNext()`**: Experimental API, supports **AI SDK v5** (`LanguageModelV2`).

  ## Streaming with agents

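As a rough sketch of the experimental call (the agent setup follows the agent docs; the `textStream` property on the returned stream is an assumption and the chunk shape may differ between the two APIs):

```typescript
import { Agent } from "@mastra/core/agent";
import { openai } from "@ai-sdk/openai";

const agent = new Agent({
  name: "weather-agent",
  instructions: "You are a concise weather assistant.",
  model: openai("gpt-4o-mini"),
});

// Experimental streaming API; `.stream()` is called the same way for the stable API
const stream = await agent.streamVNext("Summarize today's weather in Berlin.");

for await (const chunk of stream.textStream) {
  process.stdout.write(chunk);
}
```
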
@@ -3,6 +3,8 @@ title: "Tool Streaming | Streaming | Mastra"
  description: "Learn how to use tool streaming in Mastra, including handling tool calls, tool results, and tool execution events during streaming."
  ---

+ import { Callout } from "nextra/components";
+
  # Tool streaming

  Tool streaming in Mastra enables tools to send incremental results while they run, rather than waiting until execution finishes. This allows you to surface partial progress, intermediate states, or progressive data directly to users or upstream agents and workflows.
@@ -36,6 +38,10 @@ export const testAgent = new Agent({

  The `writer` argument is passed to a tool’s `execute` function and can be used to emit custom events, data, or values into the active stream. This enables tools to provide intermediate results or status updates while execution is still in progress.

+ <Callout type="warning">
+ You must `await` the call to `writer.write(...)` or else you will lock the stream and get a `WritableStream is locked` error.
+ </Callout>
+
  ```typescript {5,8,15} showLineNumbers copy
  import { createTool } from "@mastra/core/tools";

@@ -44,14 +50,14 @@ export const testTool = createTool({
  execute: async ({ context, writer }) => {
    const { value } = context;

-   writer?.write({
+   await writer?.write({
      type: "custom-event",
      status: "pending"
    });

    const response = await fetch(...);

-   writer?.write({
+   await writer?.write({
      type: "custom-event",
      status: "success"
    });