@mastra/mcp-docs-server 0.13.17-alpha.3 → 0.13.17-alpha.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +15 -0
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +10 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +17 -17
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +8 -0
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +12 -12
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +20 -20
- package/.docs/organized/changelogs/%40mastra%2Fevals.md +10 -10
- package/.docs/organized/changelogs/%40mastra%2Ffirecrawl.md +10 -10
- package/.docs/organized/changelogs/%40mastra%2Fgithub.md +11 -11
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fmcp-registry-registry.md +11 -11
- package/.docs/organized/changelogs/%40mastra%2Fmcp.md +10 -10
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +14 -14
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +10 -10
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +21 -21
- package/.docs/organized/changelogs/%40mastra%2Fragie.md +10 -10
- package/.docs/organized/changelogs/%40mastra%2Fschema-compat.md +7 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fvoice-google-gemini-live.md +10 -0
- package/.docs/organized/changelogs/%40mastra%2Fvoice-openai-realtime.md +11 -11
- package/.docs/organized/changelogs/create-mastra.md +7 -7
- package/.docs/organized/changelogs/mastra.md +19 -19
- package/.docs/organized/code-examples/a2a.md +1 -1
- package/.docs/organized/code-examples/agent-network.md +1 -1
- package/.docs/organized/code-examples/agent.md +22 -1
- package/.docs/organized/code-examples/agui.md +1 -1
- package/.docs/organized/code-examples/ai-sdk-useChat.md +1 -1
- package/.docs/organized/code-examples/ai-sdk-v5.md +2 -2
- package/.docs/organized/code-examples/assistant-ui.md +3 -3
- package/.docs/organized/code-examples/bird-checker-with-express.md +1 -1
- package/.docs/organized/code-examples/bird-checker-with-nextjs-and-eval.md +1 -1
- package/.docs/organized/code-examples/bird-checker-with-nextjs.md +1 -1
- package/.docs/organized/code-examples/client-side-tools.md +1 -1
- package/.docs/organized/code-examples/crypto-chatbot.md +1 -1
- package/.docs/organized/code-examples/experimental-auth-weather-agent.md +1 -1
- package/.docs/organized/code-examples/fireworks-r1.md +1 -1
- package/.docs/organized/code-examples/mcp-configuration.md +2 -2
- package/.docs/organized/code-examples/mcp-registry-registry.md +1 -1
- package/.docs/organized/code-examples/memory-with-mem0.md +1 -1
- package/.docs/organized/code-examples/memory-with-processors.md +1 -1
- package/.docs/organized/code-examples/openapi-spec-writer.md +2 -2
- package/.docs/organized/code-examples/quick-start.md +1 -1
- package/.docs/organized/code-examples/stock-price-tool.md +1 -1
- package/.docs/organized/code-examples/weather-agent.md +1 -1
- package/.docs/organized/code-examples/workflow-ai-recruiter.md +1 -1
- package/.docs/organized/code-examples/workflow-with-inline-steps.md +1 -1
- package/.docs/organized/code-examples/workflow-with-memory.md +1 -1
- package/.docs/organized/code-examples/workflow-with-separate-steps.md +1 -1
- package/.docs/organized/code-examples/workflow-with-suspend-resume.md +1 -1
- package/.docs/raw/agents/overview.mdx +35 -4
- package/.docs/raw/deployment/monorepo.mdx +1 -1
- package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +44 -14
- package/.docs/raw/getting-started/installation.mdx +52 -4
- package/.docs/raw/getting-started/templates.mdx +2 -22
- package/.docs/raw/reference/agents/generate.mdx +2 -2
- package/.docs/raw/reference/agents/getDefaultStreamOptions.mdx +2 -1
- package/.docs/raw/reference/agents/getDefaultVNextStreamOptions.mdx +1 -1
- package/.docs/raw/reference/agents/stream.mdx +2 -2
- package/.docs/raw/reference/cli/build.mdx +0 -6
- package/.docs/raw/reference/cli/start.mdx +8 -1
- package/.docs/raw/reference/scorers/noise-sensitivity.mdx +237 -0
- package/.docs/raw/reference/scorers/prompt-alignment.mdx +369 -0
- package/.docs/raw/scorers/off-the-shelf-scorers.mdx +2 -2
- package/.docs/raw/streaming/overview.mdx +2 -2
- package/.docs/raw/streaming/tool-streaming.mdx +8 -2
- package/.docs/raw/streaming/workflow-streaming.mdx +8 -2
- package/.docs/raw/tools-mcp/overview.mdx +44 -0
- package/.docs/raw/workflows/overview.mdx +19 -17
- package/dist/stdio.js +5 -1
- package/dist/tools/docs.d.ts.map +1 -1
- package/package.json +6 -6
package/.docs/raw/reference/scorers/prompt-alignment.mdx
ADDED

@@ -0,0 +1,369 @@
+---
+title: "Reference: Prompt Alignment Scorer | Scorers | Mastra Docs"
+description: Documentation for the Prompt Alignment Scorer in Mastra. Evaluates how well agent responses align with user prompt intent, requirements, completeness, and appropriateness using multi-dimensional analysis.
+---
+
+import { PropertiesTable } from "@/components/properties-table";
+
+# Prompt Alignment Scorer
+
+The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates how well agent responses align with user prompts across multiple dimensions: intent understanding, requirement fulfillment, response completeness, and format appropriateness.
+
+## Parameters
+
+<PropertiesTable
+  content={[
+    {
+      name: "model",
+      type: "MastraLanguageModel",
+      description: "The language model to use for evaluating prompt-response alignment",
+      required: true,
+    },
+    {
+      name: "options",
+      type: "PromptAlignmentOptions",
+      description: "Configuration options for the scorer",
+      required: false,
+      children: [
+        {
+          name: "scale",
+          type: "number",
+          description: "Scale factor to multiply the final score (default: 1)",
+          required: false,
+        },
+        {
+          name: "evaluationMode",
+          type: "'user' | 'system' | 'both'",
+          description: "Evaluation mode - 'user' evaluates user prompt alignment only, 'system' evaluates system compliance only, 'both' evaluates both with weighted scoring (default: 'both')",
+          required: false,
+        },
+      ],
+    },
+  ]}
+/>
+
+## .run() Returns
+
+<PropertiesTable
+  content={[
+    {
+      name: "score",
+      type: "number",
+      description: "Multi-dimensional alignment score between 0 and scale (default 0-1)",
+    },
+    {
+      name: "reason",
+      type: "string",
+      description: "Human-readable explanation of the prompt alignment evaluation with detailed breakdown",
+    },
+  ]}
+/>
+
+## Scoring Details
+
+### Multi-Dimensional Analysis
+
+Prompt Alignment evaluates responses across four key dimensions with weighted scoring that adapts based on the evaluation mode:
+
+#### User Mode ('user')
+Evaluates alignment with user prompts only:
+
+1. **Intent Alignment** (40% weight) - Whether the response addresses the user's core request
+2. **Requirements Fulfillment** (30% weight) - If all user requirements are met
+3. **Completeness** (20% weight) - Whether the response is comprehensive for user needs
+4. **Response Appropriateness** (10% weight) - If format and tone match user expectations
+
+#### System Mode ('system')
+Evaluates compliance with system guidelines only:
+
+1. **Intent Alignment** (35% weight) - Whether the response follows system behavioral guidelines
+2. **Requirements Fulfillment** (35% weight) - If all system constraints are respected
+3. **Completeness** (15% weight) - Whether the response adheres to all system rules
+4. **Response Appropriateness** (15% weight) - If format and tone match system specifications
+
+#### Both Mode ('both' - default)
+Combines evaluation of both user and system alignment:
+
+- **User alignment**: 70% of final score (using user mode weights)
+- **System compliance**: 30% of final score (using system mode weights)
+- Provides balanced assessment of user satisfaction and system adherence
+
+### Scoring Formula
+
+**User Mode:**
+```
+Weighted Score = (intent_score × 0.4) + (requirements_score × 0.3) +
+                 (completeness_score × 0.2) + (appropriateness_score × 0.1)
+Final Score = Weighted Score × scale
+```
+
+**System Mode:**
+```
+Weighted Score = (intent_score × 0.35) + (requirements_score × 0.35) +
+                 (completeness_score × 0.15) + (appropriateness_score × 0.15)
+Final Score = Weighted Score × scale
+```
+
+**Both Mode (default):**
+```
+User Score = (user dimensions with user weights)
+System Score = (system dimensions with system weights)
+Weighted Score = (User Score × 0.7) + (System Score × 0.3)
+Final Score = Weighted Score × scale
+```
+
+**Weight Distribution Rationale**:
+- **User Mode**: Prioritizes intent (40%) and requirements (30%) for user satisfaction
+- **System Mode**: Balances behavioral compliance (35%) and constraints (35%) equally
+- **Both Mode**: 70/30 split ensures user needs are primary while maintaining system compliance
+
+### Score Interpretation
+
+- **0.9-1.0** = Excellent alignment across all dimensions
+- **0.8-0.9** = Very good alignment with minor gaps
+- **0.7-0.8** = Good alignment but missing some requirements or completeness
+- **0.6-0.7** = Moderate alignment with noticeable gaps
+- **0.4-0.6** = Poor alignment with significant issues
+- **0.0-0.4** = Very poor alignment, response doesn't address the prompt effectively
+
+### Comparison with Other Scorers
+
+| Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
+|--------|------------------|------------------|--------------|
+| **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
+| **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
+| **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
+| **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |
+
+### When to Use Each Mode
+
+**User Mode (`'user'`)** - Use when:
+- Evaluating customer service responses for user satisfaction
+- Testing content generation quality from user perspective
+- Measuring how well responses address user questions
+- Focusing purely on request fulfillment without system constraints
+
+**System Mode (`'system'`)** - Use when:
+- Auditing AI safety and compliance with behavioral guidelines
+- Ensuring agents follow brand voice and tone requirements
+- Validating adherence to content policies and constraints
+- Testing system-level behavioral consistency
+
+**Both Mode (`'both'`)** - Use when (default, recommended):
+- Comprehensive evaluation of overall AI agent performance
+- Balancing user satisfaction with system compliance
+- Production monitoring where both user and system requirements matter
+- Holistic assessment of prompt-response alignment
+
+## Usage Examples
+
+### Basic Configuration
+
+```typescript
+import { openai } from '@ai-sdk/openai';
+import { createPromptAlignmentScorerLLM } from '@mastra/evals';
+
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o'),
+});
+
+// Evaluate a code generation task
+const result = await scorer.run({
+  input: [{
+    role: 'user',
+    content: 'Write a Python function to calculate factorial with error handling'
+  }],
+  output: {
+    role: 'assistant',
+    text: `def factorial(n):
+    if n < 0:
+        raise ValueError("Factorial not defined for negative numbers")
+    if n == 0:
+        return 1
+    return n * factorial(n-1)`
+  }
+});
+// Result: { score: 0.95, reason: "Excellent alignment - function addresses intent, includes error handling..." }
+```
+
+### Custom Configuration Examples
+
+```typescript
+// Configure scale and evaluation mode
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o'),
+  options: {
+    scale: 10, // Score from 0-10 instead of 0-1
+    evaluationMode: 'both' // 'user', 'system', or 'both' (default)
+  },
+});
+
+// User-only evaluation - focus on user satisfaction
+const userScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o'),
+  options: { evaluationMode: 'user' }
+});
+
+// System-only evaluation - focus on compliance
+const systemScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o'),
+  options: { evaluationMode: 'system' }
+});
+
+const result = await scorer.run(testRun);
+// Result: { score: 8.5, reason: "Score: 8.5 out of 10 - Good alignment with both user intent and system guidelines..." }
+```
+
+### Format-Specific Evaluation
+
+```typescript
+// Evaluate bullet point formatting
+const result = await scorer.run({
+  input: [{
+    role: 'user',
+    content: 'List the benefits of TypeScript in bullet points'
+  }],
+  output: {
+    role: 'assistant',
+    text: 'TypeScript provides static typing, better IDE support, and enhanced code reliability.'
+  }
+});
+// Result: Lower appropriateness score due to format mismatch (paragraph vs bullet points)
+```
+
+## Usage Patterns
+
+### Code Generation Evaluation
+Ideal for evaluating:
+- Programming task completion
+- Code quality and completeness
+- Adherence to coding requirements
+- Format specifications (functions, classes, etc.)
+
+```typescript
+// Example: API endpoint creation
+const codePrompt = "Create a REST API endpoint with authentication and rate limiting";
+// Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
+// completeness (full implementation), format (code structure)
+```
+
+### Instruction Following Assessment
+Perfect for:
+- Task completion verification
+- Multi-step instruction adherence
+- Requirement compliance checking
+- Educational content evaluation
+
+```typescript
+// Example: Multi-requirement task
+const taskPrompt = "Write a Python class with initialization, validation, error handling, and documentation";
+// Scorer tracks each requirement individually and provides detailed breakdown
+```
+
+### Content Format Validation
+Useful for:
+- Format specification compliance
+- Style guide adherence
+- Output structure verification
+- Response appropriateness checking
+
+```typescript
+// Example: Structured output
+const formatPrompt = "Explain the differences between let and const in JavaScript using bullet points";
+// Scorer evaluates content accuracy AND format compliance
+```
+
+## Common Use Cases
+
+### 1. Agent Response Quality
+Measure how well your AI agents follow user instructions:
+
+```typescript
+const agent = new Agent({
+  name: 'CodingAssistant',
+  instructions: 'You are a helpful coding assistant. Always provide working code examples.',
+  model: openai('gpt-4o'),
+});
+
+// Evaluate comprehensive alignment (default)
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'both' } // Evaluates both user intent and system guidelines
+});
+
+// Evaluate just user satisfaction
+const userScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'user' } // Focus only on user request fulfillment
+});
+
+// Evaluate system compliance
+const systemScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'system' } // Check adherence to system instructions
+});
+
+const result = await scorer.run(agentRun);
+```
+
+### 2. Prompt Engineering Optimization
+Test different prompts to improve alignment:
+
+```typescript
+const prompts = [
+  'Write a function to calculate factorial',
+  'Create a Python function that calculates factorial with error handling for negative inputs',
+  'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
+];
+
+// Compare alignment scores to find the best prompt
+for (const prompt of prompts) {
+  const result = await scorer.run(createTestRun(prompt, response));
+  console.log(`Prompt alignment: ${result.score}`);
+}
+```
+
+### 3. Multi-Agent System Evaluation
+Compare different agents or models:
+
+```typescript
+const agents = [agent1, agent2, agent3];
+const testPrompts = [...]; // Array of test prompts
+
+for (const agent of agents) {
+  let totalScore = 0;
+  for (const prompt of testPrompts) {
+    const response = await agent.run(prompt);
+    const evaluation = await scorer.run({ input: prompt, output: response });
+    totalScore += evaluation.score;
+  }
+  console.log(`${agent.name} average alignment: ${totalScore / testPrompts.length}`);
+}
+```
+
+## Error Handling
+
+The scorer handles various edge cases gracefully:
+
+```typescript
+// Missing user prompt
+try {
+  await scorer.run({ input: [], output: response });
+} catch (error) {
+  // Error: "Both user prompt and agent response are required for prompt alignment scoring"
+}
+
+// Empty response
+const result = await scorer.run({
+  input: [userMessage],
+  output: { role: 'assistant', text: '' }
+});
+// Returns low scores with detailed reasoning about incompleteness
+```
+
+## Related
+
+- [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Evaluates query-response relevance
+- [Faithfulness Scorer](/reference/scorers/faithfulness) - Measures context groundedness
+- [Tool Call Accuracy Scorer](/reference/scorers/tool-call-accuracy) - Evaluates tool selection
+- [Custom Scorers](/docs/scorers/custom-scorers) - Creating your own evaluation metrics
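Aside: the weight arithmetic in the added page is easy to sanity-check. Below is a minimal TypeScript sketch of the documented `'both'`-mode formula; the `DimensionScores` type and helper names are illustrative only, not part of the `@mastra/evals` API.

```typescript
// Illustrative only: the weights come from the docs above, the names do not.
type DimensionScores = {
  intent: number;          // 0-1
  requirements: number;    // 0-1
  completeness: number;    // 0-1
  appropriateness: number; // 0-1
};

const USER_WEIGHTS   = { intent: 0.40, requirements: 0.30, completeness: 0.20, appropriateness: 0.10 };
const SYSTEM_WEIGHTS = { intent: 0.35, requirements: 0.35, completeness: 0.15, appropriateness: 0.15 };

// Weighted sum of the four documented dimensions.
function weighted(s: DimensionScores, w: typeof USER_WEIGHTS): number {
  return s.intent * w.intent + s.requirements * w.requirements +
         s.completeness * w.completeness + s.appropriateness * w.appropriateness;
}

// 'both' mode: 70% user alignment + 30% system compliance, then scaled.
function bothModeScore(user: DimensionScores, system: DimensionScores, scale = 1): number {
  return (0.7 * weighted(user, USER_WEIGHTS) + 0.3 * weighted(system, SYSTEM_WEIGHTS)) * scale;
}
```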
package/.docs/raw/scorers/off-the-shelf-scorers.mdx
CHANGED

@@ -20,6 +20,7 @@ These scorers evaluate how correct, truthful, and complete your agent's answers
 - [`content-similarity`](/reference/scorers/content-similarity): Measures textual similarity using character-level matching (`0-1`, higher is better)
 - [`textual-difference`](/reference/scorers/textual-difference): Measures textual differences between strings (`0-1`, higher means more similar)
 - [`tool-call-accuracy`](/reference/scorers/tool-call-accuracy): Evaluates whether the LLM selects the correct tool from available options (`0-1`, higher is better)
+- [`prompt-alignment`](/reference/scorers/prompt-alignment): Measures how well agent responses align with user prompt intent, requirements, completeness, and format (`0-1`, higher is better)
 
 ### Context Quality
 
@@ -28,14 +29,13 @@ These scorers evaluate the quality and relevance of context used in generating r
 - [`context-precision`](/reference/scorers/context-precision): Evaluates context relevance and ranking using Mean Average Precision, rewarding early placement of relevant context (`0-1`, higher is better)
 - [`context-relevance`](/reference/scorers/context-relevance): Measures context utility with nuanced relevance levels, usage tracking, and missing context detection (`0-1`, higher is better)
 
-
+> tip Context Scorer Selection
 - Use **Context Precision** when context ordering matters and you need standard IR metrics (ideal for RAG ranking evaluation)
 - Use **Context Relevance** when you need detailed relevance assessment and want to track context usage and identify gaps
 
 Both context scorers support:
 - **Static context**: Pre-defined context arrays
 - **Dynamic context extraction**: Extract context from runs using custom functions (ideal for RAG systems, vector databases, etc.)
-:::
 
 ### Output Quality
 
package/.docs/raw/streaming/overview.mdx
CHANGED

@@ -15,8 +15,8 @@ Mastra supports real-time, incremental responses from agents and workflows, allo
 
 Mastra currently supports two streaming methods, this page explains how to use `streamVNext()`.
 
-1. **`.stream()`**: Current stable API, supports **AI SDK
-2. **`.streamVNext()`**: Experimental API, supports **AI SDK
+1. **`.stream()`**: Current stable API, supports **AI SDK v4** (`LanguageModelV1`).
+2. **`.streamVNext()`**: Experimental API, supports **AI SDK v5** (`LanguageModelV2`).
 
 ## Streaming with agents
 
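For readers comparing the two methods this hunk disambiguates: a minimal sketch of calling the experimental `streamVNext()` with an AI SDK v5 model. The `agent.streamVNext()` call and the `fullStream` iteration surface follow Mastra's streaming docs rather than anything shown in this diff, so treat the exact shape as an assumption.

```typescript
import { openai } from "@ai-sdk/openai"; // AI SDK v5 provider (LanguageModelV2)
import { Agent } from "@mastra/core/agent";

const agent = new Agent({
  name: "test-agent",
  instructions: "You are a helpful assistant.",
  model: openai("gpt-4o-mini"),
});

// Experimental API; the stable equivalent is agent.stream() with a v4 model.
const stream = await agent.streamVNext("Say hello.");

// Assumed chunk-iteration surface: iterate the full event stream.
for await (const chunk of stream.fullStream) {
  console.log(chunk.type);
}
```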
@@ -3,6 +3,8 @@ title: "Tool Streaming | Streaming | Mastra"
|
|
|
3
3
|
description: "Learn how to use tool streaming in Mastra, including handling tool calls, tool results, and tool execution events during streaming."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
+
import { Callout } from "nextra/components";
|
|
7
|
+
|
|
6
8
|
# Tool streaming
|
|
7
9
|
|
|
8
10
|
Tool streaming in Mastra enables tools to send incremental results while they run, rather than waiting until execution finishes. This allows you to surface partial progress, intermediate states, or progressive data directly to users or upstream agents and workflows.
|
|
@@ -36,6 +38,10 @@ export const testAgent = new Agent({
|
|
|
36
38
|
|
|
37
39
|
The `writer` argument is passed to a tool’s `execute` function and can be used to emit custom events, data, or values into the active stream. This enables tools to provide intermediate results or status updates while execution is still in progress.
|
|
38
40
|
|
|
41
|
+
<Callout type="warning">
|
|
42
|
+
You must `await` the call to `writer.write(...)` or else you will lock the stream and get a `WritableStream is locked` error.
|
|
43
|
+
</Callout>
|
|
44
|
+
|
|
39
45
|
```typescript {5,8,15} showLineNumbers copy
|
|
40
46
|
import { createTool } from "@mastra/core/tools";
|
|
41
47
|
|
|
@@ -44,14 +50,14 @@ export const testTool = createTool({
|
|
|
44
50
|
execute: async ({ context, writer }) => {
|
|
45
51
|
const { value } = context;
|
|
46
52
|
|
|
47
|
-
|
|
53
|
+
await writer?.write({
|
|
48
54
|
type: "custom-event",
|
|
49
55
|
status: "pending"
|
|
50
56
|
});
|
|
51
57
|
|
|
52
58
|
const response = await fetch(...);
|
|
53
59
|
|
|
54
|
-
|
|
60
|
+
await writer?.write({
|
|
55
61
|
type: "custom-event",
|
|
56
62
|
status: "success"
|
|
57
63
|
});
|
|
@@ -3,6 +3,8 @@ title: "Workflow Streaming | Streaming | Mastra"
|
|
|
3
3
|
description: "Learn how to use workflow streaming in Mastra, including handling workflow execution events, step streaming, and workflow integration with agents and tools."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
+
import { Callout } from "nextra/components";
|
|
7
|
+
|
|
6
8
|
# Workflow streaming
|
|
7
9
|
|
|
8
10
|
Workflow streaming in Mastra enables workflows to send incremental results while they execute, rather than waiting until completion. This allows you to surface partial progress, intermediate states, or progressive data directly to users or upstream agents and workflows.
|
|
@@ -18,6 +20,10 @@ By combining writable workflow streams with agent streaming, you gain fine-grain
|
|
|
18
20
|
|
|
19
21
|
The `writer` argument is passed to a workflow step's `execute` function and can be used to emit custom events, data, or values into the active stream. This enables workflow steps to provide intermediate results or status updates while execution is still in progress.
|
|
20
22
|
|
|
23
|
+
<Callout type="warning">
|
|
24
|
+
You must `await` the call to `writer.write(...)` or else you will lock the stream and get a `WritableStream is locked` error.
|
|
25
|
+
</Callout>
|
|
26
|
+
|
|
21
27
|
```typescript {5,8,15} showLineNumbers copy
|
|
22
28
|
import { createStep } from "@mastra/core/workflows";
|
|
23
29
|
|
|
@@ -26,14 +32,14 @@ export const testStep = createStep({
|
|
|
26
32
|
execute: async ({ inputData, writer }) => {
|
|
27
33
|
const { value } = inputData;
|
|
28
34
|
|
|
29
|
-
writer?.write({
|
|
35
|
+
await writer?.write({
|
|
30
36
|
type: "custom-event",
|
|
31
37
|
status: "pending"
|
|
32
38
|
});
|
|
33
39
|
|
|
34
40
|
const response = await fetch(...);
|
|
35
41
|
|
|
36
|
-
|
|
42
|
+
await writer?.write({
|
|
37
43
|
type: "custom-event",
|
|
38
44
|
status: "success"
|
|
39
45
|
});
|
|
@@ -3,6 +3,8 @@ title: "Tools Overview | Tools & MCP | Mastra Docs"
|
|
|
3
3
|
description: Understand what tools are in Mastra, how to add them to agents, and best practices for designing effective tools.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
+
import { Steps } from "nextra/components";
|
|
7
|
+
|
|
6
8
|
# Tools Overview
|
|
7
9
|
|
|
8
10
|
Tools are functions that agents can execute to perform specific tasks or access external information. They extend an agent's capabilities beyond simple text generation, allowing interaction with APIs, databases, or other systems.
|
|
@@ -65,3 +67,45 @@ Some providers that we include this layer for:
|
|
|
65
67
|
- **DeepSeek & Meta:** Apply similar compatibility logic to ensure schema alignment and tool usability.
|
|
66
68
|
|
|
67
69
|
This approach makes tool usage more reliable and model-agnostic for both custom and MCP tools.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Testing tools locally
|
|
73
|
+
There are two ways to run and test tools.
|
|
74
|
+
|
|
75
|
+
<Steps>
|
|
76
|
+
|
|
77
|
+
### Mastra Playground
|
|
78
|
+
|
|
79
|
+
With the Mastra Dev Server running you can test a tool from the Mastra Playground by visiting [http://localhost:4111/tools](http://localhost:4111/tools) in your browser.
|
|
80
|
+
|
|
81
|
+
> For more information, see the [Local Dev Playground](/docs/server-db/local-dev-playground) documentation.
|
|
82
|
+
|
|
83
|
+
### Command line
|
|
84
|
+
|
|
85
|
+
Invoke a tool using `.execute()`.
|
|
86
|
+
|
|
87
|
+
```typescript filename="src/test-tool.ts" showLineNumbers copy
|
|
88
|
+
import { RuntimeContext } from "@mastra/core/runtime-context";
|
|
89
|
+
import { testTool } from "./mastra/tools/test-tool";
|
|
90
|
+
|
|
91
|
+
const runtimeContext = new RuntimeContext();
|
|
92
|
+
|
|
93
|
+
const result = await testTool.execute({
|
|
94
|
+
context: {
|
|
95
|
+
value: "foo"
|
|
96
|
+
},
|
|
97
|
+
runtimeContext
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
console.log(result);
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
> See [createTool()](../../reference/tools/create-tool.mdx) for more information.
|
|
104
|
+
|
|
105
|
+
To test this tool, run the following:
|
|
106
|
+
|
|
107
|
+
```bash copy
|
|
108
|
+
npx tsx src/test-tool.ts
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
</Steps>
|
|
package/.docs/raw/workflows/overview.mdx
CHANGED

@@ -130,7 +130,7 @@ export const testWorkflow = createWorkflow({
   .commit();
 ```
 
-
+## Register workflow
 
 Register a workflow using `workflows` in the main Mastra instance:
 
@@ -154,20 +154,24 @@ export const mastra = new Mastra({
 });
 ```
 
-
+## Testing workflows locally
 There are two ways to run and test workflows.
 
 <Steps>
 
-
+### Mastra Playground
 
 With the Mastra Dev Server running you can run the workflow from the Mastra Playground by visiting [http://localhost:4111/workflows](http://localhost:4111/workflows) in your browser.
 
-
+> For more information, see the [Local Dev Playground](/docs/server-db/local-dev-playground) documentation.
 
-
+### Command line
+
+Create a workflow run instance using `createRunAsync` and `start`:
 
 ```typescript {3,5} filename="src/test-workflow.ts" showLineNumbers copy
+import "dotenv/config";
+
 import { mastra } from "./mastra";
 
 const run = await mastra.getWorkflow("testWorkflow").createRunAsync();
@@ -178,15 +182,13 @@ const result = await run.start({
   }
 });
 
-
-console.log(JSON.stringify(result, null, 2));
+console.log(result);
 
-// Get the workflow output value
 if (result.status === 'success') {
-  console.log(
+  console.log(result.result.output);
 }
 ```
-> see [createRunAsync](
+> see [createRunAsync](../../reference/workflows/create-run.mdx) and [start](../../reference/workflows/run-methods/start.mdx) for more information.
 
 To trigger this workflow, run the following:
 
@@ -196,11 +198,11 @@ npx tsx src/test-workflow.ts
 
 </Steps>
 
-
+### Run workflow results
 
 The result of running a workflow using either `start()` or `resume()` will look like one of the following, depending on the outcome.
 
-
+#### Status success
 
 ```json
 {
@@ -224,7 +226,7 @@ The result of running a workflow using either `start()` or `resume()` will look
 - **result**: Includes the final output of the workflow, typed according to the `outputSchema`
 
 
-
+#### Status suspended
 
 ```json
 {
@@ -246,7 +248,7 @@ The result of running a workflow using either `start()` or `resume()` will look
 
 - **suspended**: An optional array listing any steps currently awaiting input before continuing
 
-
+#### Status failed
 
 ```json
 {
@@ -264,7 +266,7 @@ The result of running a workflow using either `start()` or `resume()` will look
 ```
 - **error**: An optional field that includes the error message if the workflow fails
 
-
+## Stream workflow
 
 Similar to the run method shown above, workflows can also be streamed:
 
@@ -286,7 +288,7 @@ for await (const chunk of result.stream) {
 
 > See [stream](/reference/workflows/run-methods/stream) and [messages](/reference/workflows/run-methods/stream#messages) for more information.
 
-
+## Watch Workflow
 
 A workflow can also be watched, allowing you to inspect each event that is emitted.
 
@@ -308,7 +310,7 @@ const result = await run.start({
 
 > See [watch](/reference/workflows/run-methods/watch) for more information.
 
-##
+## Related
 
 - The [Workflow Guide](../../guides/guide/ai-recruiter.mdx) in the Guides section is a tutorial that covers the main concepts.
 - [Parallel Steps workflow example](../../examples/workflows/parallel-steps.mdx)
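The "Run workflow results" additions above document three mutually exclusive result shapes. A minimal sketch of branching on all three, reusing the `testWorkflow` and `run.start()` calls from the diff; the `inputData` payload is assumed for illustration, and the field access follows the documented shapes rather than code in this package.

```typescript
import { mastra } from "./mastra";

const run = await mastra.getWorkflow("testWorkflow").createRunAsync();
const result = await run.start({ inputData: { value: "foo" } }); // payload is illustrative

// Branch on the three documented statuses.
if (result.status === "success") {
  console.log(result.result);      // final output, typed by the workflow's outputSchema
} else if (result.status === "suspended") {
  console.log(result.suspended);   // steps awaiting input before the run can resume()
} else if (result.status === "failed") {
  console.error(result.error);     // error message for the failed run
}
```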
package/dist/stdio.js
CHANGED

@@ -798,7 +798,11 @@ async function listDirContents(dirPath) {
   }
 }
 async function readMdxContent(docPath, queryKeywords) {
-  const fullPath = path3__default.join(docsBaseDir, docPath);
+  const fullPath = path3__default.resolve(path3__default.join(docsBaseDir, docPath));
+  if (!fullPath.startsWith(path3__default.resolve(docsBaseDir))) {
+    void logger.error(`Path traversal attempt detected`);
+    return { found: false };
+  }
   void logger.debug(`Reading MDX content from: ${fullPath}`);
   try {
     const stats = await fs3.stat(fullPath);
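The `stdio.js` change is a standard path-containment guard: resolve the joined path, then confirm it still lives under the docs base directory before reading. A self-contained sketch of the same pattern (names and paths here are illustrative; the extra `path.sep` comparison is a common hardening step, not part of this package's fix):

```typescript
import path from "node:path";

// Resolve the requested doc path against a base directory and reject any
// result that escapes it (e.g. via "../" segments).
function resolveInside(baseDir: string, requested: string): string | null {
  const base = path.resolve(baseDir);
  const full = path.resolve(path.join(base, requested));
  // Comparing against base + path.sep prevents a sibling directory like
  // "/srv/docs-evil" from slipping past a bare startsWith("/srv/docs") check.
  return full === base || full.startsWith(base + path.sep) ? full : null;
}

console.log(resolveInside("/srv/docs", "guide.mdx"));        // "/srv/docs/guide.mdx"
console.log(resolveInside("/srv/docs", "../../etc/passwd")); // null
```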
package/dist/tools/docs.d.ts.map
CHANGED

@@ -1 +1 @@
-{"version":3,"file":"docs.d.ts","sourceRoot":"","sources":["../../src/tools/docs.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;
+{"version":3,"file":"docs.d.ts","sourceRoot":"","sources":["../../src/tools/docs.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAgKxB,eAAO,MAAM,eAAe;;;;;;;;;EAW1B,CAAC;AAEH,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AAExD,eAAO,MAAM,QAAQ;;;;;;;;;;;;;oBAkBG,SAAS;CAiDhC,CAAC"}