@mastra/mcp-docs-server 0.13.30-alpha.0 → 0.13.30-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +9 -9
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +15 -0
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +35 -35
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +17 -17
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +15 -15
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +16 -16
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +31 -31
- package/.docs/organized/changelogs/%40mastra%2Freact.md +20 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +15 -15
- package/.docs/organized/changelogs/create-mastra.md +19 -19
- package/.docs/organized/changelogs/mastra.md +27 -27
- package/.docs/organized/code-examples/agent.md +0 -1
- package/.docs/organized/code-examples/agui.md +2 -2
- package/.docs/organized/code-examples/client-side-tools.md +2 -2
- package/.docs/raw/agents/adding-voice.mdx +118 -25
- package/.docs/raw/agents/agent-memory.mdx +73 -89
- package/.docs/raw/agents/guardrails.mdx +1 -1
- package/.docs/raw/agents/overview.mdx +39 -7
- package/.docs/raw/agents/using-tools.mdx +95 -0
- package/.docs/raw/deployment/overview.mdx +9 -11
- package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +1 -1
- package/.docs/raw/frameworks/servers/express.mdx +2 -2
- package/.docs/raw/getting-started/installation.mdx +34 -85
- package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
- package/.docs/raw/index.mdx +49 -14
- package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
- package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
- package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
- package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
- package/.docs/raw/reference/scorers/bias.mdx +107 -6
- package/.docs/raw/reference/scorers/completeness.mdx +131 -8
- package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
- package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
- package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
- package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
- package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
- package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
- package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
- package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
- package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
- package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
- package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
- package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
- package/.docs/raw/reference/workflows/workflow.mdx +33 -0
- package/.docs/raw/scorers/custom-scorers.mdx +244 -3
- package/.docs/raw/scorers/overview.mdx +8 -38
- package/.docs/raw/server-db/middleware.mdx +5 -2
- package/.docs/raw/server-db/runtime-context.mdx +178 -0
- package/.docs/raw/streaming/workflow-streaming.mdx +5 -1
- package/.docs/raw/tools-mcp/overview.mdx +25 -7
- package/.docs/raw/workflows/overview.mdx +28 -1
- package/CHANGELOG.md +14 -0
- package/package.json +4 -4
- package/.docs/raw/agents/runtime-context.mdx +0 -106
- package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
- package/.docs/raw/getting-started/model-providers.mdx +0 -63
- package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
- /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
- /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
- /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
package/.docs/raw/reference/scorers/prompt-alignment.mdx

@@ -59,8 +59,60 @@ The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates
   ]}
 />
 
+`.run()` returns a result in the following shape:
+
+```typescript
+{
+  runId: string,
+  score: number,
+  reason: string,
+  analyzeStepResult: {
+    intentAlignment: {
+      score: number,
+      primaryIntent: string,
+      isAddressed: boolean,
+      reasoning: string
+    },
+    requirementsFulfillment: {
+      requirements: Array<{
+        requirement: string,
+        isFulfilled: boolean,
+        reasoning: string
+      }>,
+      overallScore: number
+    },
+    completeness: {
+      score: number,
+      missingElements: string[],
+      reasoning: string
+    },
+    responseAppropriateness: {
+      score: number,
+      formatAlignment: boolean,
+      toneAlignment: boolean,
+      reasoning: string
+    },
+    overallAssessment: string
+  }
+}
+```
+
 ## Scoring Details
 
+### Scorer configuration
+
+You can customize the Prompt Alignment Scorer by adjusting the scale parameter and evaluation mode to fit your scoring needs.
+
+```typescript showLineNumbers copy
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini"),
+  options: {
+    scale: 10, // Score from 0-10 instead of 0-1
+    evaluationMode: 'both' // 'user', 'system', or 'both' (default)
+  }
+});
+```
+
 ### Multi-Dimensional Analysis
 
 Prompt Alignment evaluates responses across four key dimensions with weighted scoring that adapts based on the evaluation mode:
@@ -126,15 +178,6 @@ Final Score = Weighted Score × scale
 - **0.4-0.6** = Poor alignment with significant issues
 - **0.0-0.4** = Very poor alignment, response doesn't address the prompt effectively
 
-### Comparison with Other Scorers
-
-| Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
-|--------|------------------|------------------|--------------|
-| **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
-| **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
-| **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
-| **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |
-
 ### When to Use Each Mode
 
 **User Mode (`'user'`)** - Use when:
@@ -155,7 +198,115 @@ Final Score = Weighted Score × scale
 - Production monitoring where both user and system requirements matter
 - Holistic assessment of prompt-response alignment
 
-##
+## Common Use Cases
+
+### Code Generation Evaluation
+Ideal for evaluating:
+- Programming task completion
+- Code quality and completeness
+- Adherence to coding requirements
+- Format specifications (functions, classes, etc.)
+
+```typescript
+// Example: API endpoint creation
+const codePrompt = "Create a REST API endpoint with authentication and rate limiting";
+// Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
+// completeness (full implementation), format (code structure)
+```
+
+### Instruction Following Assessment
+Perfect for:
+- Task completion verification
+- Multi-step instruction adherence
+- Requirement compliance checking
+- Educational content evaluation
+
+```typescript
+// Example: Multi-requirement task
+const taskPrompt = "Write a Python class with initialization, validation, error handling, and documentation";
+// Scorer tracks each requirement individually and provides detailed breakdown
+```
+
+### Content Format Validation
+Useful for:
+- Format specification compliance
+- Style guide adherence
+- Output structure verification
+- Response appropriateness checking
+
+```typescript
+// Example: Structured output
+const formatPrompt = "Explain the differences between let and const in JavaScript using bullet points";
+// Scorer evaluates content accuracy AND format compliance
+```
+
+### Agent Response Quality
+Measure how well your AI agents follow user instructions:
+
+```typescript
+const agent = new Agent({
+  name: 'CodingAssistant',
+  instructions: 'You are a helpful coding assistant. Always provide working code examples.',
+  model: openai('gpt-4o'),
+});
+
+// Evaluate comprehensive alignment (default)
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'both' } // Evaluates both user intent and system guidelines
+});
+
+// Evaluate just user satisfaction
+const userScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'user' } // Focus only on user request fulfillment
+});
+
+// Evaluate system compliance
+const systemScorer = createPromptAlignmentScorerLLM({
+  model: openai('gpt-4o-mini'),
+  options: { evaluationMode: 'system' } // Check adherence to system instructions
+});
+
+const result = await scorer.run(agentRun);
+```
+
+### Prompt Engineering Optimization
+Test different prompts to improve alignment:
+
+```typescript
+const prompts = [
+  'Write a function to calculate factorial',
+  'Create a Python function that calculates factorial with error handling for negative inputs',
+  'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
+];
+
+// Compare alignment scores to find the best prompt
+for (const prompt of prompts) {
+  const result = await scorer.run(createTestRun(prompt, response));
+  console.log(`Prompt alignment: ${result.score}`);
+}
+```
+
+### Multi-Agent System Evaluation
+Compare different agents or models:
+
+```typescript
+const agents = [agent1, agent2, agent3];
+const testPrompts = [...]; // Array of test prompts
+
+for (const agent of agents) {
+  let totalScore = 0;
+  for (const prompt of testPrompts) {
+    const response = await agent.run(prompt);
+    const evaluation = await scorer.run({ input: prompt, output: response });
+    totalScore += evaluation.score;
+  }
+  console.log(`${agent.name} average alignment: ${totalScore / testPrompts.length}`);
+}
+```
+
+## Examples
 
 ### Basic Configuration
 
@@ -231,136 +382,234 @@ const result = await scorer.run({
 // Result: Lower appropriateness score due to format mismatch (paragraph vs bullet points)
 ```
 
-
+### Excellent alignment example
 
-
-Ideal for evaluating:
-- Programming task completion
-- Code quality and completeness
-- Adherence to coding requirements
-- Format specifications (functions, classes, etc.)
+In this example, the response fully addresses the user's prompt with all requirements met.
 
-```typescript
-
-
-// Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
-// completeness (full implementation), format (code structure)
-```
+```typescript filename="src/example-excellent-prompt-alignment.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";
 
-
-
-
-- Multi-step instruction adherence
-- Requirement compliance checking
-- Educational content evaluation
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini")
+});
 
-
-
-
-
+const inputMessages = [{
+  role: 'user',
+  content: "Write a Python function to calculate factorial with error handling for negative numbers"
+}];
+
+const outputMessage = {
+  text: `def factorial(n):
+    """Calculate factorial of a number."""
+    if n < 0:
+        raise ValueError("Factorial not defined for negative numbers")
+    if n == 0 or n == 1:
+        return 1
+    return n * factorial(n - 1)`
+};
+
+const result = await scorer.run({
+  input: inputMessages,
+  output: outputMessage,
+});
+
+console.log(result);
 ```
 
-###
-
-
-- Style guide adherence
-- Output structure verification
-- Response appropriateness checking
+### Excellent alignment output
+
+The output receives a high score because it perfectly addresses the intent, fulfills all requirements, and uses appropriate format.
 
 ```typescript
-
-
-
+{
+  score: 0.95,
+  reason: 'The score is 0.95 because the response perfectly addresses the primary intent of creating a factorial function and fulfills all requirements including Python implementation, error handling for negative numbers, and proper documentation. The code format is appropriate and the implementation is complete.'
+}
 ```
 
-
+### Partial alignment example
 
-
-Measure how well your AI agents follow user instructions:
+In this example, the response addresses the core intent but misses some requirements or has format issues.
 
-```typescript
-
-
-
-
+```typescript filename="src/example-partial-prompt-alignment.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";
+
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini")
 });
 
-
-
-
-
+const inputMessages = [{
+  role: 'user',
+  content: "List the benefits of TypeScript in bullet points"
+}];
+
+const outputMessage = {
+  text: "TypeScript provides static typing, better IDE support, and enhanced code reliability through compile-time error checking."
+};
+
+const result = await scorer.run({
+  input: inputMessages,
+  output: outputMessage,
 });
 
-
-
-
-
+console.log(result);
+```
+
+#### Partial alignment output
+
+The output receives a lower score because while the content is accurate, it doesn't follow the requested format (bullet points).
+
+```typescript
+{
+  score: 0.75,
+  reason: 'The score is 0.75 because the response addresses the intent of explaining TypeScript benefits and provides accurate information, but fails to use the requested bullet point format, resulting in lower appropriateness scoring.'
+}
+```
+
+### Poor alignment example
+
+In this example, the response fails to address the user's specific requirements.
+
+```typescript filename="src/example-poor-prompt-alignment.ts" showLineNumbers copy
+import { openai } from "@ai-sdk/openai";
+import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";
+
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini")
 });
 
-
-
-
-
+const inputMessages = [{
+  role: 'user',
+  content: "Write a Python class with initialization, validation, error handling, and documentation"
+}];
+
+const outputMessage = {
+  text: `class Example:
+    def __init__(self, value):
+        self.value = value`
+};
+
+const result = await scorer.run({
+  input: inputMessages,
+  output: outputMessage,
 });
 
-
+console.log(result);
 ```
 
-###
-Test different prompts to improve alignment:
+### Poor alignment output
 
-
-const prompts = [
-  'Write a function to calculate factorial',
-  'Create a Python function that calculates factorial with error handling for negative inputs',
-  'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
-];
+The output receives a low score because it only partially fulfills the requirements, missing validation, error handling, and documentation.
 
-
-
-
-
+```typescript
+{
+  score: 0.35,
+  reason: 'The score is 0.35 because while the response addresses the basic intent of creating a Python class with initialization, it fails to include validation, error handling, and documentation as specifically requested, resulting in incomplete requirement fulfillment.'
 }
 ```
 
-###
-Compare different agents or models:
+### Evaluation Mode Examples
 
-
-const agents = [agent1, agent2, agent3];
-const testPrompts = [...]; // Array of test prompts
+#### User Mode - Focus on User Prompt Only
 
-
-
-
-
-
-
+Evaluates how well the response addresses the user's request, ignoring system instructions:
+
+```typescript filename="src/example-user-mode.ts" showLineNumbers copy
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini"),
+  options: { evaluationMode: 'user' }
+});
+
+const result = await scorer.run({
+  input: {
+    inputMessages: [{
+      role: 'user',
+      content: "Explain recursion with an example"
+    }],
+    systemMessages: [{
+      role: 'system',
+      content: "Always provide code examples in Python"
+    }]
+  },
+  output: {
+    text: "Recursion is when a function calls itself. For example: factorial(5) = 5 * factorial(4)"
   }
-
-
+});
+// Scores high for addressing user request, even without Python code
 ```
 
-
+#### System Mode - Focus on System Guidelines Only
 
-
+Evaluates compliance with system behavioral guidelines and constraints:
 
-```typescript
-
-
-
-}
-// Error: "Both user prompt and agent response are required for prompt alignment scoring"
-}
+```typescript filename="src/example-system-mode.ts" showLineNumbers copy
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini"),
+  options: { evaluationMode: 'system' }
+});
 
-
-
-
-
+const result = await scorer.run({
+  input: {
+    systemMessages: [{
+      role: 'system',
+      content: "You are a helpful assistant. Always be polite, concise, and provide examples."
+    }],
+    inputMessages: [{
+      role: 'user',
+      content: "What is machine learning?"
+    }]
+  },
+  output: {
+    text: "Machine learning is a subset of AI where computers learn from data. For example, spam filters learn to identify unwanted emails by analyzing patterns in previously marked spam."
+  }
 });
-//
+// Evaluates politeness, conciseness, and example provision
 ```
 
+#### Both Mode - Combined Evaluation (Default)
+
+Evaluates both user intent fulfillment and system compliance with weighted scoring (70% user, 30% system):
+
+```typescript filename="src/example-both-mode.ts" showLineNumbers copy
+const scorer = createPromptAlignmentScorerLLM({
+  model: openai("gpt-4o-mini"),
+  options: { evaluationMode: 'both' } // This is the default
+});
+
+const result = await scorer.run({
+  input: {
+    systemMessages: [{
+      role: 'system',
+      content: "Always provide code examples when explaining programming concepts"
+    }],
+    inputMessages: [{
+      role: 'user',
+      content: "Explain how to reverse a string"
+    }]
+  },
+  output: {
+    text: `To reverse a string, you can iterate through it backwards. Here's an example in Python:
+
+def reverse_string(s):
+    return s[::-1]
+
+# Usage: reverse_string("hello") returns "olleh"`
+  }
+});
+// High score for both addressing the user's request AND following system guidelines
+```
+
+## Comparison with Other Scorers
+
+| Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
+|--------|------------------|------------------|--------------|
+| **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
+| **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
+| **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
+| **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |
+
 ## Related
 
 - [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Evaluates query-response relevance
package/.docs/raw/reference/scorers/textual-difference.mdx

@@ -37,6 +37,21 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
   ]}
 />
 
+`.run()` returns a result in the following shape:
+
+```typescript
+{
+  runId: string,
+  analyzeStepResult: {
+    confidence: number,
+    ratio: number,
+    changes: number,
+    lengthDiff: number
+  },
+  score: number
+}
+```
+
 ## Scoring Details
 
 The scorer calculates several measures:
@@ -61,13 +76,126 @@ Final score: `(similarity_ratio * confidence) * scale`
 
 ### Score interpretation
 
-
+A textual difference score between 0 and 1:
+
+- **1.0**: Identical texts – no differences detected.
+- **0.7–0.9**: Minor differences – few changes needed.
+- **0.4–0.6**: Moderate differences – noticeable changes required.
+- **0.1–0.3**: Major differences – extensive changes needed.
+- **0.0**: Completely different texts.
+
+## Examples
+
+### No differences example
+
+In this example, the texts are exactly the same. The scorer identifies complete similarity with a perfect score and no detected changes.
+
+```typescript filename="src/example-no-differences.ts" showLineNumbers copy
+import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createTextualDifferenceScorer();
+
+const input = 'The quick brown fox jumps over the lazy dog';
+const output = 'The quick brown fox jumps over the lazy dog';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### No differences output
+
+The scorer returns a high score, indicating the texts are identical. The detailed info confirms zero changes and no length difference.
+
+```typescript
+{
+  score: 1,
+  analyzeStepResult: {
+    confidence: 1,
+    ratio: 1,
+    changes: 0,
+    lengthDiff: 0,
+  },
+}
+```
+
+### Minor differences example
+
+In this example, the texts have small variations. The scorer detects these minor differences and returns a moderate similarity score.
+
+```typescript filename="src/example-minor-differences.ts" showLineNumbers copy
+import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createTextualDifferenceScorer();
+
+const input = 'Hello world! How are you?';
+const output = 'Hello there! How is it going?';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### Minor differences output
+
+The scorer returns a moderate score reflecting the small variations between the texts. The detailed info includes the number of changes and length difference observed.
+
+```typescript
+{
+  score: 0.5925925925925926,
+  analyzeStepResult: {
+    confidence: 0.8620689655172413,
+    ratio: 0.5925925925925926,
+    changes: 5,
+    lengthDiff: 0.13793103448275862
+  }
+}
+```
+
+### Major differences example
+
+In this example, the texts differ significantly. The scorer detects extensive changes and returns a low similarity score.
+
+```typescript filename="src/example-major-differences.ts" showLineNumbers copy
+import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+const scorer = createTextualDifferenceScorer();
+
+const input = 'Python is a high-level programming language';
+const output = 'JavaScript is used for web development';
+
+const result = await scorer.run({
+  input: [{ role: 'user', content: input }],
+  output: { role: 'assistant', text: output },
+});
+
+console.log('Score:', result.score);
+console.log('AnalyzeStepResult:', result.analyzeStepResult);
+```
+
+#### Major differences output
+
+The scorer returns a low score due to significant differences between the texts. The detailed `analyzeStepResult` shows numerous changes and a notable length difference.
 
-
-
-
-
-
+```typescript
+{
+  score: 0.3170731707317073,
+  analyzeStepResult: {
+    confidence: 0.8636363636363636,
+    ratio: 0.3170731707317073,
+    changes: 8,
+    lengthDiff: 0.13636363636363635
+  }
+}
+```
 
 ## Related
 
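For orientation only, and not part of the package diff above: a minimal sketch of reading the `analyzeStepResult` fields documented in the textual-difference hunk, assuming the import path and `.run()` call shape shown in those added lines.

```typescript
// Illustrative sketch only; assumes the result shape and call signature
// documented in the textual-difference hunk above.
import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";

const scorer = createTextualDifferenceScorer();

const result = await scorer.run({
  input: [{ role: "user", content: "Hello world! How are you?" }],
  output: { role: "assistant", text: "Hello there! How is it going?" },
});

// Fields from the documented `.run()` result shape
const { confidence, ratio, changes, lengthDiff } = result.analyzeStepResult;
console.log({ score: result.score, confidence, ratio, changes, lengthDiff });
```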