@mastra/mcp-docs-server 0.13.29 → 0.13.30-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/.docs/organized/changelogs/%40internal%2Fchangeset-cli.md +2 -0
  2. package/.docs/organized/changelogs/%40internal%2Fstorage-test-utils.md +9 -9
  3. package/.docs/organized/changelogs/%40internal%2Ftypes-builder.md +2 -0
  4. package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +31 -31
  5. package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +36 -0
  6. package/.docs/organized/changelogs/%40mastra%2Fastra.md +16 -16
  7. package/.docs/organized/changelogs/%40mastra%2Fchroma.md +16 -16
  8. package/.docs/organized/changelogs/%40mastra%2Fclickhouse.md +16 -16
  9. package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +28 -28
  10. package/.docs/organized/changelogs/%40mastra%2Fcloud.md +16 -16
  11. package/.docs/organized/changelogs/%40mastra%2Fcloudflare-d1.md +16 -16
  12. package/.docs/organized/changelogs/%40mastra%2Fcloudflare.md +16 -16
  13. package/.docs/organized/changelogs/%40mastra%2Fcore.md +106 -106
  14. package/.docs/organized/changelogs/%40mastra%2Fcouchbase.md +16 -16
  15. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +37 -37
  16. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +25 -25
  17. package/.docs/organized/changelogs/%40mastra%2Fdeployer-netlify.md +25 -25
  18. package/.docs/organized/changelogs/%40mastra%2Fdeployer-vercel.md +25 -25
  19. package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +49 -49
  20. package/.docs/organized/changelogs/%40mastra%2Fdynamodb.md +16 -16
  21. package/.docs/organized/changelogs/%40mastra%2Fevals.md +33 -33
  22. package/.docs/organized/changelogs/%40mastra%2Flance.md +16 -16
  23. package/.docs/organized/changelogs/%40mastra%2Flibsql.md +16 -16
  24. package/.docs/organized/changelogs/%40mastra%2Floggers.md +16 -16
  25. package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +23 -23
  26. package/.docs/organized/changelogs/%40mastra%2Fmcp-registry-registry.md +16 -16
  27. package/.docs/organized/changelogs/%40mastra%2Fmcp.md +16 -16
  28. package/.docs/organized/changelogs/%40mastra%2Fmemory.md +36 -36
  29. package/.docs/organized/changelogs/%40mastra%2Fmongodb.md +16 -16
  30. package/.docs/organized/changelogs/%40mastra%2Fmssql.md +16 -16
  31. package/.docs/organized/changelogs/%40mastra%2Fopensearch.md +17 -17
  32. package/.docs/organized/changelogs/%40mastra%2Fpg.md +31 -31
  33. package/.docs/organized/changelogs/%40mastra%2Fpinecone.md +16 -16
  34. package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +67 -67
  35. package/.docs/organized/changelogs/%40mastra%2Fqdrant.md +16 -16
  36. package/.docs/organized/changelogs/%40mastra%2Frag.md +16 -16
  37. package/.docs/organized/changelogs/%40mastra%2Freact.md +37 -0
  38. package/.docs/organized/changelogs/%40mastra%2Fs3vectors.md +15 -0
  39. package/.docs/organized/changelogs/%40mastra%2Fserver.md +37 -37
  40. package/.docs/organized/changelogs/%40mastra%2Fturbopuffer.md +16 -16
  41. package/.docs/organized/changelogs/%40mastra%2Fupstash.md +19 -19
  42. package/.docs/organized/changelogs/%40mastra%2Fvectorize.md +17 -17
  43. package/.docs/organized/changelogs/%40mastra%2Fvoice-azure.md +18 -18
  44. package/.docs/organized/changelogs/%40mastra%2Fvoice-cloudflare.md +16 -16
  45. package/.docs/organized/changelogs/%40mastra%2Fvoice-deepgram.md +16 -16
  46. package/.docs/organized/changelogs/%40mastra%2Fvoice-elevenlabs.md +16 -16
  47. package/.docs/organized/changelogs/%40mastra%2Fvoice-gladia.md +16 -16
  48. package/.docs/organized/changelogs/%40mastra%2Fvoice-google-gemini-live.md +15 -0
  49. package/.docs/organized/changelogs/%40mastra%2Fvoice-google.md +16 -16
  50. package/.docs/organized/changelogs/%40mastra%2Fvoice-murf.md +16 -16
  51. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai-realtime.md +16 -16
  52. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai.md +16 -16
  53. package/.docs/organized/changelogs/%40mastra%2Fvoice-playai.md +16 -16
  54. package/.docs/organized/changelogs/%40mastra%2Fvoice-sarvam.md +16 -16
  55. package/.docs/organized/changelogs/%40mastra%2Fvoice-speechify.md +16 -16
  56. package/.docs/organized/changelogs/create-mastra.md +35 -35
  57. package/.docs/organized/changelogs/mastra.md +63 -63
  58. package/.docs/organized/code-examples/agent.md +26 -7
  59. package/.docs/organized/code-examples/agui.md +4 -4
  60. package/.docs/organized/code-examples/ai-elements.md +1 -1
  61. package/.docs/organized/code-examples/ai-sdk-useChat.md +2 -2
  62. package/.docs/organized/code-examples/ai-sdk-v5.md +2 -2
  63. package/.docs/organized/code-examples/assistant-ui.md +2 -2
  64. package/.docs/organized/code-examples/bird-checker-with-nextjs-and-eval.md +2 -2
  65. package/.docs/organized/code-examples/bird-checker-with-nextjs.md +2 -2
  66. package/.docs/organized/code-examples/client-side-tools.md +4 -4
  67. package/.docs/organized/code-examples/crypto-chatbot.md +2 -2
  68. package/.docs/organized/code-examples/heads-up-game.md +2 -2
  69. package/.docs/organized/code-examples/openapi-spec-writer.md +2 -2
  70. package/.docs/raw/agents/adding-voice.mdx +118 -25
  71. package/.docs/raw/agents/agent-memory.mdx +73 -89
  72. package/.docs/raw/agents/guardrails.mdx +1 -1
  73. package/.docs/raw/agents/networks.mdx +12 -6
  74. package/.docs/raw/agents/overview.mdx +46 -11
  75. package/.docs/raw/agents/using-tools.mdx +95 -0
  76. package/.docs/raw/deployment/overview.mdx +9 -11
  77. package/.docs/raw/frameworks/agentic-uis/ai-sdk.mdx +7 -4
  78. package/.docs/raw/frameworks/servers/express.mdx +2 -2
  79. package/.docs/raw/getting-started/installation.mdx +34 -132
  80. package/.docs/raw/getting-started/mcp-docs-server.mdx +13 -1
  81. package/.docs/raw/index.mdx +49 -14
  82. package/.docs/raw/observability/ai-tracing/exporters/otel.mdx +3 -0
  83. package/.docs/raw/reference/agents/generateLegacy.mdx +4 -4
  84. package/.docs/raw/reference/observability/ai-tracing/exporters/otel.mdx +6 -0
  85. package/.docs/raw/reference/scorers/answer-relevancy.mdx +105 -7
  86. package/.docs/raw/reference/scorers/answer-similarity.mdx +266 -16
  87. package/.docs/raw/reference/scorers/bias.mdx +107 -6
  88. package/.docs/raw/reference/scorers/completeness.mdx +131 -8
  89. package/.docs/raw/reference/scorers/content-similarity.mdx +107 -8
  90. package/.docs/raw/reference/scorers/context-precision.mdx +234 -18
  91. package/.docs/raw/reference/scorers/context-relevance.mdx +418 -35
  92. package/.docs/raw/reference/scorers/faithfulness.mdx +122 -8
  93. package/.docs/raw/reference/scorers/hallucination.mdx +125 -8
  94. package/.docs/raw/reference/scorers/keyword-coverage.mdx +141 -9
  95. package/.docs/raw/reference/scorers/noise-sensitivity.mdx +478 -6
  96. package/.docs/raw/reference/scorers/prompt-alignment.mdx +351 -102
  97. package/.docs/raw/reference/scorers/textual-difference.mdx +134 -6
  98. package/.docs/raw/reference/scorers/tone-consistency.mdx +133 -0
  99. package/.docs/raw/reference/scorers/tool-call-accuracy.mdx +422 -65
  100. package/.docs/raw/reference/scorers/toxicity.mdx +125 -7
  101. package/.docs/raw/reference/streaming/agents/MastraModelOutput.mdx +9 -5
  102. package/.docs/raw/reference/streaming/agents/streamLegacy.mdx +4 -4
  103. package/.docs/raw/reference/streaming/workflows/observeStream.mdx +49 -0
  104. package/.docs/raw/reference/streaming/workflows/observeStreamVNext.mdx +47 -0
  105. package/.docs/raw/reference/streaming/workflows/resumeStreamVNext.mdx +7 -5
  106. package/.docs/raw/reference/streaming/workflows/stream.mdx +1 -1
  107. package/.docs/raw/reference/workflows/workflow.mdx +33 -0
  108. package/.docs/raw/scorers/custom-scorers.mdx +244 -3
  109. package/.docs/raw/scorers/overview.mdx +8 -38
  110. package/.docs/raw/server-db/middleware.mdx +5 -2
  111. package/.docs/raw/server-db/runtime-context.mdx +178 -0
  112. package/.docs/raw/streaming/workflow-streaming.mdx +28 -1
  113. package/.docs/raw/tools-mcp/overview.mdx +25 -7
  114. package/.docs/raw/workflows/overview.mdx +28 -1
  115. package/CHANGELOG.md +15 -0
  116. package/package.json +6 -6
  117. package/.docs/raw/agents/runtime-context.mdx +0 -103
  118. package/.docs/raw/agents/using-tools-and-mcp.mdx +0 -241
  119. package/.docs/raw/getting-started/model-providers.mdx +0 -63
  120. package/.docs/raw/reference/agents/migration-guide.mdx +0 -291
  121. package/.docs/raw/tools-mcp/runtime-context.mdx +0 -63
  122. /package/.docs/raw/{evals → scorers/evals-old-api}/custom-eval.mdx +0 -0
  123. /package/.docs/raw/{evals → scorers/evals-old-api}/overview.mdx +0 -0
  124. /package/.docs/raw/{evals → scorers/evals-old-api}/running-in-ci.mdx +0 -0
  125. /package/.docs/raw/{evals → scorers/evals-old-api}/textual-evals.mdx +0 -0
  126. /package/.docs/raw/{server-db → workflows}/snapshots.mdx +0 -0
package/.docs/raw/reference/scorers/prompt-alignment.mdx

@@ -59,8 +59,60 @@ The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates
  ]}
  />

+ `.run()` returns a result in the following shape:
+
+ ```typescript
+ {
+   runId: string,
+   score: number,
+   reason: string,
+   analyzeStepResult: {
+     intentAlignment: {
+       score: number,
+       primaryIntent: string,
+       isAddressed: boolean,
+       reasoning: string
+     },
+     requirementsFulfillment: {
+       requirements: Array<{
+         requirement: string,
+         isFulfilled: boolean,
+         reasoning: string
+       }>,
+       overallScore: number
+     },
+     completeness: {
+       score: number,
+       missingElements: string[],
+       reasoning: string
+     },
+     responseAppropriateness: {
+       score: number,
+       formatAlignment: boolean,
+       toneAlignment: boolean,
+       reasoning: string
+     },
+     overallAssessment: string
+   }
+ }
+ ```
+
  ## Scoring Details

+ ### Scorer configuration
+
+ You can customize the Prompt Alignment Scorer by adjusting the scale parameter and evaluation mode to fit your scoring needs.
+
+ ```typescript showLineNumbers copy
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini"),
+   options: {
+     scale: 10, // Score from 0-10 instead of 0-1
+     evaluationMode: 'both' // 'user', 'system', or 'both' (default)
+   }
+ });
+ ```
+
  ### Multi-Dimensional Analysis

  Prompt Alignment evaluates responses across four key dimensions with weighted scoring that adapts based on the evaluation mode:
@@ -126,15 +178,6 @@ Final Score = Weighted Score × scale
  - **0.4-0.6** = Poor alignment with significant issues
  - **0.0-0.4** = Very poor alignment, response doesn't address the prompt effectively

- ### Comparison with Other Scorers
-
- | Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
- |--------|------------------|------------------|--------------|
- | **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
- | **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
- | **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
- | **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |
-
  ### When to Use Each Mode

  **User Mode (`'user'`)** - Use when:
@@ -155,7 +198,115 @@ Final Score = Weighted Score × scale
  - Production monitoring where both user and system requirements matter
  - Holistic assessment of prompt-response alignment

- ## Usage Examples
+ ## Common Use Cases
+
+ ### Code Generation Evaluation
+ Ideal for evaluating:
+ - Programming task completion
+ - Code quality and completeness
+ - Adherence to coding requirements
+ - Format specifications (functions, classes, etc.)
+
+ ```typescript
+ // Example: API endpoint creation
+ const codePrompt = "Create a REST API endpoint with authentication and rate limiting";
+ // Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
+ // completeness (full implementation), format (code structure)
+ ```
+
+ ### Instruction Following Assessment
+ Perfect for:
+ - Task completion verification
+ - Multi-step instruction adherence
+ - Requirement compliance checking
+ - Educational content evaluation
+
+ ```typescript
+ // Example: Multi-requirement task
+ const taskPrompt = "Write a Python class with initialization, validation, error handling, and documentation";
+ // Scorer tracks each requirement individually and provides detailed breakdown
+ ```
+
+ ### Content Format Validation
+ Useful for:
+ - Format specification compliance
+ - Style guide adherence
+ - Output structure verification
+ - Response appropriateness checking
+
+ ```typescript
+ // Example: Structured output
+ const formatPrompt = "Explain the differences between let and const in JavaScript using bullet points";
+ // Scorer evaluates content accuracy AND format compliance
+ ```
+
+ ### Agent Response Quality
+ Measure how well your AI agents follow user instructions:
+
+ ```typescript
+ const agent = new Agent({
+   name: 'CodingAssistant',
+   instructions: 'You are a helpful coding assistant. Always provide working code examples.',
+   model: openai('gpt-4o'),
+ });
+
+ // Evaluate comprehensive alignment (default)
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai('gpt-4o-mini'),
+   options: { evaluationMode: 'both' } // Evaluates both user intent and system guidelines
+ });
+
+ // Evaluate just user satisfaction
+ const userScorer = createPromptAlignmentScorerLLM({
+   model: openai('gpt-4o-mini'),
+   options: { evaluationMode: 'user' } // Focus only on user request fulfillment
+ });
+
+ // Evaluate system compliance
+ const systemScorer = createPromptAlignmentScorerLLM({
+   model: openai('gpt-4o-mini'),
+   options: { evaluationMode: 'system' } // Check adherence to system instructions
+ });
+
+ const result = await scorer.run(agentRun);
+ ```
+
+ ### Prompt Engineering Optimization
+ Test different prompts to improve alignment:
+
+ ```typescript
+ const prompts = [
+   'Write a function to calculate factorial',
+   'Create a Python function that calculates factorial with error handling for negative inputs',
+   'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
+ ];
+
+ // Compare alignment scores to find the best prompt
+ for (const prompt of prompts) {
+   const result = await scorer.run(createTestRun(prompt, response));
+   console.log(`Prompt alignment: ${result.score}`);
+ }
+ ```
+
+ ### Multi-Agent System Evaluation
+ Compare different agents or models:
+
+ ```typescript
+ const agents = [agent1, agent2, agent3];
+ const testPrompts = [...]; // Array of test prompts
+
+ for (const agent of agents) {
+   let totalScore = 0;
+   for (const prompt of testPrompts) {
+     const response = await agent.run(prompt);
+     const evaluation = await scorer.run({ input: prompt, output: response });
+     totalScore += evaluation.score;
+   }
+   console.log(`${agent.name} average alignment: ${totalScore / testPrompts.length}`);
+ }
+ ```
+
+ ## Examples

  ### Basic Configuration

@@ -231,136 +382,234 @@ const result = await scorer.run({
  // Result: Lower appropriateness score due to format mismatch (paragraph vs bullet points)
  ```

- ## Usage Patterns
+ ### Excellent alignment example

- ### Code Generation Evaluation
- Ideal for evaluating:
- - Programming task completion
- - Code quality and completeness
- - Adherence to coding requirements
- - Format specifications (functions, classes, etc.)
+ In this example, the response fully addresses the user's prompt with all requirements met.

- ```typescript
- // Example: API endpoint creation
- const codePrompt = "Create a REST API endpoint with authentication and rate limiting";
- // Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
- // completeness (full implementation), format (code structure)
- ```
+ ```typescript filename="src/example-excellent-prompt-alignment.ts" showLineNumbers copy
+ import { openai } from "@ai-sdk/openai";
+ import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";

- ### Instruction Following Assessment
- Perfect for:
- - Task completion verification
- - Multi-step instruction adherence
- - Requirement compliance checking
- - Educational content evaluation
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini")
+ });

- ```typescript
- // Example: Multi-requirement task
- const taskPrompt = "Write a Python class with initialization, validation, error handling, and documentation";
- // Scorer tracks each requirement individually and provides detailed breakdown
+ const inputMessages = [{
+   role: 'user',
+   content: "Write a Python function to calculate factorial with error handling for negative numbers"
+ }];
+
+ const outputMessage = {
+   text: `def factorial(n):
+     """Calculate factorial of a number."""
+     if n < 0:
+       raise ValueError("Factorial not defined for negative numbers")
+     if n == 0 or n == 1:
+       return 1
+     return n * factorial(n - 1)`
+ };
+
+ const result = await scorer.run({
+   input: inputMessages,
+   output: outputMessage,
+ });
+
+ console.log(result);
  ```

- ### Content Format Validation
- Useful for:
- - Format specification compliance
- - Style guide adherence
- - Output structure verification
- - Response appropriateness checking
+ ### Excellent alignment output
+
+ The output receives a high score because it perfectly addresses the intent, fulfills all requirements, and uses appropriate format.

  ```typescript
- // Example: Structured output
- const formatPrompt = "Explain the differences between let and const in JavaScript using bullet points";
- // Scorer evaluates content accuracy AND format compliance
+ {
+   score: 0.95,
+   reason: 'The score is 0.95 because the response perfectly addresses the primary intent of creating a factorial function and fulfills all requirements including Python implementation, error handling for negative numbers, and proper documentation. The code format is appropriate and the implementation is complete.'
+ }
  ```

- ## Common Use Cases
+ ### Partial alignment example

- ### 1. Agent Response Quality
- Measure how well your AI agents follow user instructions:
+ In this example, the response addresses the core intent but misses some requirements or has format issues.

- ```typescript
- const agent = new Agent({
-   name: 'CodingAssistant',
-   instructions: 'You are a helpful coding assistant. Always provide working code examples.',
-   model: openai('gpt-4o'),
+ ```typescript filename="src/example-partial-prompt-alignment.ts" showLineNumbers copy
+ import { openai } from "@ai-sdk/openai";
+ import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";
+
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini")
  });

- // Evaluate comprehensive alignment (default)
- const scorer = createPromptAlignmentScorerLLM({
-   model: openai('gpt-4o-mini'),
-   options: { evaluationMode: 'both' } // Evaluates both user intent and system guidelines
+ const inputMessages = [{
+   role: 'user',
+   content: "List the benefits of TypeScript in bullet points"
+ }];
+
+ const outputMessage = {
+   text: "TypeScript provides static typing, better IDE support, and enhanced code reliability through compile-time error checking."
+ };
+
+ const result = await scorer.run({
+   input: inputMessages,
+   output: outputMessage,
  });

- // Evaluate just user satisfaction
- const userScorer = createPromptAlignmentScorerLLM({
-   model: openai('gpt-4o-mini'),
-   options: { evaluationMode: 'user' } // Focus only on user request fulfillment
+ console.log(result);
+ ```
+
+ #### Partial alignment output
+
+ The output receives a lower score because while the content is accurate, it doesn't follow the requested format (bullet points).
+
+ ```typescript
+ {
+   score: 0.75,
+   reason: 'The score is 0.75 because the response addresses the intent of explaining TypeScript benefits and provides accurate information, but fails to use the requested bullet point format, resulting in lower appropriateness scoring.'
+ }
+ ```
+
+ ### Poor alignment example
+
+ In this example, the response fails to address the user's specific requirements.
+
+ ```typescript filename="src/example-poor-prompt-alignment.ts" showLineNumbers copy
+ import { openai } from "@ai-sdk/openai";
+ import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";
+
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini")
  });

- // Evaluate system compliance
- const systemScorer = createPromptAlignmentScorerLLM({
-   model: openai('gpt-4o-mini'),
-   options: { evaluationMode: 'system' } // Check adherence to system instructions
+ const inputMessages = [{
+   role: 'user',
+   content: "Write a Python class with initialization, validation, error handling, and documentation"
+ }];
+
+ const outputMessage = {
+   text: `class Example:
+     def __init__(self, value):
+       self.value = value`
+ };
+
+ const result = await scorer.run({
+   input: inputMessages,
+   output: outputMessage,
  });

- const result = await scorer.run(agentRun);
+ console.log(result);
  ```

- ### 2. Prompt Engineering Optimization
- Test different prompts to improve alignment:
+ ### Poor alignment output

- ```typescript
- const prompts = [
-   'Write a function to calculate factorial',
-   'Create a Python function that calculates factorial with error handling for negative inputs',
-   'Implement a factorial calculator in Python with: input validation, error handling, and docstring'
- ];
+ The output receives a low score because it only partially fulfills the requirements, missing validation, error handling, and documentation.

- // Compare alignment scores to find the best prompt
- for (const prompt of prompts) {
-   const result = await scorer.run(createTestRun(prompt, response));
-   console.log(`Prompt alignment: ${result.score}`);
+ ```typescript
+ {
+   score: 0.35,
+   reason: 'The score is 0.35 because while the response addresses the basic intent of creating a Python class with initialization, it fails to include validation, error handling, and documentation as specifically requested, resulting in incomplete requirement fulfillment.'
  }
  ```

- ### 3. Multi-Agent System Evaluation
- Compare different agents or models:
+ ### Evaluation Mode Examples

- ```typescript
- const agents = [agent1, agent2, agent3];
- const testPrompts = [...]; // Array of test prompts
+ #### User Mode - Focus on User Prompt Only

- for (const agent of agents) {
-   let totalScore = 0;
-   for (const prompt of testPrompts) {
-     const response = await agent.run(prompt);
-     const evaluation = await scorer.run({ input: prompt, output: response });
-     totalScore += evaluation.score;
+ Evaluates how well the response addresses the user's request, ignoring system instructions:
+
+ ```typescript filename="src/example-user-mode.ts" showLineNumbers copy
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini"),
+   options: { evaluationMode: 'user' }
+ });
+
+ const result = await scorer.run({
+   input: {
+     inputMessages: [{
+       role: 'user',
+       content: "Explain recursion with an example"
+     }],
+     systemMessages: [{
+       role: 'system',
+       content: "Always provide code examples in Python"
+     }]
+   },
+   output: {
+     text: "Recursion is when a function calls itself. For example: factorial(5) = 5 * factorial(4)"
    }
-   console.log(`${agent.name} average alignment: ${totalScore / testPrompts.length}`);
- }
+ });
+ // Scores high for addressing user request, even without Python code
  ```

- ## Error Handling
+ #### System Mode - Focus on System Guidelines Only

- The scorer handles various edge cases gracefully:
+ Evaluates compliance with system behavioral guidelines and constraints:

- ```typescript
- // Missing user prompt
- try {
-   await scorer.run({ input: [], output: response });
- } catch (error) {
-   // Error: "Both user prompt and agent response are required for prompt alignment scoring"
- }
+ ```typescript filename="src/example-system-mode.ts" showLineNumbers copy
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini"),
+   options: { evaluationMode: 'system' }
+ });

- // Empty response
- const result = await scorer.run({
-   input: [userMessage],
-   output: { role: 'assistant', text: '' }
+ const result = await scorer.run({
+   input: {
+     systemMessages: [{
+       role: 'system',
+       content: "You are a helpful assistant. Always be polite, concise, and provide examples."
+     }],
+     inputMessages: [{
+       role: 'user',
+       content: "What is machine learning?"
+     }]
+   },
+   output: {
+     text: "Machine learning is a subset of AI where computers learn from data. For example, spam filters learn to identify unwanted emails by analyzing patterns in previously marked spam."
+   }
  });
- // Returns low scores with detailed reasoning about incompleteness
+ // Evaluates politeness, conciseness, and example provision
  ```

+ #### Both Mode - Combined Evaluation (Default)
+
+ Evaluates both user intent fulfillment and system compliance with weighted scoring (70% user, 30% system):
+
+ ```typescript filename="src/example-both-mode.ts" showLineNumbers copy
+ const scorer = createPromptAlignmentScorerLLM({
+   model: openai("gpt-4o-mini"),
+   options: { evaluationMode: 'both' } // This is the default
+ });
+
+ const result = await scorer.run({
+   input: {
+     systemMessages: [{
+       role: 'system',
+       content: "Always provide code examples when explaining programming concepts"
+     }],
+     inputMessages: [{
+       role: 'user',
+       content: "Explain how to reverse a string"
+     }]
+   },
+   output: {
+     text: `To reverse a string, you can iterate through it backwards. Here's an example in Python:
+
+ def reverse_string(s):
+     return s[::-1]
+
+ # Usage: reverse_string("hello") returns "olleh"`
+   }
+ });
+ // High score for both addressing the user's request AND following system guidelines
+ ```
+
+ ## Comparison with Other Scorers
+
+ | Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
+ |--------|------------------|------------------|--------------|
+ | **Focus** | Multi-dimensional prompt adherence | Query-response relevance | Context groundedness |
+ | **Evaluation** | Intent, requirements, completeness, format | Semantic similarity to query | Factual consistency with context |
+ | **Use Case** | General prompt following | Information retrieval | RAG/context-based systems |
+ | **Dimensions** | 4 weighted dimensions | Single relevance dimension | Single faithfulness dimension |
+
  ## Related

  - [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Evaluates query-response relevance
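To make the new prompt-alignment material above easier to scan, here is a brief editorial sketch (not code shipped in this package) that ties together two things the updated docs describe: the `.run()` result shape added in the first hunk and the 70% user / 30% system weighting described for `'both'` mode. The prompt, response, and numeric sub-scores below are hypothetical.

```typescript
import { openai } from "@ai-sdk/openai";
import { createPromptAlignmentScorerLLM } from "@mastra/evals/scorers/llm";

const scorer = createPromptAlignmentScorerLLM({
  model: openai("gpt-4o-mini"),
  options: { evaluationMode: "both" }, // the documented default
});

// Hypothetical run, using the same input/output shape as the examples above
const result = await scorer.run({
  input: [{ role: "user", content: "Write a Python function that parses ISO dates with error handling" }],
  output: { text: "def parse_iso(value): ..." },
});

// The result shape documented in this diff exposes a per-requirement breakdown
for (const req of result.analyzeStepResult.requirementsFulfillment.requirements) {
  if (!req.isFulfilled) {
    console.log(`Unfulfilled: ${req.requirement} (${req.reasoning})`);
  }
}

// For 'both' mode the docs describe a 70% user / 30% system weighting; with
// hypothetical sub-scores of 0.9 (user) and 0.6 (system) and the default scale of 1:
const combined = (0.7 * 0.9 + 0.3 * 0.6) * 1; // 0.81
console.log(`Combined alignment (illustrative): ${combined}`);
```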
package/.docs/raw/reference/scorers/textual-difference.mdx

@@ -37,6 +37,21 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
  ]}
  />

+ `.run()` returns a result in the following shape:
+
+ ```typescript
+ {
+   runId: string,
+   analyzeStepResult: {
+     confidence: number,
+     ratio: number,
+     changes: number,
+     lengthDiff: number
+   },
+   score: number
+ }
+ ```
+
  ## Scoring Details

  The scorer calculates several measures:
@@ -61,13 +76,126 @@ Final score: `(similarity_ratio * confidence) * scale`

  ### Score interpretation

- (0 to scale, default 0-1)
+ A textual difference score between 0 and 1:
+
+ - **1.0**: Identical texts – no differences detected.
+ - **0.7–0.9**: Minor differences – few changes needed.
+ - **0.4–0.6**: Moderate differences – noticeable changes required.
+ - **0.1–0.3**: Major differences – extensive changes needed.
+ - **0.0**: Completely different texts.
+
+ ## Examples
+
+ ### No differences example
+
+ In this example, the texts are exactly the same. The scorer identifies complete similarity with a perfect score and no detected changes.
+
+ ```typescript filename="src/example-no-differences.ts" showLineNumbers copy
+ import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+ const scorer = createTextualDifferenceScorer();
+
+ const input = 'The quick brown fox jumps over the lazy dog';
+ const output = 'The quick brown fox jumps over the lazy dog';
+
+ const result = await scorer.run({
+   input: [{ role: 'user', content: input }],
+   output: { role: 'assistant', text: output },
+ });
+
+ console.log('Score:', result.score);
+ console.log('AnalyzeStepResult:', result.analyzeStepResult);
+ ```
+
+ #### No differences output
+
+ The scorer returns a high score, indicating the texts are identical. The detailed info confirms zero changes and no length difference.
+
+ ```typescript
+ {
+   score: 1,
+   analyzeStepResult: {
+     confidence: 1,
+     ratio: 1,
+     changes: 0,
+     lengthDiff: 0,
+   },
+ }
+ ```
+
+ ### Minor differences example
+
+ In this example, the texts have small variations. The scorer detects these minor differences and returns a moderate similarity score.
+
+ ```typescript filename="src/example-minor-differences.ts" showLineNumbers copy
+ import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+ const scorer = createTextualDifferenceScorer();
+
+ const input = 'Hello world! How are you?';
+ const output = 'Hello there! How is it going?';
+
+ const result = await scorer.run({
+   input: [{ role: 'user', content: input }],
+   output: { role: 'assistant', text: output },
+ });
+
+ console.log('Score:', result.score);
+ console.log('AnalyzeStepResult:', result.analyzeStepResult);
+ ```
+
+ #### Minor differences output
+
+ The scorer returns a moderate score reflecting the small variations between the texts. The detailed info includes the number of changes and length difference observed.
+
+ ```typescript
+ {
+   score: 0.5925925925925926,
+   analyzeStepResult: {
+     confidence: 0.8620689655172413,
+     ratio: 0.5925925925925926,
+     changes: 5,
+     lengthDiff: 0.13793103448275862
+   }
+ }
+ ```
+
+ ### Major differences example
+
+ In this example, the texts differ significantly. The scorer detects extensive changes and returns a low similarity score.
+
+ ```typescript filename="src/example-major-differences.ts" showLineNumbers copy
+ import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";
+
+ const scorer = createTextualDifferenceScorer();
+
+ const input = 'Python is a high-level programming language';
+ const output = 'JavaScript is used for web development';
+
+ const result = await scorer.run({
+   input: [{ role: 'user', content: input }],
+   output: { role: 'assistant', text: output },
+ });
+
+ console.log('Score:', result.score);
+ console.log('AnalyzeStepResult:', result.analyzeStepResult);
+ ```
+
+ #### Major differences output
+
+ The scorer returns a low score due to significant differences between the texts. The detailed `analyzeStepResult` shows numerous changes and a notable length difference.

- - 1.0: Identical texts - no differences
- - 0.7-0.9: Minor differences - few changes needed
- - 0.4-0.6: Moderate differences - significant changes
- - 0.1-0.3: Major differences - extensive changes
- - 0.0: Completely different texts
+ ```typescript
+ {
+   score: 0.3170731707317073,
+   analyzeStepResult: {
+     confidence: 0.8636363636363636,
+     ratio: 0.3170731707317073,
+     changes: 8,
+     lengthDiff: 0.13636363636363635
+   }
+ }
+ ```

  ## Related
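Likewise for the textual-difference additions, a short editorial sketch (not part of the package) of how the documented result shape might be consumed; the compared strings and the 0.7 threshold are arbitrary illustrations based on the score interpretation list above.

```typescript
import { createTextualDifferenceScorer } from "@mastra/evals/scorers/code";

const scorer = createTextualDifferenceScorer();

// Hypothetical comparison, using the same input/output shape as the examples above
const result = await scorer.run({
  input: [{ role: "user", content: "The cat sat on the mat" }],
  output: { role: "assistant", text: "The cat sat on a mat" },
});

// Shape documented in this diff:
// { runId, analyzeStepResult: { confidence, ratio, changes, lengthDiff }, score }
const { score, analyzeStepResult } = result;
if (score < 0.7) {
  // Per the interpretation list above, scores below 0.7 suggest moderate or larger differences
  console.log(
    `Texts diverge: ${analyzeStepResult.changes} changes, ` +
      `length diff ${analyzeStepResult.lengthDiff}, confidence ${analyzeStepResult.confidence}`,
  );
}
```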