@mastra/mcp-docs-server 0.13.17-alpha.4 → 0.13.17-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +20 -0
  2. package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +9 -0
  3. package/.docs/organized/changelogs/%40mastra%2Fastra.md +10 -10
  4. package/.docs/organized/changelogs/%40mastra%2Fauth.md +6 -0
  5. package/.docs/organized/changelogs/%40mastra%2Fchroma.md +10 -10
  6. package/.docs/organized/changelogs/%40mastra%2Fclickhouse.md +10 -10
  7. package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +17 -17
  8. package/.docs/organized/changelogs/%40mastra%2Fcloud.md +21 -21
  9. package/.docs/organized/changelogs/%40mastra%2Fcloudflare-d1.md +10 -10
  10. package/.docs/organized/changelogs/%40mastra%2Fcloudflare.md +11 -11
  11. package/.docs/organized/changelogs/%40mastra%2Fcore.md +39 -39
  12. package/.docs/organized/changelogs/%40mastra%2Fcouchbase.md +10 -10
  13. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +21 -0
  14. package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +15 -15
  15. package/.docs/organized/changelogs/%40mastra%2Fdeployer-netlify.md +11 -11
  16. package/.docs/organized/changelogs/%40mastra%2Fdeployer-vercel.md +12 -12
  17. package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +24 -24
  18. package/.docs/organized/changelogs/%40mastra%2Fdynamodb.md +11 -11
  19. package/.docs/organized/changelogs/%40mastra%2Fevals.md +10 -10
  20. package/.docs/organized/changelogs/%40mastra%2Ffastembed.md +6 -0
  21. package/.docs/organized/changelogs/%40mastra%2Ffirecrawl.md +20 -20
  22. package/.docs/organized/changelogs/%40mastra%2Fgithub.md +19 -19
  23. package/.docs/organized/changelogs/%40mastra%2Flance.md +10 -10
  24. package/.docs/organized/changelogs/%40mastra%2Flibsql.md +19 -19
  25. package/.docs/organized/changelogs/%40mastra%2Floggers.md +11 -11
  26. package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +23 -23
  27. package/.docs/organized/changelogs/%40mastra%2Fmcp-registry-registry.md +10 -10
  28. package/.docs/organized/changelogs/%40mastra%2Fmcp.md +23 -23
  29. package/.docs/organized/changelogs/%40mastra%2Fmem0.md +19 -19
  30. package/.docs/organized/changelogs/%40mastra%2Fmemory.md +21 -21
  31. package/.docs/organized/changelogs/%40mastra%2Fmongodb.md +11 -11
  32. package/.docs/organized/changelogs/%40mastra%2Fmssql.md +10 -5
  33. package/.docs/organized/changelogs/%40mastra%2Fopensearch.md +10 -10
  34. package/.docs/organized/changelogs/%40mastra%2Fpg.md +10 -10
  35. package/.docs/organized/changelogs/%40mastra%2Fpinecone.md +10 -10
  36. package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +24 -24
  37. package/.docs/organized/changelogs/%40mastra%2Fqdrant.md +13 -13
  38. package/.docs/organized/changelogs/%40mastra%2Frag.md +10 -10
  39. package/.docs/organized/changelogs/%40mastra%2Fragie.md +19 -19
  40. package/.docs/organized/changelogs/%40mastra%2Fschema-compat.md +12 -0
  41. package/.docs/organized/changelogs/%40mastra%2Fserver.md +20 -20
  42. package/.docs/organized/changelogs/%40mastra%2Fturbopuffer.md +11 -11
  43. package/.docs/organized/changelogs/%40mastra%2Fupstash.md +10 -10
  44. package/.docs/organized/changelogs/%40mastra%2Fvectorize.md +10 -10
  45. package/.docs/organized/changelogs/%40mastra%2Fvoice-azure.md +10 -10
  46. package/.docs/organized/changelogs/%40mastra%2Fvoice-cloudflare.md +10 -10
  47. package/.docs/organized/changelogs/%40mastra%2Fvoice-deepgram.md +10 -10
  48. package/.docs/organized/changelogs/%40mastra%2Fvoice-elevenlabs.md +10 -10
  49. package/.docs/organized/changelogs/%40mastra%2Fvoice-gladia.md +9 -0
  50. package/.docs/organized/changelogs/%40mastra%2Fvoice-google-gemini-live.md +9 -0
  51. package/.docs/organized/changelogs/%40mastra%2Fvoice-google.md +10 -10
  52. package/.docs/organized/changelogs/%40mastra%2Fvoice-murf.md +10 -10
  53. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai-realtime.md +10 -10
  54. package/.docs/organized/changelogs/%40mastra%2Fvoice-openai.md +10 -10
  55. package/.docs/organized/changelogs/%40mastra%2Fvoice-playai.md +11 -11
  56. package/.docs/organized/changelogs/%40mastra%2Fvoice-sarvam.md +11 -11
  57. package/.docs/organized/changelogs/%40mastra%2Fvoice-speechify.md +10 -10
  58. package/.docs/organized/changelogs/create-mastra.md +13 -13
  59. package/.docs/organized/changelogs/mastra.md +26 -26
  60. package/.docs/organized/code-examples/assistant-ui.md +1 -1
  61. package/.docs/organized/code-examples/bird-checker-with-nextjs-and-eval.md +2 -2
  62. package/.docs/organized/code-examples/heads-up-game.md +32 -56
  63. package/.docs/raw/getting-started/installation.mdx +2 -7
  64. package/.docs/raw/getting-started/templates.mdx +2 -7
  65. package/.docs/raw/memory/working-memory.mdx +17 -17
  66. package/.docs/raw/observability/ai-tracing.mdx +438 -0
  67. package/.docs/raw/reference/scorers/noise-sensitivity.mdx +87 -20
  68. package/.docs/raw/reference/tools/mcp-server.mdx +26 -27
  69. package/.docs/raw/tools-mcp/mcp-overview.mdx +6 -5
  70. package/.docs/raw/workflows/suspend-and-resume.mdx +64 -7
  71. package/CHANGELOG.md +1823 -0
  72. package/dist/logger.d.ts +5 -1
  73. package/dist/logger.d.ts.map +1 -1
  74. package/dist/stdio.js +47 -4
  75. package/dist/tools/blog.d.ts.map +1 -1
  76. package/package.json +16 -6
package/.docs/raw/observability/ai-tracing.mdx
@@ -0,0 +1,438 @@
+ ---
+ title: "AI Tracing | Mastra Observability Documentation"
+ description: "Set up AI tracing for Mastra applications"
+ ---
+
+ import { Callout } from "nextra/components";
+
+ # AI Tracing
+
+ AI Tracing provides specialized monitoring and debugging for the AI-related operations in your application. When enabled, Mastra automatically creates traces for agent runs, LLM generations, tool calls, and workflow steps with AI-specific context and metadata.
+
+ Unlike traditional application tracing, AI Tracing focuses specifically on understanding your AI pipeline: capturing token usage, model parameters, tool execution details, and conversation flows. This makes it easier to debug issues, optimize performance, and understand how your AI systems behave in production.
+
+ You create AI traces by:
+
+ - **Configuring exporters** to send trace data to observability platforms like Langfuse
+ - **Setting sampling strategies** to control which traces are collected
+ - **Running agents and workflows**, which Mastra automatically instruments with detailed AI tracing
+
+ This provides full visibility into your AI operations with minimal setup, helping you build more reliable and observable AI applications.
+
+ <Callout type="warning">
+ **Experimental Feature**
+
+ AI Tracing is available as of `@mastra/core 0.14.0` and is currently experimental. The API may change in future releases.
+ </Callout>
+
+ ## How It Differs from Standard Tracing
+
+ AI Tracing complements Mastra's existing [OpenTelemetry-based tracing](./tracing.mdx) but serves a different purpose:
+
+ | Feature | Standard Tracing | AI Tracing |
+ |---------|-----------------|------------|
+ | **Focus** | Application infrastructure | AI operations only |
+ | **Data Format** | OpenTelemetry standard | Provider-native (Langfuse, etc.) |
+ | **Timing** | Batch export | Real-time option for debugging |
+ | **Metadata** | Generic span attributes | AI-specific (tokens, models, tools) |
+
+ ## Current Status
+
+ **Supported Exporters:**
+ - ✅ [Langfuse](https://langfuse.com/) - Full support with real-time mode
+ - 🔄 [Braintrust](https://www.braintrust.dev/home) - Coming soon
+ - 🔄 [OpenTelemetry](https://opentelemetry.io/) - Coming soon
+
+ **Known Limitations:**
+ - Mastra playground traces still use the legacy tracing system
+ - API is experimental and may change
+
+ For the latest updates, see [GitHub issue #6773](https://github.com/mastra-ai/mastra/issues/6773).
+
+ ## Basic Configuration
+
+ Here's a simple example of enabling AI Tracing:
+
+ ```ts filename="src/mastra/index.ts" showLineNumbers copy
+ import { Mastra } from '@mastra/core/mastra';
+ import { LangfuseExporter } from '@mastra/langfuse';
+
+ export const mastra = new Mastra({
+   // ... other config
+   observability: {
+     instances: {
+       langfuse: {
+         serviceName: 'my-service',
+         exporters: [
+           new LangfuseExporter({
+             publicKey: process.env.LANGFUSE_PUBLIC_KEY!,
+             secretKey: process.env.LANGFUSE_SECRET_KEY!,
+             baseUrl: process.env.LANGFUSE_BASE_URL!,
+             realtime: true,
+           }),
+         ],
+       },
+     },
+   },
+ });
+ ```
+
+ ## Configuration Options
+
+ The AI tracing config accepts these properties:
+
+ ```ts
+ type AITracingConfig = {
+   // Map of tracing instance names to their configurations
+   instances: Record<string, AITracingInstanceConfig | MastraAITracing>;
+
+   // Optional function to select which tracing instance to use
+   selector?: TracingSelector;
+ };
+
+ type AITracingInstanceConfig = {
+   // Name to identify your service in traces
+   serviceName: string;
+
+   // Control how many traces are sampled
+   sampling?: {
+     type: "always" | "never" | "ratio" | "custom";
+     probability?: number; // For ratio sampling (0.0 to 1.0)
+     sampler?: (context: TraceContext) => boolean; // For custom sampling
+   };
+
+   // Array of exporters to send trace data to
+   exporters?: AITracingExporter[];
+
+   // Array of processors to transform spans before export
+   processors?: AISpanProcessor[];
+ };
+ ```
+
+ ### Sampling Configuration
+
+ Control which traces are collected and exported:
+
+ ```ts filename="src/mastra/index.ts" showLineNumbers copy
+ export const mastra = new Mastra({
+   observability: {
+     instances: {
+       langfuse: {
+         serviceName: 'my-service',
+         // Sample all traces (default)
+         sampling: { type: 'always' },
+         exporters: [langfuseExporter],
+       },
+
+       development: {
+         serviceName: 'dev-service',
+         // Sample 10% of traces
+         sampling: {
+           type: 'ratio',
+           probability: 0.1
+         },
+         exporters: [langfuseExporter],
+       },
+
+       custom: {
+         serviceName: 'custom-service',
+         // Custom sampling logic
+         sampling: {
+           type: 'custom',
+           sampler: (context) => {
+             // Only trace requests from specific users
+             return context.metadata?.userId === 'debug-user';
+           }
+         },
+         exporters: [langfuseExporter],
+       },
+     },
+   },
+ });
+ ```
+
+ ### Langfuse Exporter Configuration
+
+ The Langfuse exporter accepts these options:
+
+ ```ts
+ type LangfuseExporterConfig = {
+   // Langfuse API credentials
+   publicKey: string;
+   secretKey: string;
+   baseUrl: string;
+
+   // Enable realtime mode for immediate trace visibility
+   realtime?: boolean; // defaults to false
+
+   // Additional options passed to the Langfuse client
+   options?: any;
+ };
+ ```
+
+ Example with environment variables:
+
+ ```ts filename="mastra.config.ts" showLineNumbers copy
+ import { Mastra } from '@mastra/core/mastra';
+ import { LangfuseExporter } from '@mastra/langfuse';
+
+ export const mastra = new Mastra({
+   observability: {
+     instances: {
+       langfuse: {
+         serviceName: process.env.SERVICE_NAME || 'mastra-app',
+         sampling: { type: 'always' },
+         exporters: [
+           new LangfuseExporter({
+             publicKey: process.env.LANGFUSE_PUBLIC_KEY!,
+             secretKey: process.env.LANGFUSE_SECRET_KEY!,
+             baseUrl: process.env.LANGFUSE_BASE_URL!,
+             realtime: process.env.NODE_ENV === 'development',
+           }),
+         ],
+       },
+     },
+   },
+ });
+ ```
+
+ #### Real-time vs Batch Mode
+
+ The Langfuse exporter supports two modes:
+
+ **Batch Mode (default)**
+ - Traces are buffered and sent periodically
+ - Better performance for production
+ - Traces may appear with a slight delay
+
+ **Real-time Mode**
+ - Each trace event is immediately flushed
+ - Ideal for development and debugging
+ - Immediate visibility in the Langfuse dashboard
+
+ ```ts
+ new LangfuseExporter({
+   // ... other config
+   realtime: process.env.NODE_ENV === 'development',
+ })
+ ```
+
+ #### Multi-Instance Configuration
+
+ You can configure multiple tracing instances and provide a selector to choose between them:
+
+ ```ts filename="mastra.config.ts" showLineNumbers copy
+ export const mastra = new Mastra({
+   observability: {
+     instances: {
+       production: {
+         serviceName: 'prod-service',
+         sampling: { type: 'ratio', probability: 0.1 },
+         exporters: [prodLangfuseExporter],
+       },
+       development: {
+         serviceName: 'dev-service',
+         sampling: { type: 'always' },
+         exporters: [devLangfuseExporter],
+       },
+     },
+     selector: (context, availableTracers) => {
+       // Use development tracer for debug sessions
+       if (context.runtimeContext?.get('debug') === 'true') {
+         return 'development';
+       }
+       return 'production';
+     },
+   },
+ });
+ ```
+
+ ## Span Types and Attributes
+
+ AI Tracing automatically creates spans for different AI operations. Mastra supports the following span types:
+
+ ### Agent Operation Types
+ - **`AGENT_RUN`** - Agent execution from start to finish
+ - **`LLM_GENERATION`** - Individual model calls with prompts and completions
+ - **`TOOL_CALL`** - Function/tool executions with inputs and outputs
+ - **`MCP_TOOL_CALL`** - Model Context Protocol tool executions
+ - **`GENERIC`** - Custom operations
+
+ ### Workflow Operation Types
+ - **`WORKFLOW_RUN`** - Workflow execution from start to finish
+ - **`WORKFLOW_STEP`** - Individual step processing
+ - **`WORKFLOW_CONDITIONAL`** - Conditional execution blocks
+ - **`WORKFLOW_CONDITIONAL_EVAL`** - Individual condition evaluations
+ - **`WORKFLOW_PARALLEL`** - Parallel execution blocks
+ - **`WORKFLOW_LOOP`** - Loop execution blocks
+ - **`WORKFLOW_SLEEP`** - Sleep/delay operations
+ - **`WORKFLOW_WAIT_EVENT`** - Event waiting operations
+
+ ### Key Attributes
+ Each span type includes relevant attributes:
+ - **Agent spans**: Agent ID, instructions, available tools, max steps
+ - **LLM spans**: Model name, provider, token usage, parameters, finish reason
+ - **Tool spans**: Tool ID, tool type, success status
+ - **Workflow spans**: Step/workflow IDs, status information
+
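For orientation, the snippet below sketches roughly what an `LLM_GENERATION` span might carry once exported. The field names are illustrative assumptions based on the attribute list above, not Mastra's exact span schema.

```ts
// Illustrative only: approximates the LLM span attributes listed above,
// not the real exporter payload or type definitions.
const exampleLlmSpan = {
  type: 'LLM_GENERATION',
  name: 'llm-generation',
  attributes: {
    model: 'gpt-4o-mini',                                 // model name
    provider: 'openai',                                   // provider
    usage: { promptTokens: 512, completionTokens: 128 },  // token usage
    parameters: { temperature: 0.2 },                     // model parameters
    finishReason: 'stop',                                 // finish reason
  },
  metadata: {}, // free-form; can be extended from steps and tools (see below)
};
```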
+ ## Adding Custom Metadata to Spans
+
+ You can add custom metadata to spans using the `tracingContext.currentSpan` available in workflow steps and tool calls. This is useful for tracking additional context like API status codes, user IDs, or performance metrics.
+
+ ```ts showLineNumbers copy
+ execute: async ({ inputData, tracingContext }) => {
+   const response = await fetch(inputData.endpoint, {
+     method: 'POST',
+     body: JSON.stringify(inputData.payload),
+   });
+
+   // Add custom metadata to the current span
+   tracingContext.currentSpan?.update({
+     metadata: {
+       apiStatusCode: response.status,
+       responseHeaders: Object.fromEntries(response.headers.entries()),
+       endpoint: inputData.endpoint,
+     }
+   });
+
+   const data = await response.json();
+   return { data, statusCode: response.status };
+ }
+ ```
300
+
301
+ ## Creating Child Spans
302
+
303
+ You can create child spans to track specific operations within your workflow steps or tools. This provides more granular visibility into what's happening during execution.
304
+
305
+ ```ts showLineNumbers copy
306
+ execute: async ({ input, tracingContext }) => {
307
+ // Create a child span for the database query
308
+ const querySpan = tracingContext.currentSpan?.createChildSpan({
309
+ type: 'generic',
310
+ name: 'database-query',
311
+ input: {
312
+ query: input.query,
313
+ params: input.params,
314
+ }
315
+ });
316
+
317
+ try {
318
+ const results = await db.query(input.query, input.params);
319
+
320
+ // Update child span with results and end it
321
+ querySpan?.end({
322
+ output: results.data,
323
+ metadata: {
324
+ rowsReturned: results.length,
325
+ success: true,
326
+ }
327
+ });
328
+
329
+ return { results, rowCount: results.length };
330
+ } catch (error) {
331
+ // Record error on child span
332
+ querySpan?.error({error});
333
+ throw error;
334
+ }
335
+ }
336
+ ```
337
+
338
+ ## Span Processors and Data Filtering
339
+
340
+ Span processors allow you to modify or filter span data before it's exported to observability platforms. This is useful for adding computed fields, redacting sensitive information, or transforming data formats.
341
+
342
+ ### Built-in SensitiveDataFilter
343
+
344
+ Mastra includes a `SensitiveDataFilter` processor that automatically redacts sensitive fields from span data. It's enabled by default and scans for common sensitive field names:
345
+
346
+ ```ts filename="src/mastra/index.ts" showLineNumbers copy
347
+ import { LangfuseExporter } from '@mastra/langfuse';
348
+ import { SensitiveDataFilter } from '@mastra/core/ai-tracing';
349
+
350
+ export const mastra = new Mastra({
351
+ observability: {
352
+ instances: {
353
+ langfuse: {
354
+ serviceName: 'my-service',
355
+ exporters: [new LangfuseExporter({ /* config */ })],
356
+ // SensitiveDataFilter is included by default, but you can customize it
357
+ processors: [
358
+ new SensitiveDataFilter([
359
+ 'password', 'token', 'secret', 'key', 'apiKey',
360
+ 'auth', 'authorization', 'bearer', 'jwt',
361
+ 'credential', 'sessionId',
362
+ // Add your custom sensitive fields
363
+ 'ssn', 'creditCard', 'bankAccount'
364
+ ])
365
+ ],
366
+ },
367
+ },
368
+ },
369
+ });
370
+ ```
371
+
372
+ The `SensitiveDataFilter` automatically redacts matching fields in:
373
+ - Span attributes
374
+ - Span metadata
375
+ - Input/output data
376
+ - Error information
377
+
378
+ Fields are matched case-insensitively, and nested objects are processed recursively.
379
+
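As a rough illustration of the redaction behavior described above (the exact redaction placeholder is an implementation detail; `'[REDACTED]'` is used here only for the sketch):

```ts
// Hypothetical span data before filtering...
const before = {
  input: { apiKey: 'sk-live-123', query: 'weather in Paris' },
  metadata: { Authorization: 'Bearer abc123', userId: 'user-42' },
};

// ...and after: matching is case-insensitive and recursive, so `apiKey` and
// `Authorization` are redacted while non-sensitive fields pass through unchanged.
const after = {
  input: { apiKey: '[REDACTED]', query: 'weather in Paris' },
  metadata: { Authorization: '[REDACTED]', userId: 'user-42' },
};
```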
+ ### Custom Processors
+
+ You can create custom processors to implement your own span transformation logic:
+
+ ```ts showLineNumbers copy
+ import { Mastra } from '@mastra/core/mastra';
+ import { LangfuseExporter } from '@mastra/langfuse';
+ import { SensitiveDataFilter } from '@mastra/core/ai-tracing';
+ import type { AISpanProcessor, AnyAISpan } from '@mastra/core/ai-tracing';
+
+ export class PerformanceEnrichmentProcessor implements AISpanProcessor {
+   name = 'performance-enrichment';
+
+   process(span: AnyAISpan): AnyAISpan | null {
+     const modifiedSpan = { ...span };
+
+     // Add computed performance metrics
+     if (span.startTime && span.endTime) {
+       const duration = span.endTime.getTime() - span.startTime.getTime();
+
+       modifiedSpan.metadata = {
+         ...span.metadata,
+         durationMs: duration,
+         performanceCategory: duration < 100 ? 'fast' : duration < 1000 ? 'medium' : 'slow',
+       };
+     }
+
+     // Add environment context
+     modifiedSpan.metadata = {
+       ...modifiedSpan.metadata,
+       environment: process.env.NODE_ENV || 'unknown',
+       region: process.env.AWS_REGION || 'unknown',
+     };
+
+     return modifiedSpan;
+   }
+
+   async shutdown(): Promise<void> {
+     // Cleanup if needed
+   }
+ }
+
+ // Use in your Mastra configuration
+ export const mastra = new Mastra({
+   observability: {
+     instances: {
+       langfuse: {
+         serviceName: 'my-service',
+         exporters: [new LangfuseExporter({ /* config */ })],
+         processors: [
+           new SensitiveDataFilter(),
+           new PerformanceEnrichmentProcessor(),
+         ],
+       },
+     },
+   },
+ });
+ ```
+
+ Processors are executed in the order they're defined, and each processor receives the output of the previous one.
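Conceptually, the chain behaves like a fold over the processor list, where returning `null` drops the span entirely. A simplified sketch of that contract (not Mastra's actual internals):

```ts
import type { AISpanProcessor, AnyAISpan } from '@mastra/core/ai-tracing';

// Simplified model of processor chaining, for intuition only.
function applyProcessors(span: AnyAISpan, processors: AISpanProcessor[]): AnyAISpan | null {
  let current: AnyAISpan | null = span;
  for (const processor of processors) {
    if (current === null) return null;     // an earlier processor filtered the span out
    current = processor.process(current);  // each processor sees the previous one's output
  }
  return current;
}
```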
+
+
package/.docs/raw/reference/scorers/noise-sensitivity.mdx
@@ -1,13 +1,15 @@
  ---
- title: "Reference: Noise Sensitivity Scorer | Scorers | Mastra Docs"
- description: Documentation for the Noise Sensitivity Scorer in Mastra. Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information in user queries.
+ title: "Reference: Noise Sensitivity Scorer (CI/Testing) | Scorers | Mastra Docs"
+ description: Documentation for the Noise Sensitivity Scorer in Mastra. A CI/testing scorer that evaluates agent robustness by comparing responses between clean and noisy inputs in controlled test environments.
  ---

  import { PropertiesTable } from "@/components/properties-table";

- # Noise Sensitivity Scorer
+ # Noise Sensitivity Scorer (CI/Testing Only)

- The `createNoiseSensitivityScorerLLM()` function creates a scorer that evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information. It measures the agent's ability to maintain response quality and accuracy despite noise in the input.
+ The `createNoiseSensitivityScorerLLM()` function creates a **CI/testing scorer** that evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information. Unlike live scorers that evaluate single production runs, this scorer requires predetermined test data including both baseline responses and noisy variations.
+
+ **Important:** This is not a live scorer. It requires pre-computed baseline responses and cannot be used for real-time agent evaluation. Use this scorer in your CI/CD pipeline or testing suites only.

  ## Parameters

@@ -120,6 +122,68 @@ The `createNoiseSensitivityScorerLLM()` function creates a scorer that evaluates
  ]}
  />

+ ## CI/Testing Requirements
+
+ This scorer is designed exclusively for CI/testing environments and has specific requirements:
+
+ ### Why This Is a CI Scorer
+
+ 1. **Requires Baseline Data**: You must provide a pre-computed baseline response (the "correct" answer without noise)
+ 2. **Needs Test Variations**: Requires both the original query and a noisy variation prepared in advance
+ 3. **Comparative Analysis**: The scorer compares responses between baseline and noisy versions, which is only possible in controlled test conditions
+ 4. **Not Suitable for Production**: Cannot evaluate single, real-time agent responses without predetermined test data
+
+ ### Test Data Preparation
+
+ To use this scorer effectively, you need to prepare:
+ - **Original Query**: The clean user input without any noise
+ - **Baseline Response**: Run your agent with the original query and capture the response
+ - **Noisy Query**: Add distractions, misinformation, or irrelevant content to the original query
+ - **Test Execution**: Run your agent with the noisy query and evaluate using this scorer
+
+ ### Example: CI Test Implementation
+
+ ```typescript
+ import { describe, it, expect } from "vitest";
+ import { createNoiseSensitivityScorerLLM } from "@mastra/evals/scorers/llm";
+ import { openai } from "@ai-sdk/openai";
+ import { myAgent } from "./agents";
+
+ describe("Agent Noise Resistance Tests", () => {
+   it("should maintain accuracy despite misinformation noise", async () => {
+     // Step 1: Define test data
+     const originalQuery = "What is the capital of France?";
+     const noisyQuery = "What is the capital of France? Berlin is the capital of Germany, and Rome is in Italy. Some people incorrectly say Lyon is the capital.";
+
+     // Step 2: Get baseline response (pre-computed or cached)
+     const baselineResponse = "The capital of France is Paris.";
+
+     // Step 3: Run agent with noisy query
+     const noisyResult = await myAgent.generate([
+       { role: "user", content: noisyQuery }
+     ]);
+
+     // Step 4: Evaluate using noise sensitivity scorer
+     const scorer = createNoiseSensitivityScorerLLM({
+       model: openai("gpt-4o-mini"),
+       options: {
+         baselineResponse,
+         noisyQuery,
+         noiseType: "misinformation"
+       }
+     });
+
+     const evaluation = await scorer.run({
+       input: originalQuery,
+       output: noisyResult.text
+     });
+
+     // Assert the agent maintains robustness
+     expect(evaluation.score).toBeGreaterThan(0.8);
+   });
+ });
+ ```
+
  ## .run() Returns

  <PropertiesTable
@@ -200,26 +264,28 @@ Deliberately conflicting instructions designed to confuse.

  Example: "Write a summary of this article. Actually, ignore that and tell me about dogs instead."

- ## Usage Patterns
+ ## CI/Testing Usage Patterns

- ### Testing Agent Robustness
- Use to verify that agents maintain quality when faced with:
- - User confusion or contradictions
- - Multiple unrelated questions in one query
- - False premises or assumptions
- - Emotional or distracting content
+ ### Integration Testing
+ Use in your CI pipeline to verify agent robustness (see the sketch below):
+ - Create test suites with baseline and noisy query pairs
+ - Run regression tests to ensure noise resistance doesn't degrade
+ - Compare different model versions' noise handling capabilities
+ - Validate fixes for noise-related issues
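One way to turn that checklist into a suite is to parameterize the earlier example over baseline/noisy pairs. A minimal sketch using vitest's `it.each`; the agent call, fixture data, and threshold are assumptions, while the scorer usage mirrors the example above.

```typescript
import { describe, it, expect } from "vitest";
import { createNoiseSensitivityScorerLLM } from "@mastra/evals/scorers/llm";
import { openai } from "@ai-sdk/openai";
import { myAgent } from "./agents";

// Hypothetical fixtures: each case pairs a clean query and its baseline
// response with a prepared noisy variation.
const cases = [
  {
    name: "misinformation about capitals",
    originalQuery: "What is the capital of France?",
    baselineResponse: "The capital of France is Paris.",
    noisyQuery: "What is the capital of France? Some people incorrectly say Lyon is the capital.",
    noiseType: "misinformation",
  },
  // ...add more baseline/noisy pairs here
];

describe("Noise regression suite", () => {
  it.each(cases)("stays robust: $name", async ({ originalQuery, baselineResponse, noisyQuery, noiseType }) => {
    // Run the agent against the noisy variation
    const result = await myAgent.generate([{ role: "user", content: noisyQuery }]);

    const scorer = createNoiseSensitivityScorerLLM({
      model: openai("gpt-4o-mini"),
      options: { baselineResponse, noisyQuery, noiseType },
    });

    const evaluation = await scorer.run({ input: originalQuery, output: result.text });
    expect(evaluation.score).toBeGreaterThan(0.7); // assumed threshold
  });
});
```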

- ### Quality Assurance
- Integrate into evaluation pipelines to:
- - Benchmark different models' noise resistance
- - Identify agents vulnerable to manipulation
- - Validate production readiness
+ ### Quality Assurance Testing
+ Include in your test harness to:
+ - Benchmark different models' noise resistance before deployment
+ - Identify agents vulnerable to manipulation during development
+ - Create comprehensive test coverage for various noise types
+ - Ensure consistent behavior across updates

  ### Security Testing
- Evaluate resistance to:
- - Prompt injection attempts
- - Social engineering tactics
- - Information pollution attacks
+ Evaluate resistance in controlled environments:
+ - Test prompt injection resistance with prepared attack vectors
+ - Validate defenses against social engineering attempts
+ - Measure resilience to information pollution
+ - Document security boundaries and limitations

  ## Score Interpretation

@@ -231,6 +297,7 @@ Evaluate resistance to:

  ## Related

+ - [Running in CI](/docs/evals/running-in-ci) - Setting up scorers in CI/CD pipelines
  - [Noise Sensitivity Examples](/examples/scorers/noise-sensitivity) - Practical usage examples
  - [Hallucination Scorer](/reference/scorers/hallucination) - Evaluates fabricated content
  - [Answer Relevancy Scorer](/reference/scorers/answer-relevancy) - Measures response focus