@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,4 +1,4 @@
1
- # Scorer Utils
1
+ # Scorer utils
2
2
 
3
3
  Mastra provides utility functions to help extract and process data from scorer run inputs and outputs. These utilities are particularly useful in the `preprocess` step of custom scorers.
4
4
 
@@ -14,35 +14,47 @@ import {
14
14
  extractToolCalls,
15
15
  extractInputMessages,
16
16
  extractAgentResponseMessages,
17
- } from "@mastra/evals/scorers/utils";
17
+ compareTrajectories,
18
+ createTrajectoryTestRun,
19
+ } from '@mastra/evals/scorers/utils'
18
20
  ```
19
21
 
20
- ## Message Extraction
22
+ Trajectory extraction functions are available from `@mastra/core/evals`:
21
23
 
22
- ### getAssistantMessageFromRunOutput
24
+ ```typescript
25
+ import {
26
+ extractTrajectory,
27
+ extractWorkflowTrajectory,
28
+ extractTrajectoryFromTrace,
29
+ } from '@mastra/core/evals'
30
+ ```
31
+
32
+ ## Message extraction
33
+
34
+ ### `getAssistantMessageFromRunOutput`
23
35
 
24
36
  Extracts the text content from the first assistant message in the run output.
25
37
 
26
38
  ```typescript
27
39
  const scorer = createScorer({
28
- id: "my-scorer",
29
- description: "My scorer",
30
- type: "agent",
40
+ id: 'my-scorer',
41
+ description: 'My scorer',
42
+ type: 'agent',
31
43
  })
32
44
  .preprocess(({ run }) => {
33
- const response = getAssistantMessageFromRunOutput(run.output);
34
- return { response };
45
+ const response = getAssistantMessageFromRunOutput(run.output)
46
+ return { response }
35
47
  })
36
48
  .generateScore(({ results }) => {
37
- return results.preprocessStepResult?.response ? 1 : 0;
38
- });
49
+ return results.preprocessStepResult?.response ? 1 : 0
50
+ })
39
51
  ```
40
52
 
41
- **output?:** (`ScorerRunOutputForAgent`): The scorer run output (array of MastraDBMessage)
53
+ **output** (`ScorerRunOutputForAgent`): The scorer run output (array of MastraDBMessage)
42
54
 
43
55
  **Returns:** `string | undefined` - The assistant message text, or undefined if no assistant message is found.
44
56
 
45
- ### getUserMessageFromRunInput
57
+ ### `getUserMessageFromRunInput`
46
58
 
47
59
  Extracts the text content from the first user message in the run input.
48
60
 
@@ -53,11 +65,11 @@ Extracts the text content from the first user message in the run input.
53
65
  })
54
66
  ```
55
67
 
56
- **input?:** (`ScorerRunInputForAgent`): The scorer run input containing input messages
68
+ **input** (`ScorerRunInputForAgent`): The scorer run input containing input messages
57
69
 
58
70
  **Returns:** `string | undefined` - The user message text, or undefined if no user message is found.
59
71
 
60
- ### extractInputMessages
72
+ ### `extractInputMessages`
61
73
 
62
74
  Extracts text content from all input messages as an array.
63
75
 
@@ -70,7 +82,7 @@ Extracts text content from all input messages as an array.
70
82
 
71
83
  **Returns:** `string[]` - Array of text strings from each input message.
72
84
 
73
- ### extractAgentResponseMessages
85
+ ### `extractAgentResponseMessages`
74
86
 
75
87
  Extracts text content from all assistant response messages as an array.
76
88
 
@@ -83,9 +95,9 @@ Extracts text content from all assistant response messages as an array.
83
95
 
84
96
  **Returns:** `string[]` - Array of text strings from each assistant message.
85
97
 
86
- ## Reasoning Extraction
98
+ ## Reasoning extraction
87
99
 
88
- ### getReasoningFromRunOutput
100
+ ### `getReasoningFromRunOutput`
89
101
 
90
102
  Extracts reasoning text from the run output. This is particularly useful when evaluating responses from reasoning models like `deepseek-reasoner` that produce chain-of-thought reasoning.
91
103
 
@@ -97,50 +109,50 @@ Reasoning can be stored in two places:
97
109
  ```typescript
98
110
  import {
99
111
  getReasoningFromRunOutput,
100
- getAssistantMessageFromRunOutput
101
- } from "@mastra/evals/scorers/utils";
112
+ getAssistantMessageFromRunOutput,
113
+ } from '@mastra/evals/scorers/utils'
102
114
 
103
115
  const reasoningQualityScorer = createScorer({
104
- id: "reasoning-quality",
105
- name: "Reasoning Quality",
106
- description: "Evaluates the quality of model reasoning",
107
- type: "agent",
116
+ id: 'reasoning-quality',
117
+ name: 'Reasoning Quality',
118
+ description: 'Evaluates the quality of model reasoning',
119
+ type: 'agent',
108
120
  })
109
121
  .preprocess(({ run }) => {
110
- const reasoning = getReasoningFromRunOutput(run.output);
111
- const response = getAssistantMessageFromRunOutput(run.output);
112
- return { reasoning, response };
122
+ const reasoning = getReasoningFromRunOutput(run.output)
123
+ const response = getAssistantMessageFromRunOutput(run.output)
124
+ return { reasoning, response }
113
125
  })
114
126
  .analyze(({ results }) => {
115
- const { reasoning } = results.preprocessStepResult || {};
127
+ const { reasoning } = results.preprocessStepResult || {}
116
128
  return {
117
129
  hasReasoning: !!reasoning,
118
130
  reasoningLength: reasoning?.length || 0,
119
- hasStepByStep: reasoning?.includes("step") || false,
120
- };
131
+ hasStepByStep: reasoning?.includes('step') || false,
132
+ }
121
133
  })
122
134
  .generateScore(({ results }) => {
123
- const { hasReasoning, reasoningLength } = results.analyzeStepResult || {};
124
- if (!hasReasoning) return 0;
135
+ const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
136
+ if (!hasReasoning) return 0
125
137
  // Score based on reasoning length (normalized to 0-1)
126
- return Math.min(reasoningLength / 500, 1);
138
+ return Math.min(reasoningLength / 500, 1)
127
139
  })
128
140
  .generateReason(({ results, score }) => {
129
- const { hasReasoning, reasoningLength } = results.analyzeStepResult || {};
141
+ const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
130
142
  if (!hasReasoning) {
131
- return "No reasoning was provided by the model.";
143
+ return 'No reasoning was provided by the model.'
132
144
  }
133
- return `Model provided ${reasoningLength} characters of reasoning. Score: ${score}`;
134
- });
145
+ return `Model provided ${reasoningLength} characters of reasoning. Score: ${score}`
146
+ })
135
147
  ```
136
148
 
137
- **output?:** (`ScorerRunOutputForAgent`): The scorer run output (array of MastraDBMessage)
149
+ **output** (`ScorerRunOutputForAgent`): The scorer run output (array of MastraDBMessage)
138
150
 
139
151
  **Returns:** `string | undefined` - The reasoning text, or undefined if no reasoning is present.
140
152
 
141
- ## System Message Extraction
153
+ ## System message extraction
142
154
 
143
- ### getSystemMessagesFromRunInput
155
+ ### `getSystemMessagesFromRunInput`
144
156
 
145
157
  Extracts all system messages from the run input, including both standard system messages and tagged system messages (specialized prompts like memory instructions).
146
158
 
@@ -156,7 +168,7 @@ Extracts all system messages from the run input, including both standard system
156
168
 
157
169
  **Returns:** `string[]` - Array of system message strings.
158
170
 
159
- ### getCombinedSystemPrompt
171
+ ### `getCombinedSystemPrompt`
160
172
 
161
173
  Combines all system messages into a single prompt string, joined with double newlines.
162
174
 
@@ -169,31 +181,31 @@ Combines all system messages into a single prompt string, joined with double new
169
181
 
170
182
  **Returns:** `string` - Combined system prompt string.
171
183
 
172
- ## Tool Call Extraction
184
+ ## Tool call extraction
173
185
 
174
- ### extractToolCalls
186
+ ### `extractToolCalls`
175
187
 
176
188
  Extracts information about all tool calls from the run output, including tool names, call IDs, and their positions in the message array.
177
189
 
178
190
  ```typescript
179
191
  const toolUsageScorer = createScorer({
180
- id: "tool-usage",
181
- description: "Evaluates tool usage patterns",
182
- type: "agent",
192
+ id: 'tool-usage',
193
+ description: 'Evaluates tool usage patterns',
194
+ type: 'agent',
183
195
  })
184
196
  .preprocess(({ run }) => {
185
- const { tools, toolCallInfos } = extractToolCalls(run.output);
197
+ const { tools, toolCallInfos } = extractToolCalls(run.output)
186
198
  return {
187
199
  toolsUsed: tools,
188
200
  toolCount: tools.length,
189
201
  toolDetails: toolCallInfos,
190
- };
202
+ }
191
203
  })
192
204
  .generateScore(({ results }) => {
193
- const { toolCount } = results.preprocessStepResult || {};
205
+ const { toolCount } = results.preprocessStepResult || {}
194
206
  // Score based on appropriate tool usage
195
- return toolCount > 0 ? 1 : 0;
196
- });
207
+ return toolCount > 0 ? 1 : 0
208
+ })
197
209
  ```
198
210
 
199
211
  **Returns:**
@@ -209,94 +221,262 @@ Where `ToolCallInfo` is:
209
221
 
210
222
  ```typescript
211
223
  type ToolCallInfo = {
212
- toolName: string; // Name of the tool
213
- toolCallId: string; // Unique call identifier
214
- messageIndex: number; // Index in the output array
215
- invocationIndex: number; // Index within message's tool invocations
216
- };
224
+ toolName: string // Name of the tool
225
+ toolCallId: string // Unique call identifier
226
+ messageIndex: number // Index in the output array
227
+ invocationIndex: number // Index within message's tool invocations
228
+ }
217
229
  ```
218
230
 
219
- ## Test Utilities
231
+ ## Test utilities
220
232
 
221
233
  These utilities help create test data for scorer development.
222
234
 
223
- ### createTestMessage
235
+ ### `createTestMessage`
224
236
 
225
237
  Creates a `MastraDBMessage` object for testing purposes.
226
238
 
227
239
  ```typescript
228
- import { createTestMessage } from "@mastra/evals/scorers/utils";
240
+ import { createTestMessage } from '@mastra/evals/scorers/utils'
229
241
 
230
242
  const userMessage = createTestMessage({
231
- content: "What is the weather?",
232
- role: "user",
233
- });
243
+ content: 'What is the weather?',
244
+ role: 'user',
245
+ })
234
246
 
235
247
  const assistantMessage = createTestMessage({
236
- content: "The weather is sunny.",
237
- role: "assistant",
248
+ content: 'The weather is sunny.',
249
+ role: 'assistant',
238
250
  toolInvocations: [
239
251
  {
240
- toolCallId: "call-1",
241
- toolName: "weatherTool",
242
- args: { location: "London" },
252
+ toolCallId: 'call-1',
253
+ toolName: 'weatherTool',
254
+ args: { location: 'London' },
243
255
  result: { temp: 20 },
244
- state: "result",
256
+ state: 'result',
245
257
  },
246
258
  ],
247
- });
259
+ })
248
260
  ```
249
261
 
250
- ### createAgentTestRun
262
+ ### `createAgentTestRun`
251
263
 
252
264
  Creates a complete test run object for testing scorers.
253
265
 
254
266
  ```typescript
255
- import { createAgentTestRun, createTestMessage } from "@mastra/evals/scorers/utils";
267
+ import { createAgentTestRun, createTestMessage } from '@mastra/evals/scorers/utils'
256
268
 
257
269
  const testRun = createAgentTestRun({
258
- inputMessages: [
259
- createTestMessage({ content: "Hello", role: "user" }),
260
- ],
261
- output: [
262
- createTestMessage({ content: "Hi there!", role: "assistant" }),
263
- ],
264
- });
270
+ inputMessages: [createTestMessage({ content: 'Hello', role: 'user' })],
271
+ output: [createTestMessage({ content: 'Hi there!', role: 'assistant' })],
272
+ })
265
273
 
266
274
  // Run your scorer with the test data
267
275
  const result = await myScorer.run({
268
276
  input: testRun.input,
269
277
  output: testRun.output,
270
- });
278
+ })
279
+ ```
280
+
281
+ ## Trajectory utilities
282
+
283
+ ### `extractTrajectory`
284
+
285
+ Extracts a `Trajectory` from agent output messages (`MastraDBMessage[]`), converting tool invocations into `ToolCallStep` objects. The `runEvals` pipeline calls this automatically for trajectory scorers, so you only need to call it yourself when testing a scorer directly.
286
+
287
+ Available from `@mastra/core/evals`.
288
+
289
+ ```typescript
290
+ import { extractTrajectory } from '@mastra/core/evals'
291
+
292
+ const trajectory = extractTrajectory(agentOutputMessages)
293
+ // trajectory.steps — ToolCallStep[] extracted from toolInvocations
294
+ // trajectory.rawOutput — the original MastraDBMessage[] array
295
+ ```
296
+
297
+ **Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawOutput`.
298
+
299
+ ### `extractWorkflowTrajectory`
300
+
301
+ Extracts a `Trajectory` from workflow step results. Converts `StepResult` records into `WorkflowStepStep` objects, respecting the execution path ordering.
302
+
303
+ Available from `@mastra/core/evals`.
304
+
305
+ ```typescript
306
+ import { extractWorkflowTrajectory } from '@mastra/core/evals'
307
+
308
+ const trajectory = extractWorkflowTrajectory(
309
+ workflowResult.steps, // Record<string, StepResult>
310
+ workflowResult.stepExecutionPath, // string[] (optional)
311
+ )
312
+ // trajectory.steps — WorkflowStepStep[] in execution order
271
313
  ```
272
314
 
273
- ## Complete Example
315
+ **Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawWorkflowResult`.
316
+
317
+ ### `extractTrajectoryFromTrace`
318
+
319
+ Builds a hierarchical `Trajectory` from observability trace spans (`SpanRecord[]`). Reconstructs the parent-child span tree and maps each span to the appropriate `TrajectoryStep` discriminated union type with nested `children`.
320
+
321
+ This is the preferred extraction method when storage is available. The `runEvals` pipeline calls this automatically when the target's `Mastra` instance has a configured storage backend. It produces richer trajectories than `extractTrajectory` or `extractWorkflowTrajectory` because it captures the full execution tree, including nested agent runs, tool calls, and model generations.
322
+
323
+ Available from `@mastra/core/evals`.
324
+
325
+ ```typescript
326
+ import { extractTrajectoryFromTrace } from '@mastra/core/evals'
327
+
328
+ // After fetching a trace from the observability store
329
+ const traceData = await observabilityStore.getTrace({ traceId })
330
+ const trajectory = extractTrajectoryFromTrace(traceData.spans, rootSpanId)
331
+ // trajectory.steps — hierarchical TrajectoryStep[] with children
332
+ ```
333
+
334
+ **Parameters:**
335
+
336
+ - `spans` (`SpanRecord[]`) — Array of span records from a trace query.
337
+ - `rootSpanId` (`string`, optional) — Span ID to use as the starting point. When omitted, uses spans with no parent.
338
+
339
+ **Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]` with recursive `children` and `totalDurationMs`.
340
+
341
+ #### Span type mapping
342
+
343
+ | Span type | Trajectory step type | Key fields extracted |
344
+ | ---------------------- | ---------------------- | ------------------------------------------------------------- |
345
+ | `TOOL_CALL` | `tool_call` | `toolArgs`, `toolResult`, `success` |
346
+ | `MCP_TOOL_CALL` | `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
347
+ | `MODEL_GENERATION` | `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
348
+ | `AGENT_RUN` | `agent_run` | `agentId` (from entity ID) |
349
+ | `WORKFLOW_RUN` | `workflow_run` | `workflowId` (from entity ID) |
350
+ | `WORKFLOW_STEP` | `workflow_step` | `output` |
351
+ | `WORKFLOW_CONDITIONAL` | `workflow_conditional` | `conditionCount`, `selectedSteps` |
352
+ | `WORKFLOW_PARALLEL` | `workflow_parallel` | `branchCount`, `parallelSteps` |
353
+ | `WORKFLOW_LOOP` | `workflow_loop` | `loopType`, `totalIterations` |
354
+ | `WORKFLOW_SLEEP` | `workflow_sleep` | `sleepDurationMs`, `sleepType` |
355
+ | `WORKFLOW_WAIT_EVENT` | `workflow_wait_event` | `eventName`, `eventReceived` |
356
+ | `PROCESSOR_RUN` | `processor_run` | `processorId` |
357
+
358
+ Spans with types `GENERIC`, `MODEL_STEP`, `MODEL_CHUNK`, and `WORKFLOW_CONDITIONAL_EVAL` are skipped as noise.
359
+
360
+ ### `compareTrajectories`
361
+
362
+ Compares an actual trajectory against an expected trajectory and returns a detailed comparison result. Used internally by `createTrajectoryAccuracyScorerCode`.
363
+
364
+ The `expected` parameter accepts either a full `Trajectory` (the same shape as an actual trajectory) or `{ steps: ExpectedStep[] }`. When using `ExpectedStep[]`, you can match by name only, by name and `stepType`, or include step data for comparison. See [Expected steps](https://mastra.ai/reference/evals/trajectory-accuracy) for details.
365
+
366
+ ```typescript
367
+ import { compareTrajectories } from '@mastra/evals/scorers/utils'
368
+
369
+ // Using ExpectedStep[] (recommended for expectations)
370
+ const result = compareTrajectories(
371
+ actualTrajectory,
372
+ { steps: [{ name: 'search' }, { name: 'summarize', stepType: 'tool_call' }] },
373
+ { compareStepData: false, allowRepeatedSteps: true },
374
+ )
375
+ // result.score — 0.0 to 1.0
376
+ // result.missingSteps — step names not found
377
+ // result.extraSteps — unexpected step names
378
+ // result.outOfOrderSteps — steps found but in wrong order
379
+ ```
380
+
381
+ **Returns:** `TrajectoryComparisonResult`
382
+
383
+ ### `createTrajectoryTestRun`
384
+
385
+ Creates a test run object for trajectory scorers. Wraps a `Trajectory` into the expected `ScorerRun` format.
386
+
387
+ ```typescript
388
+ import { createTrajectoryTestRun } from '@mastra/evals/scorers/utils'
389
+
390
+ const run = createTrajectoryTestRun({
391
+ steps: [
392
+ { stepType: 'tool_call', name: 'search', toolArgs: { q: 'test' } },
393
+ { stepType: 'tool_call', name: 'summarize' },
394
+ ],
395
+ })
396
+
397
+ const result = await trajectoryScorer.run(run)
398
+ ```
399
+
400
+ ### `checkTrajectoryEfficiency`
401
+
402
+ Evaluates trajectory efficiency against step, token, and duration budgets. It also detects redundant calls (the same tool called with the same arguments).
403
+
404
+ ```typescript
405
+ import { checkTrajectoryEfficiency } from '@mastra/evals/scorers/utils'
406
+
407
+ const result = checkTrajectoryEfficiency(trajectory, {
408
+ maxSteps: 5,
409
+ maxTotalTokens: 2000,
410
+ maxTotalDurationMs: 5000,
411
+ noRedundantCalls: true,
412
+ })
413
+ // result.score — 1.0 if within all budgets, lower with penalties
414
+ // result.redundantCalls — duplicate tool+args combos
415
+ // result.overBudget — which budgets were exceeded
416
+ ```
417
+
418
+ **Returns:** `TrajectoryEfficiencyResult`
419
+
420
+ ### `checkTrajectoryBlacklist`
421
+
422
+ Checks whether a trajectory contains forbidden tools or tool call sequences.
423
+
424
+ ```typescript
425
+ import { checkTrajectoryBlacklist } from '@mastra/evals/scorers/utils'
426
+
427
+ const result = checkTrajectoryBlacklist(trajectory, {
428
+ blacklistedTools: ['deleteAll', 'admin-override'],
429
+ blacklistedSequences: [['escalate', 'admin-override']],
430
+ })
431
+ // result.passed — true if no violations
432
+ // result.violations — list of violations with type and details
433
+ ```
434
+
435
+ **Returns:** `TrajectoryBlacklistResult`
436
+
437
+ ### `analyzeToolFailures`
438
+
439
+ Detects tool failure patterns including retries, fallbacks, and argument corrections.
440
+
441
+ ```typescript
442
+ import { analyzeToolFailures } from '@mastra/evals/scorers/utils'
443
+
444
+ const result = analyzeToolFailures(trajectory, {
445
+ maxRetriesPerTool: 3,
446
+ })
447
+ // result.score — 1.0 if no failure patterns, lower if patterns detected
448
+ // result.patterns — detected patterns (retry, fallback, arg_correction)
449
+ ```
450
+
451
+ **Returns:** `ToolFailureAnalysisResult`
452
+
453
+ ## Complete example
274
454
 
275
455
  Here's a complete example showing how to use multiple utilities together:
276
456
 
277
457
  ```typescript
278
- import { createScorer } from "@mastra/core/evals";
458
+ import { createScorer } from '@mastra/core/evals'
279
459
  import {
280
460
  getAssistantMessageFromRunOutput,
281
461
  getReasoningFromRunOutput,
282
462
  getUserMessageFromRunInput,
283
463
  getCombinedSystemPrompt,
284
464
  extractToolCalls,
285
- } from "@mastra/evals/scorers/utils";
465
+ } from '@mastra/evals/scorers/utils'
286
466
 
287
467
  const comprehensiveScorer = createScorer({
288
- id: "comprehensive-analysis",
289
- name: "Comprehensive Analysis",
290
- description: "Analyzes all aspects of an agent response",
291
- type: "agent",
468
+ id: 'comprehensive-analysis',
469
+ name: 'Comprehensive Analysis',
470
+ description: 'Analyzes all aspects of an agent response',
471
+ type: 'agent',
292
472
  })
293
473
  .preprocess(({ run }) => {
294
474
  // Extract all relevant data
295
- const userMessage = getUserMessageFromRunInput(run.input);
296
- const response = getAssistantMessageFromRunOutput(run.output);
297
- const reasoning = getReasoningFromRunOutput(run.output);
298
- const systemPrompt = getCombinedSystemPrompt(run.input);
299
- const { tools, toolCallInfos } = extractToolCalls(run.output);
475
+ const userMessage = getUserMessageFromRunInput(run.input)
476
+ const response = getAssistantMessageFromRunOutput(run.output)
477
+ const reasoning = getReasoningFromRunOutput(run.output)
478
+ const systemPrompt = getCombinedSystemPrompt(run.input)
479
+ const { tools, toolCallInfos } = extractToolCalls(run.output)
300
480
 
301
481
  return {
302
482
  userMessage,
@@ -305,26 +485,26 @@ const comprehensiveScorer = createScorer({
305
485
  systemPrompt,
306
486
  toolsUsed: tools,
307
487
  toolCount: tools.length,
308
- };
488
+ }
309
489
  })
310
490
  .generateScore(({ results }) => {
311
- const { response, reasoning, toolCount } = results.preprocessStepResult || {};
491
+ const { response, reasoning, toolCount } = results.preprocessStepResult || {}
312
492
 
313
- let score = 0;
314
- if (response && response.length > 0) score += 0.4;
315
- if (reasoning) score += 0.3;
316
- if (toolCount > 0) score += 0.3;
493
+ let score = 0
494
+ if (response && response.length > 0) score += 0.4
495
+ if (reasoning) score += 0.3
496
+ if (toolCount > 0) score += 0.3
317
497
 
318
- return score;
498
+ return score
319
499
  })
320
500
  .generateReason(({ results, score }) => {
321
- const { response, reasoning, toolCount } = results.preprocessStepResult || {};
501
+ const { response, reasoning, toolCount } = results.preprocessStepResult || {}
322
502
 
323
- const parts = [];
324
- if (response) parts.push("provided a response");
325
- if (reasoning) parts.push("included reasoning");
326
- if (toolCount > 0) parts.push(`used ${toolCount} tool(s)`);
503
+ const parts = []
504
+ if (response) parts.push('provided a response')
505
+ if (reasoning) parts.push('included reasoning')
506
+ if (toolCount > 0) parts.push(`used ${toolCount} tool(s)`)
327
507
 
328
- return `Score: ${score}. The agent ${parts.join(", ")}.`;
329
- });
508
+ return `Score: ${score}. The agent ${parts.join(', ')}.`
509
+ })
330
510
  ```
@@ -1,20 +1,20 @@
1
- # Textual Difference Scorer
1
+ # Textual difference scorer
2
2
 
3
3
  The `createTextualDifferenceScorer()` function uses sequence matching to measure the textual differences between two strings. It provides detailed information about changes, including the number of operations needed to transform one text into another.
4
4
 
5
5
  ## Parameters
6
6
 
7
- The `createTextualDifferenceScorer()` function does not take any options.
7
+ The `createTextualDifferenceScorer()` function doesn't take any options.
8
8
 
9
9
  This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
10
10
 
11
- ## .run() Returns
11
+ ## `.run()` returns
12
12
 
13
- **runId:** (`string`): The id of the run (optional).
13
+ **runId** (`string`): The id of the run (optional).
14
14
 
15
- **analyzeStepResult:** (`object`): Object with difference metrics: { confidence: number, changes: number, lengthDiff: number }
15
+ **analyzeStepResult** (`object`): Object with difference metrics: { confidence: number, changes: number, lengthDiff: number }
16
16
 
17
- **score:** (`number`): Similarity ratio (0-1) where 1 indicates identical texts.
17
+ **score** (`number`): Similarity ratio (0-1) where 1 indicates identical texts.
18
18
 
19
19
  `.run()` returns a result in the following shape:
20
20
 
@@ -31,7 +31,7 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
31
31
  }
32
32
  ```
33
33
 
34
- ## Scoring Details
34
+ ## Scoring details
35
35
 
36
36
  The scorer calculates several measures:
37
37
 
@@ -71,22 +71,22 @@ A textual difference score between 0 and 1:
71
71
  Measure textual differences between expected and actual agent outputs:
72
72
 
73
73
  ```typescript
74
- import { runEvals } from "@mastra/core/evals";
75
- import { createTextualDifferenceScorer } from "@mastra/evals/scorers/prebuilt";
76
- import { myAgent } from "./agent";
74
+ import { runEvals } from '@mastra/core/evals'
75
+ import { createTextualDifferenceScorer } from '@mastra/evals/scorers/prebuilt'
76
+ import { myAgent } from './agent'
77
77
 
78
- const scorer = createTextualDifferenceScorer();
78
+ const scorer = createTextualDifferenceScorer()
79
79
 
80
80
  const result = await runEvals({
81
81
  data: [
82
82
  {
83
- input: "Summarize the concept of recursion",
83
+ input: 'Summarize the concept of recursion',
84
84
  groundTruth:
85
- "Recursion is when a function calls itself to solve a problem by breaking it into smaller subproblems.",
85
+ 'Recursion is when a function calls itself to solve a problem by breaking it into smaller subproblems.',
86
86
  },
87
87
  {
88
- input: "What is the capital of France?",
89
- groundTruth: "The capital of France is Paris.",
88
+ input: 'What is the capital of France?',
89
+ groundTruth: 'The capital of France is Paris.',
90
90
  },
91
91
  ],
92
92
  scorers: [scorer],
@@ -95,11 +95,11 @@ const result = await runEvals({
95
95
  console.log({
96
96
  score: scorerResults[scorer.id].score,
97
97
  groundTruth: scorerResults[scorer.id].groundTruth,
98
- });
98
+ })
99
99
  },
100
- });
100
+ })
101
101
 
102
- console.log(result.scores);
102
+ console.log(result.scores)
103
103
  ```
104
104
 
105
105
  For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).