@mastra/evals 1.1.2 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/CHANGELOG.md +50 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,4 +1,4 @@
1
- # Toxicity Scorer
1
+ # Toxicity scorer
2
2
 
3
3
  The `createToxicityScorer()` function evaluates whether an LLM's output contains racist, biased, or toxic elements. It uses a judge-based system to analyze responses for various forms of toxicity including personal attacks, mockery, hate speech, dismissive statements, and threats.
4
4
 
@@ -6,25 +6,25 @@ The `createToxicityScorer()` function evaluates whether an LLM's output contains
6
6
 
7
7
  The `createToxicityScorer()` function accepts a single options object with the following properties:
8
8
 
9
- **model:** (`LanguageModel`): Configuration for the model used to evaluate toxicity.
9
+ **model** (`LanguageModel`): Configuration for the model used to evaluate toxicity.
10
10
 
11
- **scale:** (`number`): Maximum score value (default is 1). (Default: `1`)
11
+ **scale** (`number`): Maximum score value. (Default: `1`)
12
12
 
13
13
  This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
14
14
 
15
- ## .run() Returns
15
+ ## `.run()` returns
16
16
 
17
- **runId:** (`string`): The id of the run (optional).
17
+ **runId** (`string`): The ID of the run (optional).
18
18
 
19
- **analyzeStepResult:** (`object`): Object with verdicts: { verdicts: Array<{ verdict: 'yes' | 'no', reason: string }> }
19
+ **analyzeStepResult** (`object`): Object with verdicts: `{ verdicts: Array<{ verdict: 'yes' | 'no', reason: string }> }`
20
20
 
21
- **analyzePrompt:** (`string`): The prompt sent to the LLM for the analyze step (optional).
21
+ **analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).
22
22
 
23
- **score:** (`number`): Toxicity score (0 to scale, default 0-1).
23
+ **score** (`number`): Toxicity score (0 to scale, default 0-1).
24
24
 
25
- **reason:** (`string`): Detailed explanation of the toxicity assessment.
25
+ **reason** (`string`): Detailed explanation of the toxicity assessment.
26
26
 
27
- **generateReasonPrompt:** (`string`): The prompt sent to the LLM for the generateReason step (optional).
27
+ **generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).
28
28
 
29
29
  `.run()` returns a result in the following shape:
30
30
 
@@ -41,7 +41,7 @@ This function returns an instance of the MastraScorer class. The `.run()` method
41
41
  }
42
42
  ```
43
43
 
44
- ## Scoring Details
44
+ ## Scoring details
45
45
 
46
46
  The scorer evaluates toxicity through multiple aspects:
47
47
 
@@ -82,22 +82,22 @@ A toxicity score between 0 and 1:
82
82
  Evaluate agent responses for toxic, biased, or harmful content:
83
83
 
84
84
  ```typescript
85
- import { runEvals } from "@mastra/core/evals";
86
- import { createToxicityScorer } from "@mastra/evals/scorers/prebuilt";
87
- import { myAgent } from "./agent";
85
+ import { runEvals } from '@mastra/core/evals'
86
+ import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
87
+ import { myAgent } from './agent'
88
88
 
89
- const scorer = createToxicityScorer({ model: "openai/gpt-4o" });
89
+ const scorer = createToxicityScorer({ model: 'openai/gpt-5.4' })
90
90
 
91
91
  const result = await runEvals({
92
92
  data: [
93
93
  {
94
- input: "What do you think about the new team member?",
94
+ input: 'What do you think about the new team member?',
95
95
  },
96
96
  {
97
- input: "How was the meeting discussion?",
97
+ input: 'How was the meeting discussion?',
98
98
  },
99
99
  {
100
- input: "Can you provide feedback on the project proposal?",
100
+ input: 'Can you provide feedback on the project proposal?',
101
101
  },
102
102
  ],
103
103
  scorers: [scorer],
@@ -106,11 +106,11 @@ const result = await runEvals({
106
106
  console.log({
107
107
  score: scorerResults[scorer.id].score,
108
108
  reason: scorerResults[scorer.id].reason,
109
- });
109
+ })
110
110
  },
111
- });
111
+ })
112
112
 
113
- console.log(result.scores);
113
+ console.log(result.scores)
114
114
  ```
115
115
 
116
116
  For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
@@ -0,0 +1,613 @@
1
+ # Trajectory accuracy scorers
2
+
3
+ Mastra provides two trajectory accuracy scorers for evaluating whether an agent or workflow follows an expected sequence of actions:
4
+
5
+ 1. **Code-based scorer** - Deterministic evaluation using exact step matching and ordering
6
+ 2. **LLM-based scorer** - Semantic evaluation using AI to assess trajectory quality and appropriateness
7
+
8
+ Both scorers work with agents and workflows. The `runEvals` pipeline automatically extracts trajectories, so scorers receive a `Trajectory` object directly.
9
+
10
+ ## Trajectory extraction
11
+
12
+ The `runEvals` pipeline uses two extraction strategies, depending on whether observability storage is configured:
13
+
14
+ ### Trace-based extraction (preferred)
15
+
16
+ When the target's `Mastra` instance has storage configured, the pipeline fetches the full execution trace from the observability store and calls `extractTrajectoryFromTrace()`. This produces a hierarchical trajectory with nested `children`, capturing the complete execution tree — including nested agent runs, tool calls within workflow steps, and model generations.
17
+
18
+ For example, a workflow that calls an agent, which in turn calls tools, produces:
19
+
20
+ ```text
21
+ workflow_run
22
+ └─ workflow_step (validate-input)
23
+ └─ workflow_step (process-data)
24
+ └─ agent_run (my-agent)
25
+ └─ model_generation
26
+ └─ tool_call (search)
27
+ └─ model_generation
28
+ └─ tool_call (summarize)
29
+ └─ workflow_step (save-result)
30
+ ```
31
+
32
+ ### Fallback extraction
33
+
34
+ When storage is not available, the pipeline falls back to:
35
+
36
+ - **Agents:** `extractTrajectory()` — Extracts `ToolCallStep` entries from `toolInvocations` in the agent's message output. Produces a flat list of tool calls.
37
+ - **Workflows:** `extractWorkflowTrajectory()` — Extracts `WorkflowStepStep` entries from `stepResults`. Produces a flat list of workflow steps.
38
+
39
+ These fallbacks don't capture nested execution or non-tool-call spans.
40
+
41
+ ## Trajectory types
42
+
43
+ Trajectory steps use a discriminated union on `stepType`. Each step type has specific properties:
44
+
45
+ ### `ToolCallStep`
46
+
47
+ Represents an agent tool call.
48
+
49
+ **stepType** (`'tool_call'`): Discriminant.
50
+
51
+ **name** (`string`): Tool name.
52
+
53
+ **toolArgs** (`Record<string, unknown>`): Arguments passed to the tool.
54
+
55
+ **toolResult** (`Record<string, unknown>`): Result returned by the tool.
56
+
57
+ **success** (`boolean`): Whether the call succeeded.
58
+
59
+ **durationMs** (`number`): Execution time in milliseconds.
60
+
61
+ **metadata** (`Record<string, unknown>`): Arbitrary metadata.
62
+
63
+ **children** (`TrajectoryStep[]`): Nested sub-steps.
64
+
65
+ ### `WorkflowStepStep`
66
+
67
+ Represents a workflow step execution.
68
+
69
+ **stepType** (`'workflow_step'`): Discriminant.
70
+
71
+ **name** (`string`): Step identifier.
72
+
73
+ **stepId** (`string`): Step ID in the workflow.
74
+
75
+ **status** (`string`): Step result status (success, failed, suspended, etc.).
76
+
77
+ **output** (`Record<string, unknown>`): Step output data.
78
+
79
+ **durationMs** (`number`): Execution time in milliseconds.
80
+
81
+ **metadata** (`Record<string, unknown>`): Arbitrary metadata.
82
+
83
+ **children** (`TrajectoryStep[]`): Nested sub-steps (e.g. tool calls inside the step).
84
+
85
+ ### Other step types
86
+
87
+ The discriminated union includes these additional step types:
88
+
89
+ | Step type | Key properties |
90
+ | ---------------------- | ------------------------------------------------------------- |
91
+ | `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
92
+ | `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
93
+ | `agent_run` | `agentId` |
94
+ | `workflow_run` | `workflowId`, `status` |
95
+ | `workflow_conditional` | `conditionCount`, `selectedSteps` |
96
+ | `workflow_parallel` | `branchCount`, `parallelSteps` |
97
+ | `workflow_loop` | `loopType`, `totalIterations` |
98
+ | `workflow_sleep` | `durationMs`, `sleepType` |
99
+ | `workflow_wait_event` | `eventName`, `eventReceived` |
100
+ | `processor_run` | `processorId` |
101
+
102
+ All step types share the base properties `name`, `durationMs`, `metadata`, and `children`.
103
+
104
+ ## Expected steps
105
+
106
+ When defining expected trajectories, use `ExpectedStep` instead of the full `TrajectoryStep` discriminated union. `ExpectedStep` is a simpler type designed for expectations:
107
+
108
+ **name** (`string`): Step name to match (tool name, agent ID, workflow step name, etc.).
109
+
110
+ **stepType** (`TrajectoryStepType`): Step type to match. If omitted, matches any step type with the given name.
111
+
112
+ **data** (`Record<string, unknown>`): Expected step data. Compared against the actual step's type-specific data (`toolArgs` for `tool_call`, `output` for `workflow_step`, etc.).
113
+
114
+ **children** (`TrajectoryExpectation`): Nested expectation config for this step's children. Overrides the parent config for evaluating children of this step.
115
+
116
+ ### Simple expected steps
117
+
118
+ ```typescript
119
+ const steps: ExpectedStep[] = [
120
+ // Match by name only (any step type)
121
+ { name: 'search' },
122
+
123
+ // Match by name and step type
124
+ { name: 'search', stepType: 'tool_call' },
125
+
126
+ // Match with expected data
127
+ { name: 'search', stepType: 'tool_call', data: { input: { query: 'weather' } } },
128
+ ]
129
+ ```
130
+
131
+ ### Nested expectations
132
+
133
+ Each expected step can include a `children` config with its own evaluation rules. This lets you set different ordering or comparison rules at each level of the hierarchy.
134
+
135
+ ```typescript
136
+ const scorer = createTrajectoryScorerCode({
137
+ defaults: {
138
+ ordering: 'strict',
139
+ steps: [
140
+ { name: 'validate-input', stepType: 'workflow_step' },
141
+ {
142
+ name: 'research-agent',
143
+ stepType: 'agent_run',
144
+ children: {
145
+ // Sub-agent can call tools in any order
146
+ ordering: 'unordered',
147
+ steps: [
148
+ { name: 'search', stepType: 'tool_call' },
149
+ { name: 'summarize', stepType: 'tool_call' },
150
+ ],
151
+ },
152
+ },
153
+ { name: 'save-result', stepType: 'workflow_step' },
154
+ ],
155
+ },
156
+ })
157
+ ```
158
+
159
+ In this example, the parent workflow requires strict ordering of its steps, but the nested `research-agent` allows its tool calls in any order.
160
+
161
+ ## Choosing between scorers
162
+
163
+ ### Use the code-based scorer when:
164
+
165
+ - You need **deterministic, reproducible** results
166
+ - You have a **known expected trajectory** to compare against
167
+ - You want to validate **exact step sequences**
168
+ - Speed and cost are priorities (no LLM calls)
169
+ - You are running automated tests in CI/CD
170
+
171
+ ### Use the LLM-based scorer when:
172
+
173
+ - You need **semantic understanding** of whether steps were appropriate
174
+ - The optimal trajectory is **not predetermined** (evaluate based on task requirements)
175
+ - You want to detect **unnecessary, redundant, or missing** steps
176
+ - You need **explanations** for scoring decisions
177
+ - You are evaluating **production agent behavior**
178
+
179
+ ## Code-based trajectory accuracy scorer
180
+
181
+ The `createTrajectoryAccuracyScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides deterministic scoring based on step matching and ordering against an expected trajectory.
182
+
183
+ ### Parameters
184
+
185
+ **expectedTrajectory** (`TrajectoryExpectation`): Static expected trajectory to compare against. When provided, all dataset items use this trajectory. When omitted, the scorer reads `expectedTrajectory` from each dataset item at runtime.
186
+
187
+ **comparisonOptions** (`TrajectoryComparisonOptions`): Controls how the comparison is performed.
188
+
189
+ This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
190
+
191
+ ### Expected trajectory sources
192
+
193
+ The code-based scorer resolves `expectedTrajectory` from two sources, in order of priority:
194
+
195
+ 1. **Constructor option** — A static trajectory passed when creating the scorer. Used for all dataset items.
196
+ 2. **Dataset item** — An `expectedTrajectory` field on the dataset item, passed through the `runEvals` pipeline. Allows different expected trajectories per item.
197
+
198
+ ```typescript
199
+ // Static: same expected trajectory for all items
200
+ const scorer = createTrajectoryAccuracyScorerCode({
201
+ expectedTrajectory: {
202
+ steps: [
203
+ { stepType: 'tool_call', name: 'search' },
204
+ { stepType: 'tool_call', name: 'summarize' },
205
+ ],
206
+ },
207
+ })
208
+ ```
209
+
210
+ ```typescript
211
+ // Per-item: each dataset item has its own expectedTrajectory
212
+ const scorer = createTrajectoryAccuracyScorerCode()
213
+
214
+ await runEvals({
215
+ target: myAgent,
216
+ scorers: { trajectory: [scorer] },
217
+ data: [
218
+ {
219
+ input: 'Search and summarize weather',
220
+ expectedTrajectory: {
221
+ steps: [
222
+ { stepType: 'tool_call', name: 'search' },
223
+ { stepType: 'tool_call', name: 'summarize' },
224
+ ],
225
+ },
226
+ },
227
+ {
228
+ input: 'Just search for weather',
229
+ expectedTrajectory: {
230
+ steps: [{ stepType: 'tool_call', name: 'search' }],
231
+ },
232
+ },
233
+ ],
234
+ })
235
+ ```
236
+
237
+ ### Evaluation modes
238
+
239
+ The code-based scorer operates in one of two modes, selected by the `strictOrder` setting in `comparisonOptions`:
240
+
241
+ #### Strict mode (`strictOrder: true`)
242
+
243
+ Requires an exact match. The actual steps must match the expected steps in the same order with no extra or missing steps. Returns `1.0` for an exact match and `0.0` otherwise.
244
+
245
+ #### Relaxed mode (`strictOrder: false`, default)
246
+
247
+ Allows extra steps. Expected steps must appear in the correct relative order. The score is calculated based on how many expected steps were matched, with optional penalties for extra or repeated steps.
248
+
249
+ ## Code-based scoring details
250
+
251
+ - **Continuous scores**: Returns values between 0.0 and 1.0 in relaxed mode; binary (0 or 1) in strict mode
252
+ - **Deterministic**: Same input always produces the same output
253
+ - **Fast**: No external API calls
254
+
255
+ ### Code-based scorer results
256
+
257
+ ```typescript
258
+ {
259
+ runId: string,
260
+ preprocessStepResult: {
261
+ actualTrajectory: Trajectory,
262
+ expectedTrajectory: Trajectory,
263
+ comparison: {
264
+ score: number,
265
+ matchedSteps: number,
266
+ totalExpectedSteps: number,
267
+ totalActualSteps: number,
268
+ missingSteps: string[],
269
+ extraSteps: string[],
270
+ outOfOrderSteps: string[],
271
+ repeatedSteps: string[]
272
+ },
273
+ actualStepNames: string[],
274
+ expectedStepNames: string[]
275
+ },
276
+ score: number
277
+ }
278
+ ```
279
+
280
+ ## Code-based scorer examples
281
+
282
+ ### Agent trajectory with strict ordering
283
+
284
+ Validates that an agent follows an exact sequence of tool calls:
285
+
286
+ ```typescript
287
+ import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
288
+ import { runEvals } from '@mastra/core/evals'
289
+
290
+ const scorer = createTrajectoryAccuracyScorerCode({
291
+ expectedTrajectory: {
292
+ steps: [
293
+ { stepType: 'tool_call', name: 'auth-tool' },
294
+ { stepType: 'tool_call', name: 'fetch-tool' },
295
+ ],
296
+ },
297
+ comparisonOptions: { strictOrder: true },
298
+ })
299
+
300
+ const result = await runEvals({
301
+ target: myAgent,
302
+ scorers: { trajectory: [scorer] },
303
+ data: [{ input: 'Get my data' }],
304
+ })
305
+
306
+ console.log(result.scores.trajectory['trajectory-accuracy']) // 1.0
307
+ ```
308
+
309
+ ### Agent trajectory with relaxed ordering
310
+
311
+ Allows extra steps as long as expected steps appear in the correct relative order:
312
+
313
+ ```typescript
314
+ const scorer = createTrajectoryAccuracyScorerCode({
315
+ expectedTrajectory: {
316
+ steps: [
317
+ { stepType: 'tool_call', name: 'search-tool' },
318
+ { stepType: 'tool_call', name: 'summarize-tool' },
319
+ ],
320
+ },
321
+ comparisonOptions: { strictOrder: false },
322
+ })
323
+
324
+ // Agent called search-tool → log-tool → summarize-tool
325
+ // The extra log-tool is allowed in relaxed mode
326
+ // score: 0.75 — all expected steps matched, small penalty for extra step
327
+ ```
328
+
329
+ ### Workflow trajectory
330
+
331
+ Evaluates a workflow's execution path:
332
+
333
+ ```typescript
334
+ import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
335
+ import { runEvals } from '@mastra/core/evals'
336
+
337
+ const scorer = createTrajectoryAccuracyScorerCode({
338
+ expectedTrajectory: {
339
+ steps: [
340
+ { stepType: 'workflow_step', name: 'validate-input' },
341
+ { stepType: 'workflow_step', name: 'process-data' },
342
+ { stepType: 'workflow_step', name: 'save-result' },
343
+ ],
344
+ },
345
+ })
346
+
347
+ const result = await runEvals({
348
+ target: myWorkflow,
349
+ scorers: { trajectory: [scorer] },
350
+ data: [{ input: { data: 'test' } }],
351
+ })
352
+
353
+ console.log(result.scores.trajectory['trajectory-accuracy'])
354
+ ```
355
+
356
+ ### Comparing step data
357
+
358
+ Validates not just the step names but also step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
359
+
360
+ ```typescript
361
+ const scorer = createTrajectoryAccuracyScorerCode({
362
+ expectedTrajectory: {
363
+ steps: [
364
+ {
365
+ stepType: 'tool_call',
366
+ name: 'search-tool',
367
+ toolArgs: { query: 'weather in NYC' },
368
+ },
369
+ ],
370
+ },
371
+ comparisonOptions: { compareStepData: true },
372
+ })
373
+ ```
374
+
375
+ ## LLM-based trajectory accuracy scorer
376
+
377
+ The `createTrajectoryAccuracyScorerLLM()` function from `@mastra/evals/scorers/prebuilt` uses an LLM to evaluate whether an agent's or workflow's trajectory was appropriate, efficient, and complete.
378
+
379
+ ### Parameters
380
+
381
+ **model** (`MastraModelConfig`): The LLM model to use for evaluating trajectory quality.
382
+
383
+ **expectedTrajectory** (`TrajectoryExpectation`): Optional static expected trajectory to compare against. When omitted, the LLM evaluates the trajectory based on the task requirements alone. An expected trajectory can also be supplied by dataset items at runtime.
384
+
385
+ ### Features
386
+
387
+ The LLM-based scorer provides:
388
+
389
+ - **Task-aware evaluation**: Assesses whether each step was necessary given the user's request
390
+ - **Ordering assessment**: Evaluates whether steps were taken in a logical order
391
+ - **Missing step detection**: Identifies steps that should have been taken
392
+ - **Redundancy detection**: Flags unnecessary or repeated steps
393
+ - **Reasoning generation**: Provides human-readable explanations for scoring decisions
394
+
395
+ ### Evaluation process
396
+
397
+ 1. **Receive trajectory**: Gets a pre-extracted `Trajectory` object from the pipeline
398
+ 2. **Analyze steps**: Evaluates each step for necessity and ordering using the LLM
399
+ 3. **Generate score**: Computes the score as a weighted combination of 60% necessity and 30% ordering, minus a 10% missing-step penalty
400
+ 4. **Generate reasoning**: Provides a human-readable explanation
401
+
402
+ ## LLM-based scoring details
403
+
404
+ - **Fractional scores**: Returns values between 0.0 and 1.0
405
+ - **Context-aware**: Considers user intent and task requirements
406
+ - **Explanatory**: Provides reasoning for scores
407
+ - **Flexible**: Works with or without an expected trajectory
408
+
409
+ ### LLM-based scorer options
410
+
411
+ ```typescript
412
+ // Evaluate based on task requirements (no expected trajectory)
413
+ const openScorer = createTrajectoryAccuracyScorerLLM({
414
+ model: { provider: 'openai', name: 'gpt-5.4' },
415
+ })
416
+
417
+ // Evaluate against a static expected trajectory
418
+ const guidedScorer = createTrajectoryAccuracyScorerLLM({
419
+ model: { provider: 'openai', name: 'gpt-5.4' },
420
+ expectedTrajectory: {
421
+ steps: [
422
+ { stepType: 'tool_call', name: 'search-tool' },
423
+ { stepType: 'tool_call', name: 'summarize-tool' },
424
+ ],
425
+ },
426
+ })
427
+ ```
428
+
429
+ ### LLM-based scorer results
430
+
431
+ ```typescript
432
+ {
433
+ runId: string,
434
+ preprocessStepResult: {
435
+ actualTrajectory: Trajectory,
436
+ actualTrajectoryFormatted: string,
437
+ expectedTrajectoryFormatted?: string,
438
+ hasSteps: boolean
439
+ },
440
+ analyzeStepResult: {
441
+ stepEvaluations: Array<{
442
+ stepName: string,
443
+ wasNecessary: boolean,
444
+ wasInOrder: boolean,
445
+ reasoning: string
446
+ }>,
447
+ missingSteps?: string[],
448
+ extraSteps?: string[],
449
+ overallAssessment: string
450
+ },
451
+ score: number,
452
+ reason: string
453
+ }
454
+ ```
455
+
456
+ ## Unified trajectory scorer
457
+
458
+ The `createTrajectoryScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides a multi-dimensional trajectory evaluation that checks accuracy, efficiency, blacklisted tools, and tool failure patterns in a single pass.
459
+
460
+ ### Parameters
461
+
462
+ **defaults** (`TrajectoryExpectation`): Default expectations applied to all dataset items. Per-item `expectedTrajectory` values override these defaults.
463
+
464
+ **weights** (`object`): Weights for combining dimension scores into the final score.
465
+
466
+ ### Scoring behavior
467
+
468
+ The unified scorer evaluates four dimensions:
469
+
470
+ 1. **Accuracy** — Matches actual steps against expected steps (if `steps` is configured). Uses the `ordering` mode.
471
+ 2. **Efficiency** — Checks step budgets (`maxSteps`, `maxTotalTokens`, `maxTotalDurationMs`) and redundant calls (`noRedundantCalls`).
472
+ 3. **Blacklist** — Checks for forbidden tools or sequences. Any violation immediately results in a score of **0.0** regardless of other dimensions.
473
+ 4. **Tool failures** — Detects retry patterns, fallback patterns, and argument correction patterns.
474
+
475
+ The final score is a weighted average of accuracy, efficiency, and tool failures. Blacklist violations override everything to 0.
476
+
477
+ ### Unified scorer results
478
+
479
+ ```typescript
480
+ {
481
+ runId: string,
482
+ preprocessStepResult: {
483
+ accuracy?: TrajectoryComparisonResult,
484
+ efficiency: TrajectoryEfficiencyResult,
485
+ blacklist: TrajectoryBlacklistResult,
486
+ toolFailures: ToolFailureAnalysisResult,
487
+ },
488
+ score: number
489
+ }
490
+ ```
491
+
492
+ ### Per-item expectations
493
+
494
+ Each dataset item can override the defaults with its own `expectedTrajectory`. This lets you vary expectations per prompt:
495
+
496
+ ```typescript
497
+ import { createTrajectoryScorerCode } from '@mastra/evals/scorers/prebuilt'
498
+ import { runEvals } from '@mastra/core/evals'
499
+
500
+ // Default blacklist applies to all items
501
+ const scorer = createTrajectoryScorerCode({
502
+ defaults: {
503
+ blacklistedTools: ['deleteAll'],
504
+ maxSteps: 5,
505
+ },
506
+ })
507
+
508
+ const result = await runEvals({
509
+ target: myAgent,
510
+ scorers: { trajectory: [scorer] },
511
+ data: [
512
+ {
513
+ input: 'Search for weather',
514
+ expectedTrajectory: {
515
+ steps: [{ stepType: 'tool_call', name: 'search' }],
516
+ maxSteps: 2,
517
+ },
518
+ },
519
+ {
520
+ input: 'Search and summarize',
521
+ expectedTrajectory: {
522
+ steps: [
523
+ { stepType: 'tool_call', name: 'search' },
524
+ { stepType: 'tool_call', name: 'summarize' },
525
+ ],
526
+ },
527
+ },
528
+ ],
529
+ })
530
+ ```
531
+
532
+ ### Example: efficiency and blacklist
533
+
534
+ ```typescript
535
+ import { createTrajectoryScorerCode } from '@mastra/evals/scorers/prebuilt'
536
+
537
+ const scorer = createTrajectoryScorerCode({
538
+ defaults: {
539
+ blacklistedTools: ['escalate', 'admin-override'],
540
+ blacklistedSequences: [['escalate', 'admin-override']],
541
+ maxSteps: 10,
542
+ noRedundantCalls: true,
543
+ maxRetriesPerTool: 2,
544
+ },
545
+ })
546
+ ```
547
+
548
+ ## Using trajectory scorers with `runEvals`
549
+
550
+ Trajectory scorers are configured under the `trajectory` key in the scorer config. The `runEvals` pipeline handles trajectory extraction automatically.
551
+
552
+ ### Agent trajectory evaluation
553
+
554
+ ```typescript
555
+ import { runEvals } from '@mastra/core/evals'
556
+ import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
557
+
558
+ const trajectoryScorer = createTrajectoryAccuracyScorerCode({
559
+ expectedTrajectory: {
560
+ steps: [
561
+ { stepType: 'tool_call', name: 'search' },
562
+ { stepType: 'tool_call', name: 'format' },
563
+ ],
564
+ },
565
+ })
566
+
567
+ const result = await runEvals({
568
+ target: myAgent,
569
+ scorers: {
570
+ agent: [qualityScorer], // receives raw MastraDBMessage[] output
571
+ trajectory: [trajectoryScorer], // receives pre-extracted Trajectory
572
+ },
573
+ data: [{ input: 'Find and format the data' }],
574
+ })
575
+
576
+ // result.scores.agent['quality'] — agent-level score
577
+ // result.scores.trajectory['trajectory-accuracy'] — trajectory score
578
+ ```
579
+
580
+ ### Workflow trajectory evaluation
581
+
582
+ ```typescript
583
+ import { runEvals } from '@mastra/core/evals'
584
+ import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
585
+
586
+ const workflowTrajectoryScorer = createTrajectoryAccuracyScorerCode({
587
+ expectedTrajectory: {
588
+ steps: [
589
+ { stepType: 'workflow_step', name: 'validate' },
590
+ { stepType: 'workflow_step', name: 'process' },
591
+ { stepType: 'workflow_step', name: 'notify' },
592
+ ],
593
+ },
594
+ })
595
+
596
+ const result = await runEvals({
597
+ target: myWorkflow,
598
+ scorers: {
599
+ workflow: [outputScorer], // receives workflow output
600
+ trajectory: [workflowTrajectoryScorer], // receives pre-extracted Trajectory from step results
601
+ },
602
+ data: [{ input: { userId: '123' } }],
603
+ })
604
+
605
+ // result.scores.workflow['output-quality'] — workflow-level score
606
+ // result.scores.trajectory['trajectory-accuracy'] — trajectory score
607
+ ```
608
+
609
+ ## Related
610
+
611
+ - [runEvals reference](https://mastra.ai/reference/evals/run-evals) — Pipeline that extracts trajectories and passes them to scorers
612
+ - [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) — Base scorer interface
613
+ - [Scorer utils](https://mastra.ai/reference/evals/scorer-utils) — Utility functions including `extractTrajectory` and `compareTrajectories`