@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Scorer
|
|
1
|
+
# Scorer utils
|
|
2
2
|
|
|
3
3
|
Mastra provides utility functions to help extract and process data from scorer run inputs and outputs. These utilities are particularly useful in the `preprocess` step of custom scorers.
|
|
4
4
|
|
|
@@ -14,35 +14,47 @@ import {
|
|
|
14
14
|
extractToolCalls,
|
|
15
15
|
extractInputMessages,
|
|
16
16
|
extractAgentResponseMessages,
|
|
17
|
-
|
|
17
|
+
compareTrajectories,
|
|
18
|
+
createTrajectoryTestRun,
|
|
19
|
+
} from '@mastra/evals/scorers/utils'
|
|
18
20
|
```
|
|
19
21
|
|
|
20
|
-
|
|
22
|
+
Trajectory extraction functions are available from `@mastra/core/evals`:
|
|
21
23
|
|
|
22
|
-
|
|
24
|
+
```typescript
|
|
25
|
+
import {
|
|
26
|
+
extractTrajectory,
|
|
27
|
+
extractWorkflowTrajectory,
|
|
28
|
+
extractTrajectoryFromTrace,
|
|
29
|
+
} from '@mastra/core/evals'
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Message extraction
|
|
33
|
+
|
|
34
|
+
### `getAssistantMessageFromRunOutput`
|
|
23
35
|
|
|
24
36
|
Extracts the text content from the first assistant message in the run output.
|
|
25
37
|
|
|
26
38
|
```typescript
|
|
27
39
|
const scorer = createScorer({
|
|
28
|
-
id:
|
|
29
|
-
description:
|
|
30
|
-
type:
|
|
40
|
+
id: 'my-scorer',
|
|
41
|
+
description: 'My scorer',
|
|
42
|
+
type: 'agent',
|
|
31
43
|
})
|
|
32
44
|
.preprocess(({ run }) => {
|
|
33
|
-
const response = getAssistantMessageFromRunOutput(run.output)
|
|
34
|
-
return { response }
|
|
45
|
+
const response = getAssistantMessageFromRunOutput(run.output)
|
|
46
|
+
return { response }
|
|
35
47
|
})
|
|
36
48
|
.generateScore(({ results }) => {
|
|
37
|
-
return results.preprocessStepResult?.response ? 1 : 0
|
|
38
|
-
})
|
|
49
|
+
return results.preprocessStepResult?.response ? 1 : 0
|
|
50
|
+
})
|
|
39
51
|
```
|
|
40
52
|
|
|
41
|
-
**output
|
|
53
|
+
**output** (`ScorerRunOutputForAgent`): The scorer run output (an array of `MastraDBMessage` objects).
|
|
42
54
|
|
|
43
55
|
**Returns:** `string | undefined` - The assistant message text, or undefined if no assistant message is found.
|
|
44
56
|
|
|
45
|
-
### getUserMessageFromRunInput
|
|
57
|
+
### `getUserMessageFromRunInput`
|
|
46
58
|
|
|
47
59
|
Extracts the text content from the first user message in the run input.
|
|
48
60
|
|
|
@@ -53,11 +65,11 @@ Extracts the text content from the first user message in the run input.
|
|
|
53
65
|
})
|
|
54
66
|
```
|
|
55
67
|
|
|
56
|
-
**input
|
|
68
|
+
**input** (`ScorerRunInputForAgent`): The scorer run input containing input messages
|
|
57
69
|
|
|
58
70
|
**Returns:** `string | undefined` - The user message text, or undefined if no user message is found.
|
|
59
71
|
|
|
60
|
-
### extractInputMessages
|
|
72
|
+
### `extractInputMessages`
|
|
61
73
|
|
|
62
74
|
Extracts text content from all input messages as an array.
|
|
63
75
|
|
|
@@ -70,7 +82,7 @@ Extracts text content from all input messages as an array.
|
|
|
70
82
|
|
|
71
83
|
**Returns:** `string[]` - Array of text strings from each input message.
|
|
72
84
|
|
|
73
|
-
### extractAgentResponseMessages
|
|
85
|
+
### `extractAgentResponseMessages`
|
|
74
86
|
|
|
75
87
|
Extracts text content from all assistant response messages as an array.
|
|
76
88
|
|
|
@@ -83,9 +95,9 @@ Extracts text content from all assistant response messages as an array.
|
|
|
83
95
|
|
|
84
96
|
**Returns:** `string[]` - Array of text strings from each assistant message.
|
|
85
97
|
|
|
86
|
-
## Reasoning
|
|
98
|
+
## Reasoning extraction
|
|
87
99
|
|
|
88
|
-
### getReasoningFromRunOutput
|
|
100
|
+
### `getReasoningFromRunOutput`
|
|
89
101
|
|
|
90
102
|
Extracts reasoning text from the run output. This is particularly useful when evaluating responses from reasoning models like `deepseek-reasoner` that produce chain-of-thought reasoning.
|
|
91
103
|
|
|
@@ -97,50 +109,50 @@ Reasoning can be stored in two places:
|
|
|
97
109
|
```typescript
|
|
98
110
|
import {
|
|
99
111
|
getReasoningFromRunOutput,
|
|
100
|
-
getAssistantMessageFromRunOutput
|
|
101
|
-
} from
|
|
112
|
+
getAssistantMessageFromRunOutput,
|
|
113
|
+
} from '@mastra/evals/scorers/utils'
|
|
102
114
|
|
|
103
115
|
const reasoningQualityScorer = createScorer({
|
|
104
|
-
id:
|
|
105
|
-
name:
|
|
106
|
-
description:
|
|
107
|
-
type:
|
|
116
|
+
id: 'reasoning-quality',
|
|
117
|
+
name: 'Reasoning Quality',
|
|
118
|
+
description: 'Evaluates the quality of model reasoning',
|
|
119
|
+
type: 'agent',
|
|
108
120
|
})
|
|
109
121
|
.preprocess(({ run }) => {
|
|
110
|
-
const reasoning = getReasoningFromRunOutput(run.output)
|
|
111
|
-
const response = getAssistantMessageFromRunOutput(run.output)
|
|
112
|
-
return { reasoning, response }
|
|
122
|
+
const reasoning = getReasoningFromRunOutput(run.output)
|
|
123
|
+
const response = getAssistantMessageFromRunOutput(run.output)
|
|
124
|
+
return { reasoning, response }
|
|
113
125
|
})
|
|
114
126
|
.analyze(({ results }) => {
|
|
115
|
-
const { reasoning } = results.preprocessStepResult || {}
|
|
127
|
+
const { reasoning } = results.preprocessStepResult || {}
|
|
116
128
|
return {
|
|
117
129
|
hasReasoning: !!reasoning,
|
|
118
130
|
reasoningLength: reasoning?.length || 0,
|
|
119
|
-
hasStepByStep: reasoning?.includes(
|
|
120
|
-
}
|
|
131
|
+
hasStepByStep: reasoning?.includes('step') || false,
|
|
132
|
+
}
|
|
121
133
|
})
|
|
122
134
|
.generateScore(({ results }) => {
|
|
123
|
-
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
|
|
124
|
-
if (!hasReasoning) return 0
|
|
135
|
+
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
|
|
136
|
+
if (!hasReasoning) return 0
|
|
125
137
|
// Score based on reasoning length (normalized to 0-1)
|
|
126
|
-
return Math.min(reasoningLength / 500, 1)
|
|
138
|
+
return Math.min(reasoningLength / 500, 1)
|
|
127
139
|
})
|
|
128
140
|
.generateReason(({ results, score }) => {
|
|
129
|
-
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
|
|
141
|
+
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {}
|
|
130
142
|
if (!hasReasoning) {
|
|
131
|
-
return
|
|
143
|
+
return 'No reasoning was provided by the model.'
|
|
132
144
|
}
|
|
133
|
-
return `Model provided ${reasoningLength} characters of reasoning. Score: ${score}
|
|
134
|
-
})
|
|
145
|
+
return `Model provided ${reasoningLength} characters of reasoning. Score: ${score}`
|
|
146
|
+
})
|
|
135
147
|
```
|
|
136
148
|
|
|
137
|
-
**output
|
|
149
|
+
**output** (`ScorerRunOutputForAgent`): The scorer run output (an array of `MastraDBMessage` objects).
|
|
138
150
|
|
|
139
151
|
**Returns:** `string | undefined` - The reasoning text, or undefined if no reasoning is present.
|
|
140
152
|
|
|
141
|
-
## System
|
|
153
|
+
## System message extraction
|
|
142
154
|
|
|
143
|
-
### getSystemMessagesFromRunInput
|
|
155
|
+
### `getSystemMessagesFromRunInput`
|
|
144
156
|
|
|
145
157
|
Extracts all system messages from the run input, including both standard system messages and tagged system messages (specialized prompts like memory instructions).
|
|
146
158
|
|
|
@@ -156,7 +168,7 @@ Extracts all system messages from the run input, including both standard system
|
|
|
156
168
|
|
|
157
169
|
**Returns:** `string[]` - Array of system message strings.
|
|
158
170
|
|
|
159
|
-
### getCombinedSystemPrompt
|
|
171
|
+
### `getCombinedSystemPrompt`
|
|
160
172
|
|
|
161
173
|
Combines all system messages into a single prompt string, joined with double newlines.
|
|
162
174
|
|
|
@@ -169,31 +181,31 @@ Combines all system messages into a single prompt string, joined with double new
|
|
|
169
181
|
|
|
170
182
|
**Returns:** `string` - Combined system prompt string.
|
|
171
183
|
|
|
172
|
-
## Tool
|
|
184
|
+
## Tool call extraction
|
|
173
185
|
|
|
174
|
-
### extractToolCalls
|
|
186
|
+
### `extractToolCalls`
|
|
175
187
|
|
|
176
188
|
Extracts information about all tool calls from the run output, including tool names, call IDs, and their positions in the message array.
|
|
177
189
|
|
|
178
190
|
```typescript
|
|
179
191
|
const toolUsageScorer = createScorer({
|
|
180
|
-
id:
|
|
181
|
-
description:
|
|
182
|
-
type:
|
|
192
|
+
id: 'tool-usage',
|
|
193
|
+
description: 'Evaluates tool usage patterns',
|
|
194
|
+
type: 'agent',
|
|
183
195
|
})
|
|
184
196
|
.preprocess(({ run }) => {
|
|
185
|
-
const { tools, toolCallInfos } = extractToolCalls(run.output)
|
|
197
|
+
const { tools, toolCallInfos } = extractToolCalls(run.output)
|
|
186
198
|
return {
|
|
187
199
|
toolsUsed: tools,
|
|
188
200
|
toolCount: tools.length,
|
|
189
201
|
toolDetails: toolCallInfos,
|
|
190
|
-
}
|
|
202
|
+
}
|
|
191
203
|
})
|
|
192
204
|
.generateScore(({ results }) => {
|
|
193
|
-
const { toolCount } = results.preprocessStepResult || {}
|
|
205
|
+
const { toolCount } = results.preprocessStepResult || {}
|
|
194
206
|
// Score based on appropriate tool usage
|
|
195
|
-
return toolCount > 0 ? 1 : 0
|
|
196
|
-
})
|
|
207
|
+
return toolCount > 0 ? 1 : 0
|
|
208
|
+
})
|
|
197
209
|
```
|
|
198
210
|
|
|
199
211
|
**Returns:**
|
|
@@ -209,94 +221,262 @@ Where `ToolCallInfo` is:
|
|
|
209
221
|
|
|
210
222
|
```typescript
|
|
211
223
|
type ToolCallInfo = {
|
|
212
|
-
toolName: string
|
|
213
|
-
toolCallId: string
|
|
214
|
-
messageIndex: number
|
|
215
|
-
invocationIndex: number
|
|
216
|
-
}
|
|
224
|
+
toolName: string // Name of the tool
|
|
225
|
+
toolCallId: string // Unique call identifier
|
|
226
|
+
messageIndex: number // Index in the output array
|
|
227
|
+
invocationIndex: number // Index within message's tool invocations
|
|
228
|
+
}
|
|
217
229
|
```
|
|
218
230
|
|
|
219
|
-
## Test
|
|
231
|
+
## Test utilities
|
|
220
232
|
|
|
221
233
|
These utilities help create test data for scorer development.
|
|
222
234
|
|
|
223
|
-
### createTestMessage
|
|
235
|
+
### `createTestMessage`
|
|
224
236
|
|
|
225
237
|
Creates a `MastraDBMessage` object for testing purposes.
|
|
226
238
|
|
|
227
239
|
```typescript
|
|
228
|
-
import { createTestMessage } from
|
|
240
|
+
import { createTestMessage } from '@mastra/evals/scorers/utils'
|
|
229
241
|
|
|
230
242
|
const userMessage = createTestMessage({
|
|
231
|
-
content:
|
|
232
|
-
role:
|
|
233
|
-
})
|
|
243
|
+
content: 'What is the weather?',
|
|
244
|
+
role: 'user',
|
|
245
|
+
})
|
|
234
246
|
|
|
235
247
|
const assistantMessage = createTestMessage({
|
|
236
|
-
content:
|
|
237
|
-
role:
|
|
248
|
+
content: 'The weather is sunny.',
|
|
249
|
+
role: 'assistant',
|
|
238
250
|
toolInvocations: [
|
|
239
251
|
{
|
|
240
|
-
toolCallId:
|
|
241
|
-
toolName:
|
|
242
|
-
args: { location:
|
|
252
|
+
toolCallId: 'call-1',
|
|
253
|
+
toolName: 'weatherTool',
|
|
254
|
+
args: { location: 'London' },
|
|
243
255
|
result: { temp: 20 },
|
|
244
|
-
state:
|
|
256
|
+
state: 'result',
|
|
245
257
|
},
|
|
246
258
|
],
|
|
247
|
-
})
|
|
259
|
+
})
|
|
248
260
|
```
|
|
249
261
|
|
|
250
|
-
### createAgentTestRun
|
|
262
|
+
### `createAgentTestRun`
|
|
251
263
|
|
|
252
264
|
Creates a complete test run object for testing scorers.
|
|
253
265
|
|
|
254
266
|
```typescript
|
|
255
|
-
import { createAgentTestRun, createTestMessage } from
|
|
267
|
+
import { createAgentTestRun, createTestMessage } from '@mastra/evals/scorers/utils'
|
|
256
268
|
|
|
257
269
|
const testRun = createAgentTestRun({
|
|
258
|
-
inputMessages: [
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
output: [
|
|
262
|
-
createTestMessage({ content: "Hi there!", role: "assistant" }),
|
|
263
|
-
],
|
|
264
|
-
});
|
|
270
|
+
inputMessages: [createTestMessage({ content: 'Hello', role: 'user' })],
|
|
271
|
+
output: [createTestMessage({ content: 'Hi there!', role: 'assistant' })],
|
|
272
|
+
})
|
|
265
273
|
|
|
266
274
|
// Run your scorer with the test data
|
|
267
275
|
const result = await myScorer.run({
|
|
268
276
|
input: testRun.input,
|
|
269
277
|
output: testRun.output,
|
|
270
|
-
})
|
|
278
|
+
})
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
## Trajectory utilities
|
|
282
|
+
|
|
283
|
+
### `extractTrajectory`
|
|
284
|
+
|
|
285
|
+
Extracts a `Trajectory` from agent output messages (`MastraDBMessage[]`). Converts tool invocations into `ToolCallStep` objects. The `runEvals` pipeline calls this automatically for trajectory scorers — you only need it for direct testing.
|
|
286
|
+
|
|
287
|
+
Available from `@mastra/core/evals`.
|
|
288
|
+
|
|
289
|
+
```typescript
|
|
290
|
+
import { extractTrajectory } from '@mastra/core/evals'
|
|
291
|
+
|
|
292
|
+
const trajectory = extractTrajectory(agentOutputMessages)
|
|
293
|
+
// trajectory.steps — ToolCallStep[] extracted from toolInvocations
|
|
294
|
+
// trajectory.rawOutput — the original MastraDBMessage[] array
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawOutput`.
|
|
298
|
+
|
|
299
|
+
### `extractWorkflowTrajectory`
|
|
300
|
+
|
|
301
|
+
Extracts a `Trajectory` from workflow step results. Converts `StepResult` records into `WorkflowStepStep` objects, respecting the execution path ordering.
|
|
302
|
+
|
|
303
|
+
Available from `@mastra/core/evals`.
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { extractWorkflowTrajectory } from '@mastra/core/evals'
|
|
307
|
+
|
|
308
|
+
const trajectory = extractWorkflowTrajectory(
|
|
309
|
+
workflowResult.steps, // Record<string, StepResult>
|
|
310
|
+
workflowResult.stepExecutionPath, // string[] (optional)
|
|
311
|
+
)
|
|
312
|
+
// trajectory.steps — WorkflowStepStep[] in execution order
|
|
271
313
|
```
|
|
272
314
|
|
|
273
|
-
|
|
315
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawWorkflowResult`.
|
|
316
|
+
|
|
317
|
+
### `extractTrajectoryFromTrace`
|
|
318
|
+
|
|
319
|
+
Builds a hierarchical `Trajectory` from observability trace spans (`SpanRecord[]`). Reconstructs the parent-child span tree and maps each span to the appropriate `TrajectoryStep` discriminated union type with nested `children`.
|
|
320
|
+
|
|
321
|
+
This is the preferred extraction method when storage is available. The `runEvals` pipeline calls this automatically when the target's `Mastra` instance has a configured storage backend. It produces richer trajectories than `extractTrajectory` or `extractWorkflowTrajectory` because it captures the full execution tree, including nested agent runs, tool calls, and model generations.
|
|
322
|
+
|
|
323
|
+
Available from `@mastra/core/evals`.
|
|
324
|
+
|
|
325
|
+
```typescript
|
|
326
|
+
import { extractTrajectoryFromTrace } from '@mastra/core/evals'
|
|
327
|
+
|
|
328
|
+
// After fetching a trace from the observability store
|
|
329
|
+
const traceData = await observabilityStore.getTrace({ traceId })
|
|
330
|
+
const trajectory = extractTrajectoryFromTrace(traceData.spans, rootSpanId)
|
|
331
|
+
// trajectory.steps — hierarchical TrajectoryStep[] with children
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
**Parameters:**
|
|
335
|
+
|
|
336
|
+
- `spans` (`SpanRecord[]`) — Array of span records from a trace query.
|
|
337
|
+
- `rootSpanId` (`string`, optional) — ID of the span to use as the starting point. When omitted, extraction starts from the spans that have no parent.
|
|
338
|
+
|
|
339
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]` with recursive `children` and `totalDurationMs`.
|
|
340
|
+
|
|
341
|
+
#### Span type mapping
|
|
342
|
+
|
|
343
|
+
| Span type | Trajectory step type | Key fields extracted |
|
|
344
|
+
| ---------------------- | ---------------------- | ------------------------------------------------------------- |
|
|
345
|
+
| `TOOL_CALL` | `tool_call` | `toolArgs`, `toolResult`, `success` |
|
|
346
|
+
| `MCP_TOOL_CALL` | `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
|
|
347
|
+
| `MODEL_GENERATION` | `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
|
|
348
|
+
| `AGENT_RUN` | `agent_run` | `agentId` (from entity ID) |
|
|
349
|
+
| `WORKFLOW_RUN` | `workflow_run` | `workflowId` (from entity ID) |
|
|
350
|
+
| `WORKFLOW_STEP` | `workflow_step` | `output` |
|
|
351
|
+
| `WORKFLOW_CONDITIONAL` | `workflow_conditional` | `conditionCount`, `selectedSteps` |
|
|
352
|
+
| `WORKFLOW_PARALLEL` | `workflow_parallel` | `branchCount`, `parallelSteps` |
|
|
353
|
+
| `WORKFLOW_LOOP` | `workflow_loop` | `loopType`, `totalIterations` |
|
|
354
|
+
| `WORKFLOW_SLEEP` | `workflow_sleep` | `sleepDurationMs`, `sleepType` |
|
|
355
|
+
| `WORKFLOW_WAIT_EVENT` | `workflow_wait_event` | `eventName`, `eventReceived` |
|
|
356
|
+
| `PROCESSOR_RUN` | `processor_run` | `processorId` |
|
|
357
|
+
|
|
358
|
+
Spans with types `GENERIC`, `MODEL_STEP`, `MODEL_CHUNK`, and `WORKFLOW_CONDITIONAL_EVAL` are skipped as noise.
|
|
359
|
+
|
|
360
|
+
### `compareTrajectories`
|
|
361
|
+
|
|
362
|
+
Compares an actual trajectory against an expected trajectory and returns a detailed comparison result. Used internally by `createTrajectoryAccuracyScorerCode`.
|
|
363
|
+
|
|
364
|
+
The `expected` parameter accepts either a full `Trajectory` (the same shape as the actual trajectory) or `{ steps: ExpectedStep[] }`. When using `ExpectedStep[]`, each step can match by name only, by name and `stepType`, or can also include data for comparison. See [Expected steps](https://mastra.ai/reference/evals/trajectory-accuracy) for details.
|
|
365
|
+
|
|
366
|
+
```typescript
|
|
367
|
+
import { compareTrajectories } from '@mastra/evals/scorers/utils'
|
|
368
|
+
|
|
369
|
+
// Using ExpectedStep[] (recommended for expectations)
|
|
370
|
+
const result = compareTrajectories(
|
|
371
|
+
actualTrajectory,
|
|
372
|
+
{ steps: [{ name: 'search' }, { name: 'summarize', stepType: 'tool_call' }] },
|
|
373
|
+
{ compareStepData: false, allowRepeatedSteps: true },
|
|
374
|
+
)
|
|
375
|
+
// result.score — 0.0 to 1.0
|
|
376
|
+
// result.missingSteps — step names not found
|
|
377
|
+
// result.extraSteps — unexpected step names
|
|
378
|
+
// result.outOfOrderSteps — steps found but in wrong order
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
**Returns:** `TrajectoryComparisonResult`
|
|
382
|
+
|
|
383
|
+
### `createTrajectoryTestRun`
|
|
384
|
+
|
|
385
|
+
Creates a test run object for trajectory scorers by wrapping a `Trajectory` in the `ScorerRun` format that those scorers expect.
|
|
386
|
+
|
|
387
|
+
```typescript
|
|
388
|
+
import { createTrajectoryTestRun } from '@mastra/evals/scorers/utils'
|
|
389
|
+
|
|
390
|
+
const run = createTrajectoryTestRun({
|
|
391
|
+
steps: [
|
|
392
|
+
{ stepType: 'tool_call', name: 'search', toolArgs: { q: 'test' } },
|
|
393
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
394
|
+
],
|
|
395
|
+
})
|
|
396
|
+
|
|
397
|
+
const result = await trajectoryScorer.run(run)
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
### `checkTrajectoryEfficiency`
|
|
401
|
+
|
|
402
|
+
Evaluates trajectory efficiency against step, token, and duration budgets. Also detects redundant calls (same tool with same arguments).
|
|
403
|
+
|
|
404
|
+
```typescript
|
|
405
|
+
import { checkTrajectoryEfficiency } from '@mastra/evals/scorers/utils'
|
|
406
|
+
|
|
407
|
+
const result = checkTrajectoryEfficiency(trajectory, {
|
|
408
|
+
maxSteps: 5,
|
|
409
|
+
maxTotalTokens: 2000,
|
|
410
|
+
maxTotalDurationMs: 5000,
|
|
411
|
+
noRedundantCalls: true,
|
|
412
|
+
})
|
|
413
|
+
// result.score — 1.0 if within all budgets, lower with penalties
|
|
414
|
+
// result.redundantCalls — duplicate tool+args combos
|
|
415
|
+
// result.overBudget — which budgets were exceeded
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
**Returns:** `TrajectoryEfficiencyResult`
|
|
419
|
+
|
|
420
|
+
### `checkTrajectoryBlacklist`
|
|
421
|
+
|
|
422
|
+
Checks whether a trajectory contains forbidden tools or tool call sequences.
|
|
423
|
+
|
|
424
|
+
```typescript
|
|
425
|
+
import { checkTrajectoryBlacklist } from '@mastra/evals/scorers/utils'
|
|
426
|
+
|
|
427
|
+
const result = checkTrajectoryBlacklist(trajectory, {
|
|
428
|
+
blacklistedTools: ['deleteAll', 'admin-override'],
|
|
429
|
+
blacklistedSequences: [['escalate', 'admin-override']],
|
|
430
|
+
})
|
|
431
|
+
// result.passed — true if no violations
|
|
432
|
+
// result.violations — list of violations with type and details
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
**Returns:** `TrajectoryBlacklistResult`
|
|
436
|
+
|
|
437
|
+
### `analyzeToolFailures`
|
|
438
|
+
|
|
439
|
+
Detects tool failure patterns including retries, fallbacks, and argument corrections.
|
|
440
|
+
|
|
441
|
+
```typescript
|
|
442
|
+
import { analyzeToolFailures } from '@mastra/evals/scorers/utils'
|
|
443
|
+
|
|
444
|
+
const result = analyzeToolFailures(trajectory, {
|
|
445
|
+
maxRetriesPerTool: 3,
|
|
446
|
+
})
|
|
447
|
+
// result.score — 1.0 if no failure patterns, lower if patterns detected
|
|
448
|
+
// result.patterns — detected patterns (retry, fallback, arg_correction)
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
**Returns:** `ToolFailureAnalysisResult`
|
|
452
|
+
|
|
453
|
+
## Complete example
|
|
274
454
|
|
|
275
455
|
Here's a complete example showing how to use multiple utilities together:
|
|
276
456
|
|
|
277
457
|
```typescript
|
|
278
|
-
import { createScorer } from
|
|
458
|
+
import { createScorer } from '@mastra/core/evals'
|
|
279
459
|
import {
|
|
280
460
|
getAssistantMessageFromRunOutput,
|
|
281
461
|
getReasoningFromRunOutput,
|
|
282
462
|
getUserMessageFromRunInput,
|
|
283
463
|
getCombinedSystemPrompt,
|
|
284
464
|
extractToolCalls,
|
|
285
|
-
} from
|
|
465
|
+
} from '@mastra/evals/scorers/utils'
|
|
286
466
|
|
|
287
467
|
const comprehensiveScorer = createScorer({
|
|
288
|
-
id:
|
|
289
|
-
name:
|
|
290
|
-
description:
|
|
291
|
-
type:
|
|
468
|
+
id: 'comprehensive-analysis',
|
|
469
|
+
name: 'Comprehensive Analysis',
|
|
470
|
+
description: 'Analyzes all aspects of an agent response',
|
|
471
|
+
type: 'agent',
|
|
292
472
|
})
|
|
293
473
|
.preprocess(({ run }) => {
|
|
294
474
|
// Extract all relevant data
|
|
295
|
-
const userMessage = getUserMessageFromRunInput(run.input)
|
|
296
|
-
const response = getAssistantMessageFromRunOutput(run.output)
|
|
297
|
-
const reasoning = getReasoningFromRunOutput(run.output)
|
|
298
|
-
const systemPrompt = getCombinedSystemPrompt(run.input)
|
|
299
|
-
const { tools, toolCallInfos } = extractToolCalls(run.output)
|
|
475
|
+
const userMessage = getUserMessageFromRunInput(run.input)
|
|
476
|
+
const response = getAssistantMessageFromRunOutput(run.output)
|
|
477
|
+
const reasoning = getReasoningFromRunOutput(run.output)
|
|
478
|
+
const systemPrompt = getCombinedSystemPrompt(run.input)
|
|
479
|
+
const { tools, toolCallInfos } = extractToolCalls(run.output)
|
|
300
480
|
|
|
301
481
|
return {
|
|
302
482
|
userMessage,
|
|
@@ -305,26 +485,26 @@ const comprehensiveScorer = createScorer({
|
|
|
305
485
|
systemPrompt,
|
|
306
486
|
toolsUsed: tools,
|
|
307
487
|
toolCount: tools.length,
|
|
308
|
-
}
|
|
488
|
+
}
|
|
309
489
|
})
|
|
310
490
|
.generateScore(({ results }) => {
|
|
311
|
-
const { response, reasoning, toolCount } = results.preprocessStepResult || {}
|
|
491
|
+
const { response, reasoning, toolCount } = results.preprocessStepResult || {}
|
|
312
492
|
|
|
313
|
-
let score = 0
|
|
314
|
-
if (response && response.length > 0) score += 0.4
|
|
315
|
-
if (reasoning) score += 0.3
|
|
316
|
-
if (toolCount > 0) score += 0.3
|
|
493
|
+
let score = 0
|
|
494
|
+
if (response && response.length > 0) score += 0.4
|
|
495
|
+
if (reasoning) score += 0.3
|
|
496
|
+
if (toolCount > 0) score += 0.3
|
|
317
497
|
|
|
318
|
-
return score
|
|
498
|
+
return score
|
|
319
499
|
})
|
|
320
500
|
.generateReason(({ results, score }) => {
|
|
321
|
-
const { response, reasoning, toolCount } = results.preprocessStepResult || {}
|
|
501
|
+
const { response, reasoning, toolCount } = results.preprocessStepResult || {}
|
|
322
502
|
|
|
323
|
-
const parts = []
|
|
324
|
-
if (response) parts.push(
|
|
325
|
-
if (reasoning) parts.push(
|
|
326
|
-
if (toolCount > 0) parts.push(`used ${toolCount} tool(s)`)
|
|
503
|
+
const parts = []
|
|
504
|
+
if (response) parts.push('provided a response')
|
|
505
|
+
if (reasoning) parts.push('included reasoning')
|
|
506
|
+
if (toolCount > 0) parts.push(`used ${toolCount} tool(s)`)
|
|
327
507
|
|
|
328
|
-
return `Score: ${score}. The agent ${parts.join(
|
|
329
|
-
})
|
|
508
|
+
return `Score: ${score}. The agent ${parts.join(', ')}.`
|
|
509
|
+
})
|
|
330
510
|
```
|
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
# Textual
|
|
1
|
+
# Textual difference scorer
|
|
2
2
|
|
|
3
3
|
The `createTextualDifferenceScorer()` function uses sequence matching to measure the textual differences between two strings. It provides detailed information about changes, including the number of operations needed to transform one text into another.
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
The `createTextualDifferenceScorer()` function
|
|
7
|
+
The `createTextualDifferenceScorer()` function doesn't take any options.
|
|
8
8
|
|
|
9
9
|
This function returns an instance of the `MastraScorer` class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
|
|
10
10
|
|
|
11
|
-
##
|
|
11
|
+
## `.run()` returns
|
|
12
12
|
|
|
13
|
-
**runId
|
|
13
|
+
**runId** (`string`): The ID of the run (optional).
|
|
14
14
|
|
|
15
|
-
**analyzeStepResult
|
|
15
|
+
**analyzeStepResult** (`object`): Object with difference metrics: `{ confidence: number, changes: number, lengthDiff: number }`.
|
|
16
16
|
|
|
17
|
-
**score
|
|
17
|
+
**score** (`number`): Similarity ratio (0-1) where 1 indicates identical texts.
|
|
18
18
|
|
|
19
19
|
`.run()` returns a result in the following shape:
|
|
20
20
|
|
|
@@ -31,7 +31,7 @@ This function returns an instance of the MastraScorer class. See the [MastraScor
|
|
|
31
31
|
}
|
|
32
32
|
```
|
|
33
33
|
|
|
34
|
-
## Scoring
|
|
34
|
+
## Scoring details
|
|
35
35
|
|
|
36
36
|
The scorer calculates several measures:
|
|
37
37
|
|
|
@@ -71,22 +71,22 @@ A textual difference score between 0 and 1:
|
|
|
71
71
|
Measure textual differences between expected and actual agent outputs:
|
|
72
72
|
|
|
73
73
|
```typescript
|
|
74
|
-
import { runEvals } from
|
|
75
|
-
import { createTextualDifferenceScorer } from
|
|
76
|
-
import { myAgent } from
|
|
74
|
+
import { runEvals } from '@mastra/core/evals'
|
|
75
|
+
import { createTextualDifferenceScorer } from '@mastra/evals/scorers/prebuilt'
|
|
76
|
+
import { myAgent } from './agent'
|
|
77
77
|
|
|
78
|
-
const scorer = createTextualDifferenceScorer()
|
|
78
|
+
const scorer = createTextualDifferenceScorer()
|
|
79
79
|
|
|
80
80
|
const result = await runEvals({
|
|
81
81
|
data: [
|
|
82
82
|
{
|
|
83
|
-
input:
|
|
83
|
+
input: 'Summarize the concept of recursion',
|
|
84
84
|
groundTruth:
|
|
85
|
-
|
|
85
|
+
'Recursion is when a function calls itself to solve a problem by breaking it into smaller subproblems.',
|
|
86
86
|
},
|
|
87
87
|
{
|
|
88
|
-
input:
|
|
89
|
-
groundTruth:
|
|
88
|
+
input: 'What is the capital of France?',
|
|
89
|
+
groundTruth: 'The capital of France is Paris.',
|
|
90
90
|
},
|
|
91
91
|
],
|
|
92
92
|
scorers: [scorer],
|
|
@@ -95,11 +95,11 @@ const result = await runEvals({
|
|
|
95
95
|
console.log({
|
|
96
96
|
score: scorerResults[scorer.id].score,
|
|
97
97
|
groundTruth: scorerResults[scorer.id].groundTruth,
|
|
98
|
-
})
|
|
98
|
+
})
|
|
99
99
|
},
|
|
100
|
-
})
|
|
100
|
+
})
|
|
101
101
|
|
|
102
|
-
console.log(result.scores)
|
|
102
|
+
console.log(result.scores)
|
|
103
103
|
```
|
|
104
104
|
|
|
105
105
|
For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
|