@mastra/mcp-docs-server 1.1.17-alpha.5 → 1.1.17-alpha.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/docs/evals/built-in-scorers.md +1 -0
- package/.docs/models/index.md +1 -1
- package/.docs/models/providers/llmgateway.md +269 -0
- package/.docs/models/providers.md +1 -0
- package/.docs/reference/evals/run-evals.md +78 -3
- package/.docs/reference/evals/scorer-utils.md +184 -0
- package/.docs/reference/evals/trajectory-accuracy.md +613 -0
- package/.docs/reference/index.md +1 -0
- package/CHANGELOG.md +14 -0
- package/package.json +3 -3
|
@@ -18,6 +18,7 @@ These scorers evaluate how correct, truthful, and complete your agent's answers
|
|
|
18
18
|
- [`content-similarity`](https://mastra.ai/reference/evals/content-similarity): Measures textual similarity using character-level matching (`0-1`, higher is better)
|
|
19
19
|
- [`textual-difference`](https://mastra.ai/reference/evals/textual-difference): Measures textual differences between strings (`0-1`, higher means more similar)
|
|
20
20
|
- [`tool-call-accuracy`](https://mastra.ai/reference/evals/tool-call-accuracy): Evaluates whether the LLM selects the correct tool from available options (`0-1`, higher is better)
|
|
21
|
+
- [`trajectory-accuracy`](https://mastra.ai/reference/evals/trajectory-accuracy): Evaluates whether an agent follows the expected sequence of actions (tool calls, model generations, workflow steps, and other span types) (`0-1`, higher is better)
|
|
21
22
|
- [`prompt-alignment`](https://mastra.ai/reference/evals/prompt-alignment): Measures how well agent responses align with user prompt intent, requirements, completeness, and format (`0-1`, higher is better)
|
|
22
23
|
|
|
23
24
|
### Context quality
|
package/.docs/models/index.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Model Providers
|
|
2
2
|
|
|
3
|
-
Mastra provides a unified interface for working with LLMs across multiple providers, giving you access to
|
|
3
|
+
Mastra provides a unified interface for working with LLMs across multiple providers, giving you access to 3595 models from 95 providers through a single API.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# LLM Gateway
|
|
2
|
+
|
|
3
|
+
Access 199 LLM Gateway models through Mastra's model router. Authentication is handled automatically using the `LLMGATEWAY_API_KEY` environment variable.
|
|
4
|
+
|
|
5
|
+
Learn more in the [LLM Gateway documentation](https://llmgateway.io/docs).
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
LLMGATEWAY_API_KEY=your-api-key
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
```typescript
|
|
12
|
+
import { Agent } from "@mastra/core/agent";
|
|
13
|
+
|
|
14
|
+
const agent = new Agent({
|
|
15
|
+
id: "my-agent",
|
|
16
|
+
name: "My Agent",
|
|
17
|
+
instructions: "You are a helpful assistant",
|
|
18
|
+
model: "llmgateway/auto"
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
// Generate a response
|
|
22
|
+
const response = await agent.generate("Hello!");
|
|
23
|
+
|
|
24
|
+
// Stream a response
|
|
25
|
+
const stream = await agent.stream("Tell me a story");
|
|
26
|
+
for await (const chunk of stream) {
|
|
27
|
+
console.log(chunk);
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
> **Info:** Mastra uses the OpenAI-compatible `/chat/completions` endpoint. Some provider-specific features may not be available. Check the [LLM Gateway documentation](https://llmgateway.io/docs) for details.
|
|
32
|
+
|
|
33
|
+
## Models
|
|
34
|
+
|
|
35
|
+
| Model | Context | Tools | Reasoning | Image | Audio | Video | Input $/1M | Output $/1M |
|
|
36
|
+
| -------------------------------------------------- | ------- | ----- | --------- | ----- | ----- | ----- | ---------- | ----------- |
|
|
37
|
+
| `llmgateway/auto` | 128K | | | | | | — | — |
|
|
38
|
+
| `llmgateway/claude-3-5-sonnet` | 200K | | | | | | $3 | $15 |
|
|
39
|
+
| `llmgateway/claude-3-7-sonnet` | 200K | | | | | | $3 | $15 |
|
|
40
|
+
| `llmgateway/claude-3-7-sonnet-20250219` | 200K | | | | | | $3 | $15 |
|
|
41
|
+
| `llmgateway/claude-3-haiku` | 200K | | | | | | $0.25 | $1 |
|
|
42
|
+
| `llmgateway/claude-3-haiku-20240307` | 200K | | | | | | $0.25 | $1 |
|
|
43
|
+
| `llmgateway/claude-3-opus` | 200K | | | | | | $15 | $75 |
|
|
44
|
+
| `llmgateway/claude-haiku-4-5` | 200K | | | | | | $1 | $5 |
|
|
45
|
+
| `llmgateway/claude-haiku-4-5-20251001` | 200K | | | | | | $1 | $5 |
|
|
46
|
+
| `llmgateway/claude-opus-4-1-20250805` | 200K | | | | | | $15 | $75 |
|
|
47
|
+
| `llmgateway/claude-opus-4-20250514` | 200K | | | | | | $15 | $75 |
|
|
48
|
+
| `llmgateway/claude-opus-4-5-20251101` | 200K | | | | | | $5 | $25 |
|
|
49
|
+
| `llmgateway/claude-opus-4-6` | 1.0M | | | | | | $5 | $25 |
|
|
50
|
+
| `llmgateway/claude-sonnet-4-20250514` | 200K | | | | | | $3 | $15 |
|
|
51
|
+
| `llmgateway/claude-sonnet-4-5` | 200K | | | | | | $3 | $15 |
|
|
52
|
+
| `llmgateway/claude-sonnet-4-5-20250929` | 200K | | | | | | $3 | $15 |
|
|
53
|
+
| `llmgateway/claude-sonnet-4-6` | 200K | | | | | | $3 | $15 |
|
|
54
|
+
| `llmgateway/codestral-2508` | 256K | | | | | | $0.30 | $0.90 |
|
|
55
|
+
| `llmgateway/cogview-4` | 2K | | | | | | — | — |
|
|
56
|
+
| `llmgateway/custom` | 128K | | | | | | — | — |
|
|
57
|
+
| `llmgateway/deepseek-r1-0528` | 64K | | | | | | $0.80 | $2 |
|
|
58
|
+
| `llmgateway/deepseek-v3.1` | 128K | | | | | | $0.56 | $2 |
|
|
59
|
+
| `llmgateway/deepseek-v3.2` | 164K | | | | | | $0.28 | $0.42 |
|
|
60
|
+
| `llmgateway/devstral-2512` | 262K | | | | | | $0.40 | $2 |
|
|
61
|
+
| `llmgateway/devstral-small-2507` | 131K | | | | | | $0.10 | $0.30 |
|
|
62
|
+
| `llmgateway/gemini-2.5-flash` | 1.0M | | | | | | $0.30 | $3 |
|
|
63
|
+
| `llmgateway/gemini-2.5-flash-image` | 33K | | | | | | $0.30 | $30 |
|
|
64
|
+
| `llmgateway/gemini-2.5-flash-image-preview` | 33K | | | | | | $0.30 | $3 |
|
|
65
|
+
| `llmgateway/gemini-2.5-flash-lite` | 1.0M | | | | | | $0.10 | $0.40 |
|
|
66
|
+
| `llmgateway/gemini-2.5-flash-lite-preview-09-2025` | 1.0M | | | | | | $0.10 | $0.40 |
|
|
67
|
+
| `llmgateway/gemini-2.5-pro` | 1.0M | | | | | | $1 | $10 |
|
|
68
|
+
| `llmgateway/gemini-3-flash-preview` | 1.0M | | | | | | $0.50 | $3 |
|
|
69
|
+
| `llmgateway/gemini-3-pro-image-preview` | 66K | | | | | | $2 | $12 |
|
|
70
|
+
| `llmgateway/gemini-3.1-flash-image-preview` | 66K | | | | | | $0.25 | $2 |
|
|
71
|
+
| `llmgateway/gemini-3.1-flash-lite-preview` | 1.0M | | | | | | $0.25 | $2 |
|
|
72
|
+
| `llmgateway/gemini-3.1-pro-preview` | 1.0M | | | | | | $2 | $12 |
|
|
73
|
+
| `llmgateway/gemini-pro-latest` | 1.0M | | | | | | $2 | $12 |
|
|
74
|
+
| `llmgateway/gemma-2-27b-it-together` | 8K | | | | | | $0.08 | $0.08 |
|
|
75
|
+
| `llmgateway/gemma-3-12b-it` | 1.0M | | | | | | $0.08 | $0.30 |
|
|
76
|
+
| `llmgateway/gemma-3-1b-it` | 1.0M | | | | | | $0.08 | $0.30 |
|
|
77
|
+
| `llmgateway/gemma-3-27b` | 128K | | | | | | $0.27 | $0.27 |
|
|
78
|
+
| `llmgateway/gemma-3-4b-it` | 1.0M | | | | | | $0.08 | $0.30 |
|
|
79
|
+
| `llmgateway/gemma-3n-e2b-it` | 1.0M | | | | | | $0.08 | $0.30 |
|
|
80
|
+
| `llmgateway/gemma-3n-e4b-it` | 1.0M | | | | | | $0.08 | $0.30 |
|
|
81
|
+
| `llmgateway/glm-4-32b-0414-128k` | 128K | | | | | | $0.10 | $0.10 |
|
|
82
|
+
| `llmgateway/glm-4.5` | 128K | | | | | | $0.60 | $2 |
|
|
83
|
+
| `llmgateway/glm-4.5-air` | 128K | | | | | | $0.20 | $1 |
|
|
84
|
+
| `llmgateway/glm-4.5-airx` | 128K | | | | | | $1 | $5 |
|
|
85
|
+
| `llmgateway/glm-4.5-flash` | 128K | | | | | | — | — |
|
|
86
|
+
| `llmgateway/glm-4.5-x` | 128K | | | | | | $2 | $9 |
|
|
87
|
+
| `llmgateway/glm-4.5v` | 128K | | | | | | $0.60 | $2 |
|
|
88
|
+
| `llmgateway/glm-4.6` | 200K | | | | | | $0.60 | $2 |
|
|
89
|
+
| `llmgateway/glm-4.6v` | 128K | | | | | | $0.30 | $0.90 |
|
|
90
|
+
| `llmgateway/glm-4.6v-flash` | 128K | | | | | | — | — |
|
|
91
|
+
| `llmgateway/glm-4.6v-flashx` | 128K | | | | | | $0.04 | $0.40 |
|
|
92
|
+
| `llmgateway/glm-4.7` | 200K | | | | | | $0.60 | $2 |
|
|
93
|
+
| `llmgateway/glm-4.7-flash` | 200K | | | | | | — | — |
|
|
94
|
+
| `llmgateway/glm-4.7-flashx` | 200K | | | | | | $0.07 | $0.40 |
|
|
95
|
+
| `llmgateway/glm-5` | 203K | | | | | | $1 | $3 |
|
|
96
|
+
| `llmgateway/glm-image` | 2K | | | | | | — | — |
|
|
97
|
+
| `llmgateway/gpt-3.5-turbo` | 16K | | | | | | $0.50 | $2 |
|
|
98
|
+
| `llmgateway/gpt-4` | 8K | | | | | | $30 | $60 |
|
|
99
|
+
| `llmgateway/gpt-4-turbo` | 128K | | | | | | $10 | $30 |
|
|
100
|
+
| `llmgateway/gpt-4.1` | 1.0M | | | | | | $2 | $8 |
|
|
101
|
+
| `llmgateway/gpt-4.1-mini` | 1.0M | | | | | | $0.40 | $2 |
|
|
102
|
+
| `llmgateway/gpt-4.1-nano` | 1.0M | | | | | | $0.10 | $0.40 |
|
|
103
|
+
| `llmgateway/gpt-4o` | 128K | | | | | | $3 | $10 |
|
|
104
|
+
| `llmgateway/gpt-4o-mini` | 128K | | | | | | $0.15 | $0.60 |
|
|
105
|
+
| `llmgateway/gpt-4o-mini-search-preview` | 128K | | | | | | $0.15 | $0.60 |
|
|
106
|
+
| `llmgateway/gpt-4o-search-preview` | 128K | | | | | | $3 | $10 |
|
|
107
|
+
| `llmgateway/gpt-5` | 400K | | | | | | $1 | $10 |
|
|
108
|
+
| `llmgateway/gpt-5-chat-latest` | 400K | | | | | | $1 | $10 |
|
|
109
|
+
| `llmgateway/gpt-5-mini` | 400K | | | | | | $0.25 | $2 |
|
|
110
|
+
| `llmgateway/gpt-5-nano` | 400K | | | | | | $0.05 | $0.40 |
|
|
111
|
+
| `llmgateway/gpt-5-pro` | 400K | | | | | | $15 | $120 |
|
|
112
|
+
| `llmgateway/gpt-5.1` | 400K | | | | | | $1 | $10 |
|
|
113
|
+
| `llmgateway/gpt-5.1-codex` | 400K | | | | | | $1 | $10 |
|
|
114
|
+
| `llmgateway/gpt-5.1-codex-mini` | 400K | | | | | | $0.25 | $2 |
|
|
115
|
+
| `llmgateway/gpt-5.2` | 400K | | | | | | $2 | $14 |
|
|
116
|
+
| `llmgateway/gpt-5.2-chat-latest` | 128K | | | | | | $2 | $14 |
|
|
117
|
+
| `llmgateway/gpt-5.2-codex` | 400K | | | | | | $2 | $14 |
|
|
118
|
+
| `llmgateway/gpt-5.2-pro` | 400K | | | | | | $21 | $168 |
|
|
119
|
+
| `llmgateway/gpt-5.3-chat-latest` | 128K | | | | | | $2 | $14 |
|
|
120
|
+
| `llmgateway/gpt-5.3-codex` | 400K | | | | | | $2 | $14 |
|
|
121
|
+
| `llmgateway/gpt-5.4` | 1.1M | | | | | | $3 | $15 |
|
|
122
|
+
| `llmgateway/gpt-5.4-mini` | 400K | | | | | | $0.75 | $5 |
|
|
123
|
+
| `llmgateway/gpt-5.4-nano` | 400K | | | | | | $0.20 | $1 |
|
|
124
|
+
| `llmgateway/gpt-5.4-pro` | 1.1M | | | | | | $30 | $180 |
|
|
125
|
+
| `llmgateway/gpt-oss-120b` | 131K | | | | | | $0.15 | $0.75 |
|
|
126
|
+
| `llmgateway/gpt-oss-20b` | 131K | | | | | | $0.10 | $0.50 |
|
|
127
|
+
| `llmgateway/grok-3` | 131K | | | | | | $3 | $15 |
|
|
128
|
+
| `llmgateway/grok-4` | 256K | | | | | | $3 | $15 |
|
|
129
|
+
| `llmgateway/grok-4-0709` | 256K | | | | | | $3 | $15 |
|
|
130
|
+
| `llmgateway/grok-4-1-fast` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
131
|
+
| `llmgateway/grok-4-1-fast-non-reasoning` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
132
|
+
| `llmgateway/grok-4-1-fast-reasoning` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
133
|
+
| `llmgateway/grok-4-20-beta-0309-non-reasoning` | 2.0M | | | | | | $2 | $6 |
|
|
134
|
+
| `llmgateway/grok-4-20-beta-0309-reasoning` | 2.0M | | | | | | $2 | $6 |
|
|
135
|
+
| `llmgateway/grok-4-20-multi-agent-beta-0309` | 2.0M | | | | | | $2 | $6 |
|
|
136
|
+
| `llmgateway/grok-4-fast` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
137
|
+
| `llmgateway/grok-4-fast-non-reasoning` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
138
|
+
| `llmgateway/grok-4-fast-reasoning` | 2.0M | | | | | | $0.20 | $0.50 |
|
|
139
|
+
| `llmgateway/grok-code-fast-1` | 256K | | | | | | $0.20 | $2 |
|
|
140
|
+
| `llmgateway/grok-imagine-image` | 2K | | | | | | — | — |
|
|
141
|
+
| `llmgateway/grok-imagine-image-pro` | 2K | | | | | | — | — |
|
|
142
|
+
| `llmgateway/hermes-2-pro-llama-3-8b` | 8K | | | | | | $0.14 | $0.14 |
|
|
143
|
+
| `llmgateway/kimi-k2` | 131K | | | | | | $1 | $3 |
|
|
144
|
+
| `llmgateway/kimi-k2-thinking` | 262K | | | | | | $0.60 | $3 |
|
|
145
|
+
| `llmgateway/kimi-k2-thinking-turbo` | 262K | | | | | | $1 | $8 |
|
|
146
|
+
| `llmgateway/kimi-k2.5` | 262K | | | | | | $0.60 | $3 |
|
|
147
|
+
| `llmgateway/llama-3-70b-instruct` | 8K | | | | | | $0.51 | $0.74 |
|
|
148
|
+
| `llmgateway/llama-3-8b-instruct` | 8K | | | | | | $0.04 | $0.04 |
|
|
149
|
+
| `llmgateway/llama-3.1-70b-instruct` | 128K | | | | | | $0.72 | $0.72 |
|
|
150
|
+
| `llmgateway/llama-3.1-8b-instruct` | 128K | | | | | | $0.22 | $0.22 |
|
|
151
|
+
| `llmgateway/llama-3.1-nemotron-ultra-253b` | 128K | | | | | | $0.60 | $2 |
|
|
152
|
+
| `llmgateway/llama-3.2-11b-instruct` | 128K | | | | | | $0.07 | $0.33 |
|
|
153
|
+
| `llmgateway/llama-3.2-3b-instruct` | 33K | | | | | | $0.03 | $0.05 |
|
|
154
|
+
| `llmgateway/llama-3.3-70b-instruct` | 128K | | | | | | $0.13 | $0.40 |
|
|
155
|
+
| `llmgateway/llama-4-maverick-17b-instruct` | 8K | | | | | | $0.24 | $0.97 |
|
|
156
|
+
| `llmgateway/llama-4-scout` | 33K | | | | | | $0.18 | $0.59 |
|
|
157
|
+
| `llmgateway/llama-4-scout-17b-instruct` | 8K | | | | | | $0.17 | $0.66 |
|
|
158
|
+
| `llmgateway/llama-guard-4-12b` | 131K | | | | | | $0.20 | $0.20 |
|
|
159
|
+
| `llmgateway/minimax-m2` | 197K | | | | | | $0.20 | $1 |
|
|
160
|
+
| `llmgateway/minimax-m2.1` | 197K | | | | | | $0.27 | $1 |
|
|
161
|
+
| `llmgateway/minimax-m2.1-lightning` | 197K | | | | | | $0.12 | $0.48 |
|
|
162
|
+
| `llmgateway/minimax-m2.5` | 205K | | | | | | $0.30 | $1 |
|
|
163
|
+
| `llmgateway/minimax-m2.5-highspeed` | 205K | | | | | | $0.60 | $2 |
|
|
164
|
+
| `llmgateway/minimax-m2.7` | 205K | | | | | | $0.30 | $1 |
|
|
165
|
+
| `llmgateway/minimax-m2.7-highspeed` | 205K | | | | | | $0.60 | $2 |
|
|
166
|
+
| `llmgateway/minimax-text-01` | 1.0M | | | | | | $0.20 | $1 |
|
|
167
|
+
| `llmgateway/ministral-14b-2512` | 262K | | | | | | $0.20 | $0.20 |
|
|
168
|
+
| `llmgateway/ministral-3b-2512` | 131K | | | | | | $0.10 | $0.10 |
|
|
169
|
+
| `llmgateway/ministral-8b-2512` | 262K | | | | | | $0.15 | $0.15 |
|
|
170
|
+
| `llmgateway/mistral-large-2512` | 262K | | | | | | $0.50 | $2 |
|
|
171
|
+
| `llmgateway/mistral-large-latest` | 128K | | | | | | $4 | $12 |
|
|
172
|
+
| `llmgateway/mistral-small-2506` | 128K | | | | | | $0.10 | $0.30 |
|
|
173
|
+
| `llmgateway/mixtral-8x7b-instruct-together` | 33K | | | | | | $0.06 | $0.06 |
|
|
174
|
+
| `llmgateway/o1` | 200K | | | | | | $15 | $60 |
|
|
175
|
+
| `llmgateway/o3` | 200K | | | | | | $2 | $8 |
|
|
176
|
+
| `llmgateway/o3-mini` | 200K | | | | | | $1 | $4 |
|
|
177
|
+
| `llmgateway/o4-mini` | 200K | | | | | | $1 | $4 |
|
|
178
|
+
| `llmgateway/pixtral-large-latest` | 128K | | | | | | $4 | $12 |
|
|
179
|
+
| `llmgateway/qwen-coder-plus` | 131K | | | | | | $1 | $5 |
|
|
180
|
+
| `llmgateway/qwen-flash` | 1.0M | | | | | | $0.05 | $0.40 |
|
|
181
|
+
| `llmgateway/qwen-image` | 2K | | | | | | — | — |
|
|
182
|
+
| `llmgateway/qwen-image-edit-max` | 2K | | | | | | — | — |
|
|
183
|
+
| `llmgateway/qwen-image-edit-plus` | 2K | | | | | | — | — |
|
|
184
|
+
| `llmgateway/qwen-image-max` | 2K | | | | | | — | — |
|
|
185
|
+
| `llmgateway/qwen-image-max-2025-12-30` | 2K | | | | | | — | — |
|
|
186
|
+
| `llmgateway/qwen-image-plus` | 2K | | | | | | — | — |
|
|
187
|
+
| `llmgateway/qwen-max` | 131K | | | | | | $2 | $6 |
|
|
188
|
+
| `llmgateway/qwen-max-latest` | 131K | | | | | | $2 | $6 |
|
|
189
|
+
| `llmgateway/qwen-omni-turbo` | 33K | | | | | | $0.20 | $0.80 |
|
|
190
|
+
| `llmgateway/qwen-plus` | 131K | | | | | | $0.40 | $1 |
|
|
191
|
+
| `llmgateway/qwen-plus-latest` | 1.0M | | | | | | $0.40 | $1 |
|
|
192
|
+
| `llmgateway/qwen-turbo` | 1.0M | | | | | | $0.05 | $0.20 |
|
|
193
|
+
| `llmgateway/qwen-vl-max` | 131K | | | | | | $0.80 | $3 |
|
|
194
|
+
| `llmgateway/qwen-vl-plus` | 131K | | | | | | $0.21 | $0.64 |
|
|
195
|
+
| `llmgateway/qwen2-5-vl-32b-instruct` | 131K | | | | | | $1 | $4 |
|
|
196
|
+
| `llmgateway/qwen2-5-vl-72b-instruct` | 33K | | | | | | $0.13 | $0.40 |
|
|
197
|
+
| `llmgateway/qwen25-coder-7b` | 33K | | | | | | $0.01 | $0.03 |
|
|
198
|
+
| `llmgateway/qwen3-235b-a22b-fp8` | 41K | | | | | | $0.20 | $0.80 |
|
|
199
|
+
| `llmgateway/qwen3-235b-a22b-instruct-2507` | 262K | | | | | | $0.20 | $0.60 |
|
|
200
|
+
| `llmgateway/qwen3-235b-a22b-thinking-2507` | 262K | | | | | | $0.20 | $0.60 |
|
|
201
|
+
| `llmgateway/qwen3-30b-a3b-fp8` | 41K | | | | | | $0.09 | $0.45 |
|
|
202
|
+
| `llmgateway/qwen3-30b-a3b-instruct-2507` | 262K | | | | | | $0.10 | $0.30 |
|
|
203
|
+
| `llmgateway/qwen3-30b-a3b-thinking-2507` | 262K | | | | | | $0.10 | $0.30 |
|
|
204
|
+
| `llmgateway/qwen3-32b` | 33K | | | | | | $0.10 | $0.30 |
|
|
205
|
+
| `llmgateway/qwen3-32b-fp8` | 41K | | | | | | $0.10 | $0.45 |
|
|
206
|
+
| `llmgateway/qwen3-4b-fp8` | 128K | | | | | | $0.03 | $0.03 |
|
|
207
|
+
| `llmgateway/qwen3-coder-30b-a3b-instruct` | 262K | | | | | | $0.10 | $0.30 |
|
|
208
|
+
| `llmgateway/qwen3-coder-480b-a35b-instruct` | 262K | | | | | | $0.40 | $2 |
|
|
209
|
+
| `llmgateway/qwen3-coder-flash` | 1.0M | | | | | | $0.30 | $2 |
|
|
210
|
+
| `llmgateway/qwen3-coder-next` | 262K | | | | | | $0.11 | $0.68 |
|
|
211
|
+
| `llmgateway/qwen3-coder-plus` | 1.0M | | | | | | $6 | $60 |
|
|
212
|
+
| `llmgateway/qwen3-max` | 256K | | | | | | $3 | $15 |
|
|
213
|
+
| `llmgateway/qwen3-max-2026-01-23` | 262K | | | | | | $1 | $6 |
|
|
214
|
+
| `llmgateway/qwen3-next-80b-a3b-instruct` | 129K | | | | | | $0.50 | $2 |
|
|
215
|
+
| `llmgateway/qwen3-next-80b-a3b-thinking` | 131K | | | | | | $0.50 | $6 |
|
|
216
|
+
| `llmgateway/qwen3-vl-235b-a22b-instruct` | 131K | | | | | | $0.50 | $2 |
|
|
217
|
+
| `llmgateway/qwen3-vl-235b-a22b-thinking` | 131K | | | | | | $0.50 | $2 |
|
|
218
|
+
| `llmgateway/qwen3-vl-30b-a3b-instruct` | 131K | | | | | | $0.20 | $0.70 |
|
|
219
|
+
| `llmgateway/qwen3-vl-30b-a3b-thinking` | 131K | | | | | | $0.20 | $1 |
|
|
220
|
+
| `llmgateway/qwen3-vl-8b-instruct` | 131K | | | | | | $0.08 | $0.50 |
|
|
221
|
+
| `llmgateway/qwen3-vl-flash` | 262K | | | | | | $0.05 | $0.40 |
|
|
222
|
+
| `llmgateway/qwen3-vl-plus` | 262K | | | | | | $0.20 | $2 |
|
|
223
|
+
| `llmgateway/qwen35-397b-a17b` | 262K | | | | | | $0.60 | $4 |
|
|
224
|
+
| `llmgateway/qwq-plus` | 131K | | | | | | $0.80 | $2 |
|
|
225
|
+
| `llmgateway/seed-1-6-250615` | 256K | | | | | | $0.25 | $2 |
|
|
226
|
+
| `llmgateway/seed-1-6-250915` | 256K | | | | | | $0.25 | $2 |
|
|
227
|
+
| `llmgateway/seed-1-6-flash-250715` | 256K | | | | | | $0.07 | $0.30 |
|
|
228
|
+
| `llmgateway/seed-1-8-251228` | 256K | | | | | | $0.25 | $2 |
|
|
229
|
+
| `llmgateway/seedream-4-0` | 2K | | | | | | — | — |
|
|
230
|
+
| `llmgateway/seedream-4-5` | 2K | | | | | | — | — |
|
|
231
|
+
| `llmgateway/sonar` | 130K | | | | | | $1 | $1 |
|
|
232
|
+
| `llmgateway/sonar-pro` | 200K | | | | | | $3 | $15 |
|
|
233
|
+
| `llmgateway/sonar-reasoning-pro` | 128K | | | | | | $2 | $8 |
|
|
234
|
+
| `llmgateway/veo-3.1-fast-generate-preview` | 33K | | | | | | — | — |
|
|
235
|
+
| `llmgateway/veo-3.1-generate-preview` | 33K | | | | | | — | — |
|
|
236
|
+
|
|
237
|
+
## Advanced configuration
|
|
238
|
+
|
|
239
|
+
### Custom headers
|
|
240
|
+
|
|
241
|
+
```typescript
|
|
242
|
+
const agent = new Agent({
|
|
243
|
+
id: "custom-agent",
|
|
244
|
+
name: "custom-agent",
|
|
245
|
+
model: {
|
|
246
|
+
url: "https://api.llmgateway.io/v1",
|
|
247
|
+
id: "llmgateway/auto",
|
|
248
|
+
apiKey: process.env.LLMGATEWAY_API_KEY,
|
|
249
|
+
headers: {
|
|
250
|
+
"X-Custom-Header": "value"
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
});
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Dynamic model selection
|
|
257
|
+
|
|
258
|
+
```typescript
|
|
259
|
+
const agent = new Agent({
|
|
260
|
+
id: "dynamic-agent",
|
|
261
|
+
name: "Dynamic Agent",
|
|
262
|
+
model: ({ requestContext }) => {
|
|
263
|
+
const useAdvanced = requestContext.task === "complex";
|
|
264
|
+
return useAdvanced
|
|
265
|
+
? "llmgateway/veo-3.1-generate-preview"
|
|
266
|
+
: "llmgateway/auto";
|
|
267
|
+
}
|
|
268
|
+
});
|
|
269
|
+
```
|
|
@@ -45,6 +45,7 @@ Direct access to individual AI model providers. Each provider offers unique mode
|
|
|
45
45
|
- [Kimi For Coding](https://mastra.ai/models/providers/kimi-for-coding)
|
|
46
46
|
- [KUAE Cloud Coding Plan](https://mastra.ai/models/providers/kuae-cloud-coding-plan)
|
|
47
47
|
- [Llama](https://mastra.ai/models/providers/llama)
|
|
48
|
+
- [LLM Gateway](https://mastra.ai/models/providers/llmgateway)
|
|
48
49
|
- [LMStudio](https://mastra.ai/models/providers/lmstudio)
|
|
49
50
|
- [LucidQuery AI](https://mastra.ai/models/providers/lucidquery)
|
|
50
51
|
- [Meganova](https://mastra.ai/models/providers/meganova)
|
|
@@ -35,7 +35,7 @@ console.log(`Processed ${result.summary.totalItems} items`)
|
|
|
35
35
|
|
|
36
36
|
**data** (`RunEvalsDataItem[]`): Array of test cases with input data and optional ground truth.
|
|
37
37
|
|
|
38
|
-
**scorers** (`MastraScorer[] | WorkflowScorerConfig`):
|
|
38
|
+
**scorers** (`MastraScorer[] | AgentScorerConfig | WorkflowScorerConfig`): Scorers to use. A flat array applies all scorers to the raw output. For agents, an `AgentScorerConfig` object separates agent-level and trajectory scorers. For workflows, a `WorkflowScorerConfig` object specifies scorers for the workflow, individual steps, and trajectory.
|
|
39
39
|
|
|
40
40
|
**targetOptions** (`AgentExecutionOptions | WorkflowRunOptions`): Options forwarded to the target during execution. For agents: options passed to agent.generate() (e.g. maxSteps, modelSettings, instructions). For workflows: options passed to run.start() (e.g. perStep, outputOptions, initialState).
|
|
41
41
|
|
|
@@ -49,20 +49,32 @@ console.log(`Processed ${result.summary.totalItems} items`)
|
|
|
49
49
|
|
|
50
50
|
**groundTruth** (`any`): Expected or reference output for comparison during scoring.
|
|
51
51
|
|
|
52
|
+
**expectedTrajectory** (`TrajectoryExpectation`): Expected trajectory configuration for trajectory scoring. Includes expected steps, ordering, efficiency budgets, blacklists, and tool failure tolerance. Passed to trajectory scorers as `run.expectedTrajectory`. Per-item values override the static defaults set in scorer constructors.
|
|
53
|
+
|
|
52
54
|
**requestContext** (`RequestContext`): Request Context to pass to the target during execution.
|
|
53
55
|
|
|
54
56
|
**tracingContext** (`TracingContext`): Tracing context for observability and debugging.
|
|
55
57
|
|
|
56
58
|
**startOptions** (`WorkflowRunOptions`): Per-item workflow run options (e.g. initialState, perStep, outputOptions). Merged on top of targetOptions, so per-item values take precedence. Only applicable when the target is a workflow.
|
|
57
59
|
|
|
60
|
+
## Agent scorer configuration
|
|
61
|
+
|
|
62
|
+
For agents, use `AgentScorerConfig` to separate agent-level scorers from trajectory scorers:
|
|
63
|
+
|
|
64
|
+
**agent** (`MastraScorer[]`): Scorers that receive the raw agent output (`MastraDBMessage[]`). Use for evaluating response quality, content, etc.
|
|
65
|
+
|
|
66
|
+
**trajectory** (`MastraScorer[]`): Scorers that receive a pre-extracted Trajectory object. When storage is configured, the pipeline extracts a hierarchical trajectory from observability traces (including nested tool calls and model generations). Otherwise, it falls back to extracting tool calls from agent messages.
|
|
67
|
+
|
|
58
68
|
## Workflow scorer configuration
|
|
59
69
|
|
|
60
|
-
For workflows,
|
|
70
|
+
For workflows, use `WorkflowScorerConfig` to specify scorers at different levels:
|
|
61
71
|
|
|
62
|
-
**workflow** (`MastraScorer[]`):
|
|
72
|
+
**workflow** (`MastraScorer[]`): Scorers to evaluate the entire workflow output.
|
|
63
73
|
|
|
64
74
|
**steps** (`Record<string, MastraScorer[]>`): Object mapping step IDs to arrays of scorers for evaluating individual step outputs.
|
|
65
75
|
|
|
76
|
+
**trajectory** (`MastraScorer[]`): Scorers that receive a pre-extracted Trajectory from the workflow execution. When storage is configured, the pipeline extracts a hierarchical trajectory from observability traces (including nested agent runs and tool calls within workflow steps). Otherwise, it falls back to extracting step results from the workflow output.
|
|
77
|
+
|
|
66
78
|
## Returns
|
|
67
79
|
|
|
68
80
|
**scores** (`Record<string, any>`): Average scores across all test cases, organized by scorer name.
|
|
@@ -105,6 +117,36 @@ const result = await runEvals({
|
|
|
105
117
|
})
|
|
106
118
|
```
|
|
107
119
|
|
|
120
|
+
### Agent trajectory evaluation
|
|
121
|
+
|
|
122
|
+
Use `AgentScorerConfig` to evaluate both the agent response and its tool-calling trajectory:
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
import { runEvals } from '@mastra/core/evals'
|
|
126
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/code/trajectory'
|
|
127
|
+
|
|
128
|
+
const trajectoryScorer = createTrajectoryAccuracyScorerCode()
|
|
129
|
+
|
|
130
|
+
const result = await runEvals({
|
|
131
|
+
target: chatAgent,
|
|
132
|
+
data: [
|
|
133
|
+
{
|
|
134
|
+
input: 'What is the weather in London?',
|
|
135
|
+
expectedTrajectory: {
|
|
136
|
+
steps: [{ stepType: 'tool_call', name: 'weatherTool' }],
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
],
|
|
140
|
+
scorers: {
|
|
141
|
+
// agent: [responseQualityScorer], // Optional: add agent-level scorers
|
|
142
|
+
trajectory: [trajectoryScorer],
|
|
143
|
+
},
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
// result.scores.agent — average agent-level scores
|
|
147
|
+
// result.scores.trajectory — average trajectory scores
|
|
148
|
+
```
|
|
149
|
+
|
|
108
150
|
### Agent with `targetOptions`
|
|
109
151
|
|
|
110
152
|
Pass execution options like `maxSteps` or `modelSettings` to customize agent behavior during evaluation:
|
|
@@ -149,6 +191,37 @@ const workflowResult = await runEvals({
|
|
|
149
191
|
})
|
|
150
192
|
```
|
|
151
193
|
|
|
194
|
+
### Workflow trajectory evaluation
|
|
195
|
+
|
|
196
|
+
Add trajectory scoring to workflow evaluations to validate step execution order:
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
const workflowResult = await runEvals({
|
|
200
|
+
target: myWorkflow,
|
|
201
|
+
data: [
|
|
202
|
+
{
|
|
203
|
+
input: { query: 'Process this data' },
|
|
204
|
+
expectedTrajectory: {
|
|
205
|
+
steps: [
|
|
206
|
+
{ stepType: 'workflow_step', name: 'validate' },
|
|
207
|
+
{ stepType: 'workflow_step', name: 'process' },
|
|
208
|
+
{ stepType: 'workflow_step', name: 'output' },
|
|
209
|
+
],
|
|
210
|
+
},
|
|
211
|
+
},
|
|
212
|
+
],
|
|
213
|
+
scorers: {
|
|
214
|
+
workflow: [outputQualityScorer],
|
|
215
|
+
steps: {
|
|
216
|
+
validate: [validationScorer],
|
|
217
|
+
},
|
|
218
|
+
trajectory: [trajectoryScorer],
|
|
219
|
+
},
|
|
220
|
+
})
|
|
221
|
+
|
|
222
|
+
// workflowResult.scores.trajectory — workflow trajectory scores
|
|
223
|
+
```
|
|
224
|
+
|
|
152
225
|
### Workflow with per-item `startOptions`
|
|
153
226
|
|
|
154
227
|
Use `startOptions` on individual data items to customize each workflow run. Per-item values take precedence over `targetOptions`:
|
|
@@ -175,5 +248,7 @@ const result = await runEvals({
|
|
|
175
248
|
|
|
176
249
|
- [createScorer()](https://mastra.ai/reference/evals/create-scorer) - Create custom scorers for experiments
|
|
177
250
|
- [MastraScorer](https://mastra.ai/reference/evals/mastra-scorer) - Learn about scorer structure and methods
|
|
251
|
+
- [Trajectory Accuracy](https://mastra.ai/reference/evals/trajectory-accuracy) - Built-in trajectory evaluation scorers
|
|
252
|
+
- [Scorer Utilities](https://mastra.ai/reference/evals/scorer-utils) - Helper functions for extracting trajectory data
|
|
178
253
|
- [Custom Scorers](https://mastra.ai/docs/evals/custom-scorers) - Guide to building evaluation logic
|
|
179
254
|
- [Scorers Overview](https://mastra.ai/docs/evals/overview) - Understanding scorer concepts
|
|
@@ -14,9 +14,21 @@ import {
|
|
|
14
14
|
extractToolCalls,
|
|
15
15
|
extractInputMessages,
|
|
16
16
|
extractAgentResponseMessages,
|
|
17
|
+
compareTrajectories,
|
|
18
|
+
createTrajectoryTestRun,
|
|
17
19
|
} from '@mastra/evals/scorers/utils'
|
|
18
20
|
```
|
|
19
21
|
|
|
22
|
+
Trajectory extraction functions are available from `@mastra/core/evals`:
|
|
23
|
+
|
|
24
|
+
```typescript
|
|
25
|
+
import {
|
|
26
|
+
extractTrajectory,
|
|
27
|
+
extractWorkflowTrajectory,
|
|
28
|
+
extractTrajectoryFromTrace,
|
|
29
|
+
} from '@mastra/core/evals'
|
|
30
|
+
```
|
|
31
|
+
|
|
20
32
|
## Message extraction
|
|
21
33
|
|
|
22
34
|
### `getAssistantMessageFromRunOutput`
|
|
@@ -266,6 +278,178 @@ const result = await myScorer.run({
|
|
|
266
278
|
})
|
|
267
279
|
```
|
|
268
280
|
|
|
281
|
+
## Trajectory utilities
|
|
282
|
+
|
|
283
|
+
### `extractTrajectory`
|
|
284
|
+
|
|
285
|
+
Extracts a `Trajectory` from agent output messages (`MastraDBMessage[]`). Converts tool invocations into `ToolCallStep` objects. The `runEvals` pipeline calls this automatically for agent trajectory scorers as the fallback when no storage is configured, so you only need it for direct testing.
|
|
286
|
+
|
|
287
|
+
Available from `@mastra/core/evals`.
|
|
288
|
+
|
|
289
|
+
```typescript
|
|
290
|
+
import { extractTrajectory } from '@mastra/core/evals'
|
|
291
|
+
|
|
292
|
+
const trajectory = extractTrajectory(agentOutputMessages)
|
|
293
|
+
// trajectory.steps — ToolCallStep[] extracted from toolInvocations
|
|
294
|
+
// trajectory.rawOutput — the original MastraDBMessage[] array
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawOutput`.
|
|
298
|
+
|
|
299
|
+
### `extractWorkflowTrajectory`
|
|
300
|
+
|
|
301
|
+
Extracts a `Trajectory` from workflow step results. Converts `StepResult` records into `WorkflowStepStep` objects, respecting the execution path ordering.
|
|
302
|
+
|
|
303
|
+
Available from `@mastra/core/evals`.
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { extractWorkflowTrajectory } from '@mastra/core/evals'
|
|
307
|
+
|
|
308
|
+
const trajectory = extractWorkflowTrajectory(
|
|
309
|
+
workflowResult.steps, // Record<string, StepResult>
|
|
310
|
+
workflowResult.stepExecutionPath, // string[] (optional)
|
|
311
|
+
)
|
|
312
|
+
// trajectory.steps — WorkflowStepStep[] in execution order
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawWorkflowResult`.
|
|
316
|
+
|
|
317
|
+
### `extractTrajectoryFromTrace`
|
|
318
|
+
|
|
319
|
+
Builds a hierarchical `Trajectory` from observability trace spans (`SpanRecord[]`). Reconstructs the parent-child span tree and maps each span to the appropriate `TrajectoryStep` discriminated union type with nested `children`.
|
|
320
|
+
|
|
321
|
+
This is the preferred extraction method when storage is available. The `runEvals` pipeline calls this automatically when the target's `Mastra` instance has a configured storage backend. It produces richer trajectories than `extractTrajectory` or `extractWorkflowTrajectory` because it captures the full execution tree, including nested agent runs, tool calls, and model generations.
|
|
322
|
+
|
|
323
|
+
Available from `@mastra/core/evals`.
|
|
324
|
+
|
|
325
|
+
```typescript
|
|
326
|
+
import { extractTrajectoryFromTrace } from '@mastra/core/evals'
|
|
327
|
+
|
|
328
|
+
// After fetching a trace from the observability store
|
|
329
|
+
const traceData = await observabilityStore.getTrace({ traceId })
|
|
330
|
+
const trajectory = extractTrajectoryFromTrace(traceData.spans, rootSpanId)
|
|
331
|
+
// trajectory.steps — hierarchical TrajectoryStep[] with children
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
**Parameters:**
|
|
335
|
+
|
|
336
|
+
- `spans` (`SpanRecord[]`) — Array of span records from a trace query.
|
|
337
|
+
- `rootSpanId` (`string`, optional) — Span ID to use as the starting point. When omitted, extraction starts from every span that has no parent.
|
|
338
|
+
|
|
339
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]` with recursive `children` and `totalDurationMs`.
|
|
340
|
+
|
|
341
|
+
#### Span type mapping
|
|
342
|
+
|
|
343
|
+
| Span type | Trajectory step type | Key fields extracted |
|
|
344
|
+
| ---------------------- | ---------------------- | ------------------------------------------------------------- |
|
|
345
|
+
| `TOOL_CALL` | `tool_call` | `toolArgs`, `toolResult`, `success` |
|
|
346
|
+
| `MCP_TOOL_CALL` | `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
|
|
347
|
+
| `MODEL_GENERATION` | `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
|
|
348
|
+
| `AGENT_RUN` | `agent_run` | `agentId` (from entity ID) |
|
|
349
|
+
| `WORKFLOW_RUN` | `workflow_run` | `workflowId` (from entity ID) |
|
|
350
|
+
| `WORKFLOW_STEP` | `workflow_step` | `output` |
|
|
351
|
+
| `WORKFLOW_CONDITIONAL` | `workflow_conditional` | `conditionCount`, `selectedSteps` |
|
|
352
|
+
| `WORKFLOW_PARALLEL` | `workflow_parallel` | `branchCount`, `parallelSteps` |
|
|
353
|
+
| `WORKFLOW_LOOP` | `workflow_loop` | `loopType`, `totalIterations` |
|
|
354
|
+
| `WORKFLOW_SLEEP` | `workflow_sleep` | `sleepDurationMs`, `sleepType` |
|
|
355
|
+
| `WORKFLOW_WAIT_EVENT` | `workflow_wait_event` | `eventName`, `eventReceived` |
|
|
356
|
+
| `PROCESSOR_RUN` | `processor_run` | `processorId` |
|
|
357
|
+
|
|
358
|
+
Spans with types `GENERIC`, `MODEL_STEP`, `MODEL_CHUNK`, and `WORKFLOW_CONDITIONAL_EVAL` are skipped as noise.
|
|
359
|
+
|
|
360
|
+
### `compareTrajectories`
|
|
361
|
+
|
|
362
|
+
Compares an actual trajectory against an expected trajectory and returns a detailed comparison result. Used internally by `createTrajectoryAccuracyScorerCode`.
|
|
363
|
+
|
|
364
|
+
The `expected` parameter accepts either a full `Trajectory` (the same shape as an actual trajectory) or `{ steps: ExpectedStep[] }`. When using `ExpectedStep[]`, you can match by name only, by name and `stepType`, or include `data` for comparison. See [Expected steps](https://mastra.ai/reference/evals/trajectory-accuracy#expected-steps) for details.
|
|
365
|
+
|
|
366
|
+
```typescript
|
|
367
|
+
import { compareTrajectories } from '@mastra/evals/scorers/utils'
|
|
368
|
+
|
|
369
|
+
// Using ExpectedStep[] (recommended for expectations)
|
|
370
|
+
const result = compareTrajectories(
|
|
371
|
+
actualTrajectory,
|
|
372
|
+
{ steps: [{ name: 'search' }, { name: 'summarize', stepType: 'tool_call' }] },
|
|
373
|
+
{ compareStepData: false, allowRepeatedSteps: true },
|
|
374
|
+
)
|
|
375
|
+
// result.score — 0.0 to 1.0
|
|
376
|
+
// result.missingSteps — step names not found
|
|
377
|
+
// result.extraSteps — unexpected step names
|
|
378
|
+
// result.outOfOrderSteps — steps found but in wrong order
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
**Returns:** `TrajectoryComparisonResult`
|
|
382
|
+
|
|
383
|
+
### `createTrajectoryTestRun`
|
|
384
|
+
|
|
385
|
+
Creates a test run object for trajectory scorers. Wraps a `Trajectory` into the expected `ScorerRun` format.
|
|
386
|
+
|
|
387
|
+
```typescript
|
|
388
|
+
import { createTrajectoryTestRun } from '@mastra/evals/scorers/utils'
|
|
389
|
+
|
|
390
|
+
const run = createTrajectoryTestRun({
|
|
391
|
+
steps: [
|
|
392
|
+
{ stepType: 'tool_call', name: 'search', toolArgs: { q: 'test' } },
|
|
393
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
394
|
+
],
|
|
395
|
+
})
|
|
396
|
+
|
|
397
|
+
const result = await trajectoryScorer.run(run)
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
### `checkTrajectoryEfficiency`
|
|
401
|
+
|
|
402
|
+
Evaluates trajectory efficiency against step, token, and duration budgets. Also detects redundant calls (same tool with same arguments).
|
|
403
|
+
|
|
404
|
+
```typescript
|
|
405
|
+
import { checkTrajectoryEfficiency } from '@mastra/evals/scorers/utils'
|
|
406
|
+
|
|
407
|
+
const result = checkTrajectoryEfficiency(trajectory, {
|
|
408
|
+
maxSteps: 5,
|
|
409
|
+
maxTotalTokens: 2000,
|
|
410
|
+
maxTotalDurationMs: 5000,
|
|
411
|
+
noRedundantCalls: true,
|
|
412
|
+
})
|
|
413
|
+
// result.score — 1.0 if within all budgets, lower with penalties
|
|
414
|
+
// result.redundantCalls — duplicate tool+args combos
|
|
415
|
+
// result.overBudget — which budgets were exceeded
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
**Returns:** `TrajectoryEfficiencyResult`
|
|
419
|
+
|
|
420
|
+
### `checkTrajectoryBlacklist`
|
|
421
|
+
|
|
422
|
+
Checks whether a trajectory contains forbidden tools or tool call sequences.
|
|
423
|
+
|
|
424
|
+
```typescript
|
|
425
|
+
import { checkTrajectoryBlacklist } from '@mastra/evals/scorers/utils'
|
|
426
|
+
|
|
427
|
+
const result = checkTrajectoryBlacklist(trajectory, {
|
|
428
|
+
blacklistedTools: ['deleteAll', 'admin-override'],
|
|
429
|
+
blacklistedSequences: [['escalate', 'admin-override']],
|
|
430
|
+
})
|
|
431
|
+
// result.passed — true if no violations
|
|
432
|
+
// result.violations — list of violations with type and details
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
**Returns:** `TrajectoryBlacklistResult`
|
|
436
|
+
|
|
437
|
+
### `analyzeToolFailures`
|
|
438
|
+
|
|
439
|
+
Detects tool failure patterns including retries, fallbacks, and argument corrections.
|
|
440
|
+
|
|
441
|
+
```typescript
|
|
442
|
+
import { analyzeToolFailures } from '@mastra/evals/scorers/utils'
|
|
443
|
+
|
|
444
|
+
const result = analyzeToolFailures(trajectory, {
|
|
445
|
+
maxRetriesPerTool: 3,
|
|
446
|
+
})
|
|
447
|
+
// result.score — 1.0 if no failure patterns, lower if patterns detected
|
|
448
|
+
// result.patterns — detected patterns (retry, fallback, arg_correction)
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
**Returns:** `ToolFailureAnalysisResult`
|
|
452
|
+
|
|
269
453
|
## Complete example
|
|
270
454
|
|
|
271
455
|
Here's a complete example showing how to use multiple utilities together:
|
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
# Trajectory accuracy scorers
|
|
2
|
+
|
|
3
|
+
Mastra provides two trajectory accuracy scorers for evaluating whether an agent or workflow follows an expected sequence of actions:
|
|
4
|
+
|
|
5
|
+
1. **Code-based scorer** - Deterministic evaluation using exact step matching and ordering
|
|
6
|
+
2. **LLM-based scorer** - Semantic evaluation using AI to assess trajectory quality and appropriateness
|
|
7
|
+
|
|
8
|
+
Both scorers work with agents and workflows. The `runEvals` pipeline automatically extracts trajectories, so scorers receive a `Trajectory` object directly.
|
|
9
|
+
|
|
10
|
+
## Trajectory extraction
|
|
11
|
+
|
|
12
|
+
The `runEvals` pipeline uses one of two extraction strategies, depending on whether observability storage is configured:
|
|
13
|
+
|
|
14
|
+
### Trace-based extraction (preferred)
|
|
15
|
+
|
|
16
|
+
When the target's `Mastra` instance has storage configured, the pipeline fetches the full execution trace from the observability store and calls `extractTrajectoryFromTrace()`. This produces a hierarchical trajectory with nested `children`, capturing the complete execution tree — including nested agent runs, tool calls within workflow steps, and model generations.
|
|
17
|
+
|
|
18
|
+
For example, a workflow that calls an agent, which in turn calls tools, produces:
|
|
19
|
+
|
|
20
|
+
```text
|
|
21
|
+
workflow_run
|
|
22
|
+
└─ workflow_step (validate-input)
|
|
23
|
+
└─ workflow_step (process-data)
|
|
24
|
+
└─ agent_run (my-agent)
|
|
25
|
+
└─ model_generation
|
|
26
|
+
└─ tool_call (search)
|
|
27
|
+
└─ model_generation
|
|
28
|
+
└─ tool_call (summarize)
|
|
29
|
+
└─ workflow_step (save-result)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Fallback extraction
|
|
33
|
+
|
|
34
|
+
When storage is not available, the pipeline falls back to:
|
|
35
|
+
|
|
36
|
+
- **Agents:** `extractTrajectory()` — Extracts `ToolCallStep` entries from `toolInvocations` in the agent's message output. Produces a flat list of tool calls.
|
|
37
|
+
- **Workflows:** `extractWorkflowTrajectory()` — Extracts `WorkflowStepStep` entries from `stepResults`. Produces a flat list of workflow steps.
|
|
38
|
+
|
|
39
|
+
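These fallbacks produce flat trajectories. They don't capture nested execution, and they only include tool calls (for agents) or workflow steps (for workflows) — other span types, such as model generations, are omitted.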
|
|
40
|
+
|
|
41
|
+
## Trajectory types
|
|
42
|
+
|
|
43
|
+
Trajectory steps use a discriminated union on `stepType`. Each step type has specific properties:
|
|
44
|
+
|
|
45
|
+
### `ToolCallStep`
|
|
46
|
+
|
|
47
|
+
Represents an agent tool call.
|
|
48
|
+
|
|
49
|
+
**stepType** (`'tool_call'`): Discriminant.
|
|
50
|
+
|
|
51
|
+
**name** (`string`): Tool name.
|
|
52
|
+
|
|
53
|
+
**toolArgs** (`Record<string, unknown>`): Arguments passed to the tool.
|
|
54
|
+
|
|
55
|
+
**toolResult** (`Record<string, unknown>`): Result returned by the tool.
|
|
56
|
+
|
|
57
|
+
**success** (`boolean`): Whether the call succeeded.
|
|
58
|
+
|
|
59
|
+
**durationMs** (`number`): Execution time in milliseconds.
|
|
60
|
+
|
|
61
|
+
**metadata** (`Record<string, unknown>`): Arbitrary metadata.
|
|
62
|
+
|
|
63
|
+
**children** (`TrajectoryStep[]`): Nested sub-steps.
|
|
64
|
+
|
|
65
|
+
### `WorkflowStepStep`
|
|
66
|
+
|
|
67
|
+
Represents a workflow step execution.
|
|
68
|
+
|
|
69
|
+
**stepType** (`'workflow_step'`): Discriminant.
|
|
70
|
+
|
|
71
|
+
**name** (`string`): Step identifier.
|
|
72
|
+
|
|
73
|
+
**stepId** (`string`): Step ID in the workflow.
|
|
74
|
+
|
|
75
|
+
**status** (`string`): Step result status (success, failed, suspended, etc.).
|
|
76
|
+
|
|
77
|
+
**output** (`Record<string, unknown>`): Step output data.
|
|
78
|
+
|
|
79
|
+
**durationMs** (`number`): Execution time in milliseconds.
|
|
80
|
+
|
|
81
|
+
**metadata** (`Record<string, unknown>`): Arbitrary metadata.
|
|
82
|
+
|
|
83
|
+
**children** (`TrajectoryStep[]`): Nested sub-steps (e.g. tool calls inside the step).
|
|
84
|
+
|
|
85
|
+
### Other step types
|
|
86
|
+
|
|
87
|
+
The discriminated union includes these additional step types:
|
|
88
|
+
|
|
89
|
+
| Step type | Key properties |
|
|
90
|
+
| ---------------------- | ------------------------------------------------------------- |
|
|
91
|
+
| `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
|
|
92
|
+
| `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
|
|
93
|
+
| `agent_run` | `agentId` |
|
|
94
|
+
| `workflow_run` | `workflowId`, `status` |
|
|
95
|
+
| `workflow_conditional` | `conditionCount`, `selectedSteps` |
|
|
96
|
+
| `workflow_parallel` | `branchCount`, `parallelSteps` |
|
|
97
|
+
| `workflow_loop` | `loopType`, `totalIterations` |
|
|
98
|
+
| `workflow_sleep`       | `sleepDurationMs`, `sleepType`                                |
|
|
99
|
+
| `workflow_wait_event` | `eventName`, `eventReceived` |
|
|
100
|
+
| `processor_run` | `processorId` |
|
|
101
|
+
|
|
102
|
+
All step types share the base properties `name`, `durationMs`, `metadata`, and `children`.
|
|
103
|
+
|
|
104
|
+
## Expected steps
|
|
105
|
+
|
|
106
|
+
When defining expected trajectories, use `ExpectedStep` instead of the full `TrajectoryStep` discriminated union. `ExpectedStep` is a simpler type designed for expectations:
|
|
107
|
+
|
|
108
|
+
**name** (`string`): Step name to match (tool name, agent ID, workflow step name, etc.).
|
|
109
|
+
|
|
110
|
+
**stepType** (`TrajectoryStepType`): Step type to match. If omitted, matches any step type with the given name.
|
|
111
|
+
|
|
112
|
+
**data** (`Record<string, unknown>`): Expected step data. Compared against the actual step's type-specific data (toolArgs for tool\_call, output for workflow\_step, etc.).
|
|
113
|
+
|
|
114
|
+
**children** (`TrajectoryExpectation`): Nested expectation config for this step's children. Overrides the parent config for evaluating children of this step.
|
|
115
|
+
|
|
116
|
+
### Simple expected steps
|
|
117
|
+
|
|
118
|
+
```typescript
|
|
119
|
+
const steps: ExpectedStep[] = [
|
|
120
|
+
// Match by name only (any step type)
|
|
121
|
+
{ name: 'search' },
|
|
122
|
+
|
|
123
|
+
// Match by name and step type
|
|
124
|
+
{ name: 'search', stepType: 'tool_call' },
|
|
125
|
+
|
|
126
|
+
// Match with expected data
|
|
127
|
+
{ name: 'search', stepType: 'tool_call', data: { input: { query: 'weather' } } },
|
|
128
|
+
]
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Nested expectations
|
|
132
|
+
|
|
133
|
+
Each expected step can include a `children` config with its own evaluation rules. This lets you set different ordering or comparison rules at each level of the hierarchy.
|
|
134
|
+
|
|
135
|
+
```typescript
|
|
136
|
+
const scorer = createTrajectoryScorerCode({
|
|
137
|
+
defaults: {
|
|
138
|
+
ordering: 'strict',
|
|
139
|
+
steps: [
|
|
140
|
+
{ name: 'validate-input', stepType: 'workflow_step' },
|
|
141
|
+
{
|
|
142
|
+
name: 'research-agent',
|
|
143
|
+
stepType: 'agent_run',
|
|
144
|
+
children: {
|
|
145
|
+
// Sub-agent can call tools in any order
|
|
146
|
+
ordering: 'unordered',
|
|
147
|
+
steps: [
|
|
148
|
+
{ name: 'search', stepType: 'tool_call' },
|
|
149
|
+
{ name: 'summarize', stepType: 'tool_call' },
|
|
150
|
+
],
|
|
151
|
+
},
|
|
152
|
+
},
|
|
153
|
+
{ name: 'save-result', stepType: 'workflow_step' },
|
|
154
|
+
],
|
|
155
|
+
},
|
|
156
|
+
})
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
In this example, the parent workflow requires strict ordering of its steps, but the nested `research-agent` allows its tool calls in any order.
|
|
160
|
+
|
|
161
|
+
## Choosing between scorers
|
|
162
|
+
|
|
163
|
+
### Use the code-based scorer when:
|
|
164
|
+
|
|
165
|
+
- You need **deterministic, reproducible** results
|
|
166
|
+
- You have a **known expected trajectory** to compare against
|
|
167
|
+
- You want to validate **exact step sequences**
|
|
168
|
+
- Speed and cost are priorities (no LLM calls)
|
|
169
|
+
- You are running automated tests in CI/CD
|
|
170
|
+
|
|
171
|
+
### Use the LLM-based scorer when:
|
|
172
|
+
|
|
173
|
+
- You need **semantic understanding** of whether steps were appropriate
|
|
174
|
+
- The optimal trajectory is **not predetermined** (evaluate based on task requirements)
|
|
175
|
+
- You want to detect **unnecessary, redundant, or missing** steps
|
|
176
|
+
- You need **explanations** for scoring decisions
|
|
177
|
+
- You are evaluating **production agent behavior**
|
|
178
|
+
|
|
179
|
+
## Code-based trajectory accuracy scorer
|
|
180
|
+
|
|
181
|
+
The `createTrajectoryAccuracyScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides deterministic scoring based on step matching and ordering against an expected trajectory.
|
|
182
|
+
|
|
183
|
+
### Parameters
|
|
184
|
+
|
|
185
|
+
**expectedTrajectory** (`TrajectoryExpectation`): Static expected trajectory to compare against. When provided, all dataset items use this trajectory. When omitted, the scorer reads expectedTrajectory from each dataset item at runtime.
|
|
186
|
+
|
|
187
|
+
**comparisonOptions** (`TrajectoryComparisonOptions`): Controls how the comparison is performed.
|
|
188
|
+
|
|
189
|
+
This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.
|
|
190
|
+
|
|
191
|
+
### Expected trajectory sources
|
|
192
|
+
|
|
193
|
+
The code-based scorer resolves `expectedTrajectory` from two sources, in order of priority:
|
|
194
|
+
|
|
195
|
+
1. **Constructor option** — A static trajectory passed when creating the scorer. Used for all dataset items.
|
|
196
|
+
2. **Dataset item** — An `expectedTrajectory` field on the dataset item, passed through the `runEvals` pipeline. Allows different expected trajectories per item.
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
// Static: same expected trajectory for all items
|
|
200
|
+
const scorer = createTrajectoryAccuracyScorerCode({
|
|
201
|
+
expectedTrajectory: {
|
|
202
|
+
steps: [
|
|
203
|
+
{ stepType: 'tool_call', name: 'search' },
|
|
204
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
205
|
+
],
|
|
206
|
+
},
|
|
207
|
+
})
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
```typescript
|
|
211
|
+
// Per-item: each dataset item has its own expectedTrajectory
|
|
212
|
+
const scorer = createTrajectoryAccuracyScorerCode()
|
|
213
|
+
|
|
214
|
+
await runEvals({
|
|
215
|
+
target: myAgent,
|
|
216
|
+
scorers: { trajectory: [scorer] },
|
|
217
|
+
data: [
|
|
218
|
+
{
|
|
219
|
+
input: 'Search and summarize weather',
|
|
220
|
+
expectedTrajectory: {
|
|
221
|
+
steps: [
|
|
222
|
+
{ stepType: 'tool_call', name: 'search' },
|
|
223
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
224
|
+
],
|
|
225
|
+
},
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
input: 'Just search for weather',
|
|
229
|
+
expectedTrajectory: {
|
|
230
|
+
steps: [{ stepType: 'tool_call', name: 'search' }],
|
|
231
|
+
},
|
|
232
|
+
},
|
|
233
|
+
],
|
|
234
|
+
})
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Evaluation modes
|
|
238
|
+
|
|
239
|
+
The code-based scorer operates in two modes based on `strictOrder`:
|
|
240
|
+
|
|
241
|
+
#### Strict mode (`strictOrder: true`)
|
|
242
|
+
|
|
243
|
+
Requires an exact match. The actual steps must match the expected steps in the same order with no extra or missing steps. Returns `1.0` for an exact match and `0.0` otherwise.
|
|
244
|
+
|
|
245
|
+
#### Relaxed mode (`strictOrder: false`, default)
|
|
246
|
+
|
|
247
|
+
Allows extra steps. Expected steps must appear in the correct relative order. The score is calculated based on how many expected steps were matched, with optional penalties for extra or repeated steps.
|
|
248
|
+
|
|
249
|
+
## Code-based scoring details
|
|
250
|
+
|
|
251
|
+
- **Continuous scores**: Returns values between 0.0 and 1.0 in relaxed mode; binary (0 or 1) in strict mode
|
|
252
|
+
- **Deterministic**: Same input always produces the same output
|
|
253
|
+
- **Fast**: No external API calls
|
|
254
|
+
|
|
255
|
+
### Code-based scorer results
|
|
256
|
+
|
|
257
|
+
```typescript
|
|
258
|
+
{
|
|
259
|
+
runId: string,
|
|
260
|
+
preprocessStepResult: {
|
|
261
|
+
actualTrajectory: Trajectory,
|
|
262
|
+
expectedTrajectory: Trajectory,
|
|
263
|
+
comparison: {
|
|
264
|
+
score: number,
|
|
265
|
+
matchedSteps: number,
|
|
266
|
+
totalExpectedSteps: number,
|
|
267
|
+
totalActualSteps: number,
|
|
268
|
+
missingSteps: string[],
|
|
269
|
+
extraSteps: string[],
|
|
270
|
+
outOfOrderSteps: string[],
|
|
271
|
+
repeatedSteps: string[]
|
|
272
|
+
},
|
|
273
|
+
actualStepNames: string[],
|
|
274
|
+
expectedStepNames: string[]
|
|
275
|
+
},
|
|
276
|
+
score: number
|
|
277
|
+
}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Code-based scorer examples
|
|
281
|
+
|
|
282
|
+
### Agent trajectory with strict ordering
|
|
283
|
+
|
|
284
|
+
Validates that an agent follows an exact sequence of tool calls:
|
|
285
|
+
|
|
286
|
+
```typescript
|
|
287
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
288
|
+
import { runEvals } from '@mastra/core/evals'
|
|
289
|
+
|
|
290
|
+
const scorer = createTrajectoryAccuracyScorerCode({
|
|
291
|
+
expectedTrajectory: {
|
|
292
|
+
steps: [
|
|
293
|
+
{ stepType: 'tool_call', name: 'auth-tool' },
|
|
294
|
+
{ stepType: 'tool_call', name: 'fetch-tool' },
|
|
295
|
+
],
|
|
296
|
+
},
|
|
297
|
+
comparisonOptions: { strictOrder: true },
|
|
298
|
+
})
|
|
299
|
+
|
|
300
|
+
const result = await runEvals({
|
|
301
|
+
target: myAgent,
|
|
302
|
+
scorers: { trajectory: [scorer] },
|
|
303
|
+
data: [{ input: 'Get my data' }],
|
|
304
|
+
})
|
|
305
|
+
|
|
306
|
+
console.log(result.scores.trajectory['trajectory-accuracy']) // 1.0
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### Agent trajectory with relaxed ordering
|
|
310
|
+
|
|
311
|
+
Allows extra steps as long as expected steps appear in the correct relative order:
|
|
312
|
+
|
|
313
|
+
```typescript
|
|
314
|
+
const scorer = createTrajectoryAccuracyScorerCode({
|
|
315
|
+
expectedTrajectory: {
|
|
316
|
+
steps: [
|
|
317
|
+
{ stepType: 'tool_call', name: 'search-tool' },
|
|
318
|
+
{ stepType: 'tool_call', name: 'summarize-tool' },
|
|
319
|
+
],
|
|
320
|
+
},
|
|
321
|
+
comparisonOptions: { strictOrder: false },
|
|
322
|
+
})
|
|
323
|
+
|
|
324
|
+
// Agent called search-tool → log-tool → summarize-tool
|
|
325
|
+
// The extra log-tool is allowed in relaxed mode
|
|
326
|
+
// score: 0.75 — all expected steps matched, small penalty for extra step
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Workflow trajectory
|
|
330
|
+
|
|
331
|
+
Evaluates a workflow's execution path:
|
|
332
|
+
|
|
333
|
+
```typescript
|
|
334
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
335
|
+
import { runEvals } from '@mastra/core/evals'
|
|
336
|
+
|
|
337
|
+
const scorer = createTrajectoryAccuracyScorerCode({
|
|
338
|
+
expectedTrajectory: {
|
|
339
|
+
steps: [
|
|
340
|
+
{ stepType: 'workflow_step', name: 'validate-input' },
|
|
341
|
+
{ stepType: 'workflow_step', name: 'process-data' },
|
|
342
|
+
{ stepType: 'workflow_step', name: 'save-result' },
|
|
343
|
+
],
|
|
344
|
+
},
|
|
345
|
+
})
|
|
346
|
+
|
|
347
|
+
const result = await runEvals({
|
|
348
|
+
target: myWorkflow,
|
|
349
|
+
scorers: { trajectory: [scorer] },
|
|
350
|
+
data: [{ input: { data: 'test' } }],
|
|
351
|
+
})
|
|
352
|
+
|
|
353
|
+
console.log(result.scores.trajectory['trajectory-accuracy'])
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Comparing step data
|
|
357
|
+
|
|
358
|
+
Validates not just the step names but also step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
|
|
359
|
+
|
|
360
|
+
```typescript
|
|
361
|
+
const scorer = createTrajectoryAccuracyScorerCode({
|
|
362
|
+
expectedTrajectory: {
|
|
363
|
+
steps: [
|
|
364
|
+
{
|
|
365
|
+
stepType: 'tool_call',
|
|
366
|
+
name: 'search-tool',
|
|
367
|
+
toolArgs: { query: 'weather in NYC' },
|
|
368
|
+
},
|
|
369
|
+
],
|
|
370
|
+
},
|
|
371
|
+
comparisonOptions: { compareStepData: true },
|
|
372
|
+
})
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
## LLM-based trajectory accuracy scorer
|
|
376
|
+
|
|
377
|
+
The `createTrajectoryAccuracyScorerLLM()` function from `@mastra/evals/scorers/prebuilt` uses an LLM to evaluate whether an agent's or workflow's trajectory was appropriate, efficient, and complete.
|
|
378
|
+
|
|
379
|
+
### Parameters
|
|
380
|
+
|
|
381
|
+
**model** (`MastraModelConfig`): The LLM model to use for evaluating trajectory quality.
|
|
382
|
+
|
|
383
|
+
**expectedTrajectory** (`TrajectoryExpectation`): Optional static expected trajectory to compare against. When omitted, the LLM evaluates the trajectory based on the task requirements alone. Can also come from dataset items at runtime.
|
|
384
|
+
|
|
385
|
+
### Features
|
|
386
|
+
|
|
387
|
+
The LLM-based scorer provides:
|
|
388
|
+
|
|
389
|
+
- **Task-aware evaluation**: Assesses whether each step was necessary given the user's request
|
|
390
|
+
- **Ordering assessment**: Evaluates whether steps were taken in a logical order
|
|
391
|
+
- **Missing step detection**: Identifies steps that should have been taken
|
|
392
|
+
- **Redundancy detection**: Flags unnecessary or repeated steps
|
|
393
|
+
- **Reasoning generation**: Provides human-readable explanations for scoring decisions
|
|
394
|
+
|
|
395
|
+
### Evaluation process
|
|
396
|
+
|
|
397
|
+
1. **Receive trajectory**: Gets a pre-extracted `Trajectory` object from the pipeline
|
|
398
|
+
2. **Analyze steps**: Evaluates each step for necessity and ordering using the LLM
|
|
399
|
+
3. **Generate score**: Calculates score weighted as 60% necessity, 30% ordering, minus 10% missing penalty
|
|
400
|
+
4. **Generate reasoning**: Provides a human-readable explanation
|
|
401
|
+
|
|
402
|
+
## LLM-based scoring details
|
|
403
|
+
|
|
404
|
+
- **Fractional scores**: Returns values between 0.0 and 1.0
|
|
405
|
+
- **Context-aware**: Considers user intent and task requirements
|
|
406
|
+
- **Explanatory**: Provides reasoning for scores
|
|
407
|
+
- **Flexible**: Works with or without an expected trajectory
|
|
408
|
+
|
|
409
|
+
### LLM-based scorer options
|
|
410
|
+
|
|
411
|
+
```typescript
|
|
412
|
+
// Evaluate based on task requirements (no expected trajectory)
|
|
413
|
+
const openScorer = createTrajectoryAccuracyScorerLLM({
|
|
414
|
+
model: { provider: 'openai', name: 'gpt-5.4' },
|
|
415
|
+
})
|
|
416
|
+
|
|
417
|
+
// Evaluate against a static expected trajectory
|
|
418
|
+
const guidedScorer = createTrajectoryAccuracyScorerLLM({
|
|
419
|
+
model: { provider: 'openai', name: 'gpt-5.4' },
|
|
420
|
+
expectedTrajectory: {
|
|
421
|
+
steps: [
|
|
422
|
+
{ stepType: 'tool_call', name: 'search-tool' },
|
|
423
|
+
{ stepType: 'tool_call', name: 'summarize-tool' },
|
|
424
|
+
],
|
|
425
|
+
},
|
|
426
|
+
})
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
### LLM-based scorer results
|
|
430
|
+
|
|
431
|
+
```typescript
|
|
432
|
+
{
|
|
433
|
+
runId: string,
|
|
434
|
+
preprocessStepResult: {
|
|
435
|
+
actualTrajectory: Trajectory,
|
|
436
|
+
actualTrajectoryFormatted: string,
|
|
437
|
+
expectedTrajectoryFormatted?: string,
|
|
438
|
+
hasSteps: boolean
|
|
439
|
+
},
|
|
440
|
+
analyzeStepResult: {
|
|
441
|
+
stepEvaluations: Array<{
|
|
442
|
+
stepName: string,
|
|
443
|
+
wasNecessary: boolean,
|
|
444
|
+
wasInOrder: boolean,
|
|
445
|
+
reasoning: string
|
|
446
|
+
}>,
|
|
447
|
+
missingSteps?: string[],
|
|
448
|
+
extraSteps?: string[],
|
|
449
|
+
overallAssessment: string
|
|
450
|
+
},
|
|
451
|
+
score: number,
|
|
452
|
+
reason: string
|
|
453
|
+
}
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
## Unified trajectory scorer
|
|
457
|
+
|
|
458
|
+
The `createTrajectoryScorerCode()` function from `@mastra/evals/scorers/prebuilt` provides a multi-dimensional trajectory evaluation that checks accuracy, efficiency, blacklisted tools, and tool failure patterns in a single pass.
|
|
459
|
+
|
|
460
|
+
### Parameters
|
|
461
|
+
|
|
462
|
+
**defaults** (`TrajectoryExpectation`): Default expectations applied to all dataset items. Per-item expectedTrajectory values override these defaults.
|
|
463
|
+
|
|
464
|
+
**weights** (`object`): Weights for combining dimension scores into the final score.
|
|
465
|
+
|
|
466
|
+
### Scoring behavior
|
|
467
|
+
|
|
468
|
+
The unified scorer evaluates four dimensions:
|
|
469
|
+
|
|
470
|
+
1. **Accuracy** — Matches actual steps against expected steps (if `steps` is configured). Uses the `ordering` mode.
|
|
471
|
+
2. **Efficiency** — Checks step, token, and duration budgets (`maxSteps`, `maxTotalTokens`, `maxTotalDurationMs`) and redundant calls (`noRedundantCalls`).
|
|
472
|
+
3. **Blacklist** — Checks for forbidden tools or sequences. Any violation immediately results in a score of **0.0** regardless of other dimensions.
|
|
473
|
+
4. **Tool failures** — Detects retry patterns, fallback patterns, and argument correction patterns.
|
|
474
|
+
|
|
475
|
+
The final score is a weighted average of the accuracy, efficiency, and tool failure scores. Any blacklist violation overrides that result and sets the final score to 0.
|
|
476
|
+
|
|
477
|
+
### Unified scorer results
|
|
478
|
+
|
|
479
|
+
```typescript
|
|
480
|
+
{
|
|
481
|
+
runId: string,
|
|
482
|
+
preprocessStepResult: {
|
|
483
|
+
accuracy?: TrajectoryComparisonResult,
|
|
484
|
+
efficiency: TrajectoryEfficiencyResult,
|
|
485
|
+
blacklist: TrajectoryBlacklistResult,
|
|
486
|
+
toolFailures: ToolFailureAnalysisResult,
|
|
487
|
+
},
|
|
488
|
+
score: number
|
|
489
|
+
}
|
|
490
|
+
```
|
|
491
|
+
|
|
492
|
+
### Per-item expectations
|
|
493
|
+
|
|
494
|
+
Each dataset item can override the defaults with its own `expectedTrajectory`. This lets you vary expectations per prompt:
|
|
495
|
+
|
|
496
|
+
```typescript
|
|
497
|
+
import { createTrajectoryScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
498
|
+
import { runEvals } from '@mastra/core/evals'
|
|
499
|
+
|
|
500
|
+
// Default blacklist applies to all items
|
|
501
|
+
const scorer = createTrajectoryScorerCode({
|
|
502
|
+
defaults: {
|
|
503
|
+
blacklistedTools: ['deleteAll'],
|
|
504
|
+
maxSteps: 5,
|
|
505
|
+
},
|
|
506
|
+
})
|
|
507
|
+
|
|
508
|
+
const result = await runEvals({
|
|
509
|
+
target: myAgent,
|
|
510
|
+
scorers: { trajectory: [scorer] },
|
|
511
|
+
data: [
|
|
512
|
+
{
|
|
513
|
+
input: 'Search for weather',
|
|
514
|
+
expectedTrajectory: {
|
|
515
|
+
steps: [{ stepType: 'tool_call', name: 'search' }],
|
|
516
|
+
maxSteps: 2,
|
|
517
|
+
},
|
|
518
|
+
},
|
|
519
|
+
{
|
|
520
|
+
input: 'Search and summarize',
|
|
521
|
+
expectedTrajectory: {
|
|
522
|
+
steps: [
|
|
523
|
+
{ stepType: 'tool_call', name: 'search' },
|
|
524
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
525
|
+
],
|
|
526
|
+
},
|
|
527
|
+
},
|
|
528
|
+
],
|
|
529
|
+
})
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
### Example: efficiency and blacklist
|
|
533
|
+
|
|
534
|
+
```typescript
|
|
535
|
+
import { createTrajectoryScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
536
|
+
|
|
537
|
+
const scorer = createTrajectoryScorerCode({
|
|
538
|
+
defaults: {
|
|
539
|
+
blacklistedTools: ['escalate', 'admin-override'],
|
|
540
|
+
blacklistedSequences: [['escalate', 'admin-override']],
|
|
541
|
+
maxSteps: 10,
|
|
542
|
+
noRedundantCalls: true,
|
|
543
|
+
maxRetriesPerTool: 2,
|
|
544
|
+
},
|
|
545
|
+
})
|
|
546
|
+
```
|
|
547
|
+
|
|
548
|
+
## Using trajectory scorers with `runEvals`
|
|
549
|
+
|
|
550
|
+
Trajectory scorers are configured under the `trajectory` key in the scorer config. The `runEvals` pipeline handles trajectory extraction automatically.
|
|
551
|
+
|
|
552
|
+
### Agent trajectory evaluation
|
|
553
|
+
|
|
554
|
+
```typescript
|
|
555
|
+
import { runEvals } from '@mastra/core/evals'
|
|
556
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
557
|
+
|
|
558
|
+
const trajectoryScorer = createTrajectoryAccuracyScorerCode({
|
|
559
|
+
expectedTrajectory: {
|
|
560
|
+
steps: [
|
|
561
|
+
{ stepType: 'tool_call', name: 'search' },
|
|
562
|
+
{ stepType: 'tool_call', name: 'format' },
|
|
563
|
+
],
|
|
564
|
+
},
|
|
565
|
+
})
|
|
566
|
+
|
|
567
|
+
const result = await runEvals({
|
|
568
|
+
target: myAgent,
|
|
569
|
+
scorers: {
|
|
570
|
+
agent: [qualityScorer], // receives raw MastraDBMessage[] output
|
|
571
|
+
trajectory: [trajectoryScorer], // receives pre-extracted Trajectory
|
|
572
|
+
},
|
|
573
|
+
data: [{ input: 'Find and format the data' }],
|
|
574
|
+
})
|
|
575
|
+
|
|
576
|
+
// result.scores.agent['quality'] — agent-level score
|
|
577
|
+
// result.scores.trajectory['trajectory-accuracy'] — trajectory score
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
### Workflow trajectory evaluation
|
|
581
|
+
|
|
582
|
+
```typescript
|
|
583
|
+
import { runEvals } from '@mastra/core/evals'
|
|
584
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt'
|
|
585
|
+
|
|
586
|
+
const workflowTrajectoryScorer = createTrajectoryAccuracyScorerCode({
|
|
587
|
+
expectedTrajectory: {
|
|
588
|
+
steps: [
|
|
589
|
+
{ stepType: 'workflow_step', name: 'validate' },
|
|
590
|
+
{ stepType: 'workflow_step', name: 'process' },
|
|
591
|
+
{ stepType: 'workflow_step', name: 'notify' },
|
|
592
|
+
],
|
|
593
|
+
},
|
|
594
|
+
})
|
|
595
|
+
|
|
596
|
+
const result = await runEvals({
|
|
597
|
+
target: myWorkflow,
|
|
598
|
+
scorers: {
|
|
599
|
+
workflow: [outputScorer], // receives workflow output
|
|
600
|
+
trajectory: [workflowTrajectoryScorer], // receives pre-extracted Trajectory from step results
|
|
601
|
+
},
|
|
602
|
+
data: [{ input: { userId: '123' } }],
|
|
603
|
+
})
|
|
604
|
+
|
|
605
|
+
// result.scores.workflow['output-quality'] — workflow-level score
|
|
606
|
+
// result.scores.trajectory['trajectory-accuracy'] — trajectory score
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
## Related
|
|
610
|
+
|
|
611
|
+
- [runEvals reference](https://mastra.ai/reference/evals/run-evals) — Pipeline that extracts trajectories and passes them to scorers
|
|
612
|
+
- [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) — Base scorer interface
|
|
613
|
+
- [Scorer utils](https://mastra.ai/reference/evals/scorer-utils) — Utility functions including `extractTrajectory` and `compareTrajectories`
|
package/.docs/reference/index.md
CHANGED
|
@@ -106,6 +106,7 @@ The Reference section provides documentation of Mastra's API, including paramete
|
|
|
106
106
|
- [Tone Consistency Scorer](https://mastra.ai/reference/evals/tone-consistency)
|
|
107
107
|
- [Tool Call Accuracy Scorers](https://mastra.ai/reference/evals/tool-call-accuracy)
|
|
108
108
|
- [Toxicity](https://mastra.ai/reference/evals/toxicity)
|
|
109
|
+
- [Trajectory Accuracy Scorers](https://mastra.ai/reference/evals/trajectory-accuracy)
|
|
109
110
|
- [Harness Class](https://mastra.ai/reference/harness/harness-class)
|
|
110
111
|
- [Cloned Thread Utilities](https://mastra.ai/reference/memory/clone-utilities)
|
|
111
112
|
- [Memory Class](https://mastra.ai/reference/memory/memory-class)
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# @mastra/mcp-docs-server
|
|
2
2
|
|
|
3
|
+
## 1.1.17-alpha.8
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Updated dependencies [[`dc9fc19`](https://github.com/mastra-ai/mastra/commit/dc9fc19da4437f6b508cc355f346a8856746a76b), [`260fe12`](https://github.com/mastra-ai/mastra/commit/260fe1295fe7354e39d6def2775e0797a7a277f0)]:
|
|
8
|
+
- @mastra/core@1.18.0-alpha.1
|
|
9
|
+
|
|
10
|
+
## 1.1.17-alpha.6
|
|
11
|
+
|
|
12
|
+
### Patch Changes
|
|
13
|
+
|
|
14
|
+
- Updated dependencies [[`dc514a8`](https://github.com/mastra-ai/mastra/commit/dc514a83dba5f719172dddfd2c7b858e4943d067), [`404fea1`](https://github.com/mastra-ai/mastra/commit/404fea13042181f0b0c73a101392ac87c79ceae2), [`ebf5047`](https://github.com/mastra-ai/mastra/commit/ebf5047e825c38a1a356f10b214c1d4260dfcd8d), [`675f15b`](https://github.com/mastra-ai/mastra/commit/675f15b7eaeea649158d228ea635be40480c584d), [`b174c63`](https://github.com/mastra-ai/mastra/commit/b174c63a093108d4e53b9bc89a078d9f66202b3f), [`eef7cb2`](https://github.com/mastra-ai/mastra/commit/eef7cb2abe7ef15951e2fdf792a5095c6c643333), [`e8a5b0b`](https://github.com/mastra-ai/mastra/commit/e8a5b0b9bc94d12dee4150095512ca27a288d778)]:
|
|
15
|
+
- @mastra/core@1.18.0-alpha.0
|
|
16
|
+
|
|
3
17
|
## 1.1.17-alpha.4
|
|
4
18
|
|
|
5
19
|
### Patch Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/mcp-docs-server",
|
|
3
|
-
"version": "1.1.17-alpha.5",
|
|
3
|
+
"version": "1.1.17-alpha.9",
|
|
4
4
|
"description": "MCP server for accessing Mastra.ai documentation, changelogs, and news.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
"jsdom": "^26.1.0",
|
|
30
30
|
"local-pkg": "^1.1.2",
|
|
31
31
|
"zod": "^4.3.6",
|
|
32
|
-
"@mastra/core": "1.
|
|
32
|
+
"@mastra/core": "1.18.0-alpha.1",
|
|
33
33
|
"@mastra/mcp": "^1.3.1"
|
|
34
34
|
},
|
|
35
35
|
"devDependencies": {
|
|
@@ -48,7 +48,7 @@
|
|
|
48
48
|
"vitest": "4.0.18",
|
|
49
49
|
"@internal/lint": "0.0.74",
|
|
50
50
|
"@internal/types-builder": "0.0.49",
|
|
51
|
-
"@mastra/core": "1.
|
|
51
|
+
"@mastra/core": "1.18.0-alpha.1"
|
|
52
52
|
},
|
|
53
53
|
"homepage": "https://mastra.ai",
|
|
54
54
|
"repository": {
|