@mastra/mcp-docs-server 1.1.17-alpha.1 → 1.1.17-alpha.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.docs/docs/evals/built-in-scorers.md +1 -0
- package/.docs/docs/memory/observational-memory.md +56 -9
- package/.docs/docs/observability/tracing/bridges/otel.md +3 -3
- package/.docs/docs/observability/tracing/exporters/sentry.md +1 -1
- package/.docs/docs/server/auth/okta.md +225 -0
- package/.docs/docs/server/auth.md +1 -0
- package/.docs/docs/server/mastra-client.md +17 -0
- package/.docs/docs/workspace/lsp.md +116 -0
- package/.docs/docs/workspace/overview.md +15 -1
- package/.docs/guides/agent-frameworks/ai-sdk.md +3 -3
- package/.docs/models/gateways/openrouter.md +2 -1
- package/.docs/models/index.md +1 -1
- package/.docs/models/providers/groq.md +24 -16
- package/.docs/models/providers/llmgateway.md +269 -0
- package/.docs/models/providers/poe.md +3 -1
- package/.docs/models/providers/zai-coding-plan.md +3 -2
- package/.docs/models/providers/zai.md +14 -13
- package/.docs/models/providers/zhipuai-coding-plan.md +5 -2
- package/.docs/models/providers/zhipuai.md +13 -12
- package/.docs/models/providers.md +1 -0
- package/.docs/reference/ai-sdk/handle-chat-stream.md +2 -0
- package/.docs/reference/ai-sdk/with-mastra.md +2 -2
- package/.docs/reference/auth/okta.md +162 -0
- package/.docs/reference/client-js/agents.md +13 -8
- package/.docs/reference/client-js/mastra-client.md +1 -1
- package/.docs/reference/client-js/memory.md +1 -1
- package/.docs/reference/deployer/cloudflare.md +31 -1
- package/.docs/reference/evals/noise-sensitivity.md +3 -3
- package/.docs/reference/evals/run-evals.md +78 -3
- package/.docs/reference/evals/scorer-utils.md +188 -0
- package/.docs/reference/evals/trajectory-accuracy.md +627 -0
- package/.docs/reference/harness/harness-class.md +2 -0
- package/.docs/reference/index.md +3 -2
- package/.docs/reference/logging/pino-logger.md +58 -0
- package/.docs/reference/memory/observational-memory.md +34 -8
- package/.docs/reference/observability/tracing/interfaces.md +1 -1
- package/.docs/reference/processors/message-history-processor.md +1 -1
- package/.docs/reference/processors/processor-interface.md +3 -3
- package/.docs/reference/processors/semantic-recall-processor.md +1 -1
- package/.docs/reference/processors/skill-search-processor.md +93 -0
- package/.docs/reference/processors/tool-call-filter.md +2 -2
- package/.docs/reference/processors/working-memory-processor.md +1 -1
- package/.docs/reference/streaming/agents/stream.md +1 -1
- package/.docs/reference/tools/mcp-client.md +1 -1
- package/CHANGELOG.md +42 -0
- package/package.json +4 -4
- package/.docs/reference/core/getStoredAgentById.md +0 -87
- package/.docs/reference/core/listStoredAgents.md +0 -91
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# MastraAuthOkta & MastraRBACOkta class
|
|
2
|
+
|
|
3
|
+
## MastraAuthOkta class
|
|
4
|
+
|
|
5
|
+
The `MastraAuthOkta` class provides authentication for Mastra using Okta. It implements an OAuth 2.0 / OIDC login flow with encrypted session cookies and integrates with the Mastra server using the `auth` option.
|
|
6
|
+
|
|
7
|
+
### Usage example
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
import { Mastra } from '@mastra/core'
|
|
11
|
+
import { MastraAuthOkta } from '@mastra/auth-okta'
|
|
12
|
+
|
|
13
|
+
export const mastra = new Mastra({
|
|
14
|
+
server: {
|
|
15
|
+
auth: new MastraAuthOkta({
|
|
16
|
+
domain: process.env.OKTA_DOMAIN,
|
|
17
|
+
clientId: process.env.OKTA_CLIENT_ID,
|
|
18
|
+
clientSecret: process.env.OKTA_CLIENT_SECRET,
|
|
19
|
+
redirectUri: process.env.OKTA_REDIRECT_URI,
|
|
20
|
+
}),
|
|
21
|
+
},
|
|
22
|
+
})
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
> **Note:** You can omit the constructor parameters if you have the appropriately named environment variables set. In that case, use `new MastraAuthOkta()` without any arguments.
|
|
26
|
+
|
|
27
|
+
### Constructor parameters
|
|
28
|
+
|
|
29
|
+
**domain** (`string`): Your Okta domain (e.g., \`dev-123456.okta.com\`). Used to construct the issuer URL and API endpoints. (Default: `process.env.OKTA_DOMAIN`)
|
|
30
|
+
|
|
31
|
+
**clientId** (`string`): The OAuth client ID from your Okta application. (Default: `process.env.OKTA_CLIENT_ID`)
|
|
32
|
+
|
|
33
|
+
**clientSecret** (`string`): The OAuth client secret. Required for the SSO authorization code flow. (Default: `process.env.OKTA_CLIENT_SECRET`)
|
|
34
|
+
|
|
35
|
+
**issuer** (`string`): The token issuer URL. Override this if you use a custom authorization server. (Default: `` `https://{domain}/oauth2/default` ``)
|
|
36
|
+
|
|
37
|
+
**redirectUri** (`string`): The OAuth redirect URI for the SSO callback. Must match the redirect URI configured in your Okta application. (Default: `process.env.OKTA_REDIRECT_URI`)
|
|
38
|
+
|
|
39
|
+
**scopes** (`string[]`): OAuth scopes to request during the login flow. (Default: `['openid', 'profile', 'email', 'groups']`)
|
|
40
|
+
|
|
41
|
+
**apiToken** (`string`): Okta API token for user lookups via the Users API. Required for \`getUser()\` to return user data by ID. (Default: `process.env.OKTA_API_TOKEN`)
|
|
42
|
+
|
|
43
|
+
**session** (`OktaSessionOptions`): Session cookie configuration.
|
|
44
|
+
|
|
45
|
+
**session.cookieName** (`string`): Name of the session cookie.
|
|
46
|
+
|
|
47
|
+
**session.cookieMaxAge** (`number`): Cookie max age in seconds.
|
|
48
|
+
|
|
49
|
+
**session.cookiePassword** (`string`): Password for encrypting session cookies. Must be at least 32 characters. If not set, an auto-generated value is used that does not survive restarts.
|
|
50
|
+
|
|
51
|
+
**session.secureCookies** (`boolean`): Set the \`Secure\` flag on session cookies.
|
|
52
|
+
|
|
53
|
+
**name** (`string`): Custom name for the auth provider instance. (Default: `'okta'`)
|
|
54
|
+
|
|
55
|
+
### Environment variables
|
|
56
|
+
|
|
57
|
+
The following environment variables are automatically used when constructor options are not provided:
|
|
58
|
+
|
|
59
|
+
**OKTA\_DOMAIN** (`string`): Your Okta domain (e.g., \`dev-123456.okta.com\`). Found in your Okta admin console.
|
|
60
|
+
|
|
61
|
+
**OKTA\_CLIENT\_ID** (`string`): The OAuth client ID from your Okta application.
|
|
62
|
+
|
|
63
|
+
**OKTA\_CLIENT\_SECRET** (`string`): The OAuth client secret from your Okta application.
|
|
64
|
+
|
|
65
|
+
**OKTA\_ISSUER** (`string`): Token issuer URL. Defaults to \`https\://{domain}/oauth2/default\` if not set.
|
|
66
|
+
|
|
67
|
+
**OKTA\_REDIRECT\_URI** (`string`): OAuth redirect URI for the SSO callback.
|
|
68
|
+
|
|
69
|
+
**OKTA\_COOKIE\_PASSWORD** (`string`): Password for encrypting session cookies. Must be at least 32 characters.
|
|
70
|
+
|
|
71
|
+
**OKTA\_API\_TOKEN** (`string`): Okta API token for user lookups and RBAC group resolution.
|
|
72
|
+
|
|
73
|
+
### Authentication flow
|
|
74
|
+
|
|
75
|
+
`MastraAuthOkta` authenticates requests in the following order:
|
|
76
|
+
|
|
77
|
+
1. **Session cookie**: Reads the encrypted session cookie and decrypts it. If the session is valid and not expired, the user is authenticated.
|
|
78
|
+
2. **JWT fallback**: If no session cookie is present, verifies the `Authorization` header token against Okta's JWKS endpoint.
|
|
79
|
+
|
|
80
|
+
After authentication, `authorizeUser` checks that the user has a valid `oktaId`. Provide a custom `authorizeUser` function to implement additional logic.
|
|
81
|
+
|
|
82
|
+
### `OktaUser` type
|
|
83
|
+
|
|
84
|
+
The `OktaUser` type extends the base `EEUser` interface with Okta-specific fields:
|
|
85
|
+
|
|
86
|
+
**id** (`string`): User identifier (maps to the \`sub\` claim).
|
|
87
|
+
|
|
88
|
+
**oktaId** (`string`): Okta user ID (same as \`id\`).
|
|
89
|
+
|
|
90
|
+
**email** (`string`): User email address.
|
|
91
|
+
|
|
92
|
+
**name** (`string`): User display name, constructed from token claims.
|
|
93
|
+
|
|
94
|
+
**avatarUrl** (`string`): URL to the user's profile picture.
|
|
95
|
+
|
|
96
|
+
**groups** (`string[]`): Okta groups the user belongs to, populated from the \`groups\` claim.
|
|
97
|
+
|
|
98
|
+
## MastraRBACOkta class
|
|
99
|
+
|
|
100
|
+
The `MastraRBACOkta` class maps Okta groups to Mastra permissions. It fetches user groups from the Okta API and resolves them against a configurable role mapping. Use it with `MastraAuthOkta` or any other auth provider.
|
|
101
|
+
|
|
102
|
+
> **Note:** RBAC requires a valid Enterprise Edition license. It works without a license in development so you can try it locally, but you’ll need a license for production. [Contact sales](https://mastra.ai/contact) for more information.
|
|
103
|
+
|
|
104
|
+
### Usage example
|
|
105
|
+
|
|
106
|
+
Use `MastraRBACOkta` alongside an auth provider by passing it to the `rbac` option:
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
import { Mastra } from '@mastra/core'
|
|
110
|
+
import { MastraAuthOkta, MastraRBACOkta } from '@mastra/auth-okta'
|
|
111
|
+
|
|
112
|
+
export const mastra = new Mastra({
|
|
113
|
+
server: {
|
|
114
|
+
auth: new MastraAuthOkta(),
|
|
115
|
+
rbac: new MastraRBACOkta({
|
|
116
|
+
roleMapping: {
|
|
117
|
+
Admin: ['*'],
|
|
118
|
+
Engineering: ['agents:*', 'workflows:*', 'tools:*'],
|
|
119
|
+
Viewer: ['agents:read', 'workflows:read'],
|
|
120
|
+
_default: [],
|
|
121
|
+
},
|
|
122
|
+
}),
|
|
123
|
+
},
|
|
124
|
+
})
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
To use Okta RBAC with a different auth provider, pass a `getUserId` function to resolve the Okta user ID from the other provider's user object:
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import { MastraAuthAuth0 } from '@mastra/auth-auth0'
|
|
131
|
+
import { MastraRBACOkta } from '@mastra/auth-okta'
|
|
132
|
+
|
|
133
|
+
export const mastra = new Mastra({
|
|
134
|
+
server: {
|
|
135
|
+
auth: new MastraAuthAuth0(),
|
|
136
|
+
rbac: new MastraRBACOkta({
|
|
137
|
+
getUserId: user => user.metadata?.oktaUserId || user.email,
|
|
138
|
+
roleMapping: {
|
|
139
|
+
Engineering: ['agents:*', 'workflows:*'],
|
|
140
|
+
Admin: ['*'],
|
|
141
|
+
_default: [],
|
|
142
|
+
},
|
|
143
|
+
}),
|
|
144
|
+
},
|
|
145
|
+
})
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Constructor parameters
|
|
149
|
+
|
|
150
|
+
**roleMapping** (`RoleMapping`): Maps Okta group names to arrays of Mastra permission strings. Use \`'\_default'\` to assign permissions to users who do not match any group. Supports wildcards like \`'\*'\` (full access) and \`'agents:\*'\` (all agent actions).
|
|
151
|
+
|
|
152
|
+
**domain** (`string`): Your Okta domain. Used to initialize the Okta management SDK. (Default: `process.env.OKTA_DOMAIN`)
|
|
153
|
+
|
|
154
|
+
**apiToken** (`string`): Okta API token for the management SDK. Required to fetch user groups from the Okta API. (Default: `process.env.OKTA_API_TOKEN`)
|
|
155
|
+
|
|
156
|
+
**getUserId** (`(user: unknown) => string | undefined`): Extract the Okta user ID from a user object. Use this when combining Okta RBAC with a different auth provider. If not provided, falls back to \`oktaId\` or \`id\` on the user object.
|
|
157
|
+
|
|
158
|
+
**cache** (`PermissionCacheOptions`): Configure the LRU cache for group lookups.
|
|
159
|
+
|
|
160
|
+
**cache.maxSize** (`number`): Maximum number of users to cache.
|
|
161
|
+
|
|
162
|
+
**cache.ttlMs** (`number`): Time-to-live in milliseconds.
|
|
@@ -308,7 +308,7 @@ response.processDataStream({
|
|
|
308
308
|
|
|
309
309
|
## Stored agents
|
|
310
310
|
|
|
311
|
-
Stored agents are agent configurations stored in a database that can be created, updated, and deleted at runtime. They reference primitives (tools, workflows, other agents,
|
|
311
|
+
Stored agents are agent configurations stored in a database that can be created, updated, and deleted at runtime. They reference primitives (tools, workflows, other agents, scorers) by key, which are resolved from the Mastra registry when the agent is instantiated. Memory is configured inline as a `SerializedMemoryConfig` object with options such as `lastMessages` and `semanticRecall`.
|
|
312
312
|
|
|
313
313
|
### `listStoredAgents()`
|
|
314
314
|
|
|
@@ -344,7 +344,7 @@ const agent = await mastraClient.createStoredAgent({
|
|
|
344
344
|
instructions: 'You are a helpful assistant.',
|
|
345
345
|
model: {
|
|
346
346
|
provider: 'openai',
|
|
347
|
-
name: 'gpt-4',
|
|
347
|
+
name: 'gpt-5.4',
|
|
348
348
|
},
|
|
349
349
|
})
|
|
350
350
|
```
|
|
@@ -359,12 +359,17 @@ const agent = await mastraClient.createStoredAgent({
|
|
|
359
359
|
instructions: 'You are a helpful assistant.',
|
|
360
360
|
model: {
|
|
361
361
|
provider: 'openai',
|
|
362
|
-
name: 'gpt-4',
|
|
362
|
+
name: 'gpt-5.4',
|
|
363
|
+
},
|
|
364
|
+
tools: { calculator: {}, weather: {} },
|
|
365
|
+
workflows: { 'data-processing': {} },
|
|
366
|
+
agents: { 'subagent-1': {} },
|
|
367
|
+
memory: {
|
|
368
|
+
options: {
|
|
369
|
+
lastMessages: 20,
|
|
370
|
+
semanticRecall: false,
|
|
371
|
+
},
|
|
363
372
|
},
|
|
364
|
-
tools: ['calculator', 'weather'],
|
|
365
|
-
workflows: ['data-processing'],
|
|
366
|
-
agents: ['subagent-1'],
|
|
367
|
-
memory: 'my-memory',
|
|
368
373
|
scorers: {
|
|
369
374
|
'quality-scorer': {
|
|
370
375
|
sampling: { type: 'ratio', rate: 0.1 },
|
|
@@ -415,7 +420,7 @@ const updated = await storedAgent.update({
|
|
|
415
420
|
```typescript
|
|
416
421
|
// Update just the tools
|
|
417
422
|
await storedAgent.update({
|
|
418
|
-
tools:
|
|
423
|
+
tools: { 'new-tool-1': {}, 'new-tool-2': {} },
|
|
419
424
|
})
|
|
420
425
|
|
|
421
426
|
// Update metadata
|
|
@@ -32,7 +32,7 @@ export const mastraClient = new MastraClient({
|
|
|
32
32
|
|
|
33
33
|
**getAgent(agentId)** (`Agent`): Retrieves a specific agent instance by ID.
|
|
34
34
|
|
|
35
|
-
**
|
|
35
|
+
**listMemoryThreads(params)** (`Promise<StorageThreadType[]>`): Retrieves memory threads for the specified resource and agent. Takes a \`resourceId\` and an \`agentId\`; the \`agentId\` can be omitted if storage is configured.
|
|
36
36
|
|
|
37
37
|
**createMemoryThread(params)** (`Promise<MemoryThread>`): Creates a new memory thread with the given parameters.
|
|
38
38
|
|
|
@@ -7,7 +7,7 @@ The Memory API provides methods to manage conversation threads and message histo
|
|
|
7
7
|
Retrieve all memory threads for a specific resource:
|
|
8
8
|
|
|
9
9
|
```typescript
|
|
10
|
-
const threads = await mastraClient.
|
|
10
|
+
const threads = await mastraClient.listMemoryThreads({
|
|
11
11
|
resourceId: 'resource-1',
|
|
12
12
|
agentId: 'agent-1', // Optional - can be omitted if storage is configured
|
|
13
13
|
})
|
|
@@ -87,4 +87,34 @@ Use `vars` in the `CloudflareDeployer` constructor only for non-sensitive config
|
|
|
87
87
|
|
|
88
88
|
## Build output
|
|
89
89
|
|
|
90
|
-
After running `mastra build`, the deployer generates a `wrangler.jsonc` file conforming to Cloudflare's [wrangler configuration](https://developers.cloudflare.com/workers/wrangler/configuration/). It points to files inside `.mastra/output` so you need to run `mastra build` before deploying with Wrangler.
|
|
90
|
+
After running `mastra build`, the deployer generates a `wrangler.jsonc` file conforming to Cloudflare's [wrangler configuration](https://developers.cloudflare.com/workers/wrangler/configuration/). It points to files inside `.mastra/output` so you need to run `mastra build` before deploying with Wrangler.
|
|
91
|
+
|
|
92
|
+
## Cloudflare bindings
|
|
93
|
+
|
|
94
|
+
When you use the Cloudflare deployer, you can import runtime bindings from `cloudflare:workers` in your Mastra config file. Mastra automatically preserves protocol-based runtime imports like `cloudflare:workers` during `mastra build` without trying to install them as npm dependencies.
|
|
95
|
+
|
|
96
|
+
```typescript
|
|
97
|
+
import { env } from 'cloudflare:workers'
|
|
98
|
+
import { Mastra } from '@mastra/core'
|
|
99
|
+
import { registerApiRoute } from '@mastra/core/server'
|
|
100
|
+
import { CloudflareDeployer } from '@mastra/deployer-cloudflare'
|
|
101
|
+
|
|
102
|
+
export const mastra = new Mastra({
|
|
103
|
+
deployer: new CloudflareDeployer({
|
|
104
|
+
name: 'my-worker',
|
|
105
|
+
kv_namespaces: [{ binding: 'CACHE', id: 'your-kv-namespace-id' }],
|
|
106
|
+
}),
|
|
107
|
+
server: {
|
|
108
|
+
apiRoutes: [
|
|
109
|
+
registerApiRoute('/bindings', {
|
|
110
|
+
method: 'GET',
|
|
111
|
+
requiresAuth: false,
|
|
112
|
+
handler: async c => {
|
|
113
|
+
await env.CACHE.put('status', 'ok')
|
|
114
|
+
return c.json({ status: await env.CACHE.get('status') })
|
|
115
|
+
},
|
|
116
|
+
}),
|
|
117
|
+
],
|
|
118
|
+
},
|
|
119
|
+
})
|
|
120
|
+
```
|
|
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
|
546
546
|
|
|
547
547
|
async function compareModelRobustness() {
|
|
548
548
|
const models = [
|
|
549
|
-
{ name: 'GPT-5.
|
|
550
|
-
{ name: 'GPT-4
|
|
551
|
-
{ name: 'Claude', model: 'anthropic/claude-
|
|
549
|
+
{ name: 'GPT-5.4', model: 'openai/gpt-5.4' },
|
|
550
|
+
{ name: 'GPT-5.4-mini', model: 'openai/gpt-5.4-mini' },
|
|
551
|
+
{ name: 'Claude', model: 'anthropic/claude-opus-4-6' },
|
|
552
552
|
]
|
|
553
553
|
|
|
554
554
|
const testScenario = {
|
|
@@ -35,7 +35,7 @@ console.log(`Processed ${result.summary.totalItems} items`)
|
|
|
35
35
|
|
|
36
36
|
**data** (`RunEvalsDataItem[]`): Array of test cases with input data and optional ground truth.
|
|
37
37
|
|
|
38
|
-
**scorers** (`MastraScorer[] | WorkflowScorerConfig`):
|
|
38
|
+
**scorers** (`MastraScorer[] | AgentScorerConfig | WorkflowScorerConfig`): Scorers to use. A flat array applies all scorers to the raw output. For agents, an \`AgentScorerConfig\` object separates agent-level and trajectory scorers. For workflows, a \`WorkflowScorerConfig\` object specifies scorers for the workflow, individual steps, and trajectory.
|
|
39
39
|
|
|
40
40
|
**targetOptions** (`AgentExecutionOptions | WorkflowRunOptions`): Options forwarded to the target during execution. For agents: options passed to agent.generate() (e.g. maxSteps, modelSettings, instructions). For workflows: options passed to run.start() (e.g. perStep, outputOptions, initialState).
|
|
41
41
|
|
|
@@ -49,20 +49,32 @@ console.log(`Processed ${result.summary.totalItems} items`)
|
|
|
49
49
|
|
|
50
50
|
**groundTruth** (`any`): Expected or reference output for comparison during scoring.
|
|
51
51
|
|
|
52
|
+
**expectedTrajectory** (`TrajectoryExpectation`): Expected trajectory configuration for trajectory scoring. Includes expected steps, ordering, efficiency budgets, blacklists, and tool failure tolerance. Passed to trajectory scorers as \`run.expectedTrajectory\`. Overrides the static defaults in scorer constructors.
|
|
53
|
+
|
|
52
54
|
**requestContext** (`RequestContext`): Request Context to pass to the target during execution.
|
|
53
55
|
|
|
54
56
|
**tracingContext** (`TracingContext`): Tracing context for observability and debugging.
|
|
55
57
|
|
|
56
58
|
**startOptions** (`WorkflowRunOptions`): Per-item workflow run options (e.g. initialState, perStep, outputOptions). Merged on top of targetOptions, so per-item values take precedence. Only applicable when the target is a workflow.
|
|
57
59
|
|
|
60
|
+
## Agent scorer configuration
|
|
61
|
+
|
|
62
|
+
For agents, use `AgentScorerConfig` to separate agent-level scorers from trajectory scorers:
|
|
63
|
+
|
|
64
|
+
**agent** (`MastraScorer[]`): Scorers that receive the raw agent output (`MastraDBMessage[]`). Use for evaluating response quality, content, etc.
|
|
65
|
+
|
|
66
|
+
**trajectory** (`MastraScorer[]`): Scorers that receive a pre-extracted Trajectory object. When storage is configured, the pipeline extracts a hierarchical trajectory from observability traces (including nested tool calls and model generations). Otherwise, it falls back to extracting tool calls from agent messages.
|
|
67
|
+
|
|
58
68
|
## Workflow scorer configuration
|
|
59
69
|
|
|
60
|
-
For workflows,
|
|
70
|
+
For workflows, use `WorkflowScorerConfig` to specify scorers at different levels:
|
|
61
71
|
|
|
62
|
-
**workflow** (`MastraScorer[]`):
|
|
72
|
+
**workflow** (`MastraScorer[]`): Scorers to evaluate the entire workflow output.
|
|
63
73
|
|
|
64
74
|
**steps** (`Record<string, MastraScorer[]>`): Object mapping step IDs to arrays of scorers for evaluating individual step outputs.
|
|
65
75
|
|
|
76
|
+
**trajectory** (`MastraScorer[]`): Scorers that receive a pre-extracted Trajectory from the workflow execution. When storage is configured, the pipeline extracts a hierarchical trajectory from observability traces (including nested agent runs and tool calls within workflow steps). Otherwise, it falls back to extracting step results from the workflow output.
|
|
77
|
+
|
|
66
78
|
## Returns
|
|
67
79
|
|
|
68
80
|
**scores** (`Record<string, any>`): Average scores across all test cases, organized by scorer name.
|
|
@@ -105,6 +117,36 @@ const result = await runEvals({
|
|
|
105
117
|
})
|
|
106
118
|
```
|
|
107
119
|
|
|
120
|
+
### Agent trajectory evaluation
|
|
121
|
+
|
|
122
|
+
Use `AgentScorerConfig` to evaluate both the agent response and its tool-calling trajectory:
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
import { runEvals } from '@mastra/core/evals'
|
|
126
|
+
import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers/code/trajectory'
|
|
127
|
+
|
|
128
|
+
const trajectoryScorer = createTrajectoryAccuracyScorerCode()
|
|
129
|
+
|
|
130
|
+
const result = await runEvals({
|
|
131
|
+
target: chatAgent,
|
|
132
|
+
data: [
|
|
133
|
+
{
|
|
134
|
+
input: 'What is the weather in London?',
|
|
135
|
+
expectedTrajectory: {
|
|
136
|
+
steps: [{ stepType: 'tool_call', name: 'weatherTool' }],
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
],
|
|
140
|
+
scorers: {
|
|
141
|
+
// agent: [responseQualityScorer], // Optional: add agent-level scorers
|
|
142
|
+
trajectory: [trajectoryScorer],
|
|
143
|
+
},
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
// result.scores.agent — average agent-level scores
|
|
147
|
+
// result.scores.trajectory — average trajectory scores
|
|
148
|
+
```
|
|
149
|
+
|
|
108
150
|
### Agent with `targetOptions`
|
|
109
151
|
|
|
110
152
|
Pass execution options like `maxSteps` or `modelSettings` to customize agent behavior during evaluation:
|
|
@@ -149,6 +191,37 @@ const workflowResult = await runEvals({
|
|
|
149
191
|
})
|
|
150
192
|
```
|
|
151
193
|
|
|
194
|
+
### Workflow trajectory evaluation
|
|
195
|
+
|
|
196
|
+
Add trajectory scoring to workflow evaluations to validate step execution order:
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
const workflowResult = await runEvals({
|
|
200
|
+
target: myWorkflow,
|
|
201
|
+
data: [
|
|
202
|
+
{
|
|
203
|
+
input: { query: 'Process this data' },
|
|
204
|
+
expectedTrajectory: {
|
|
205
|
+
steps: [
|
|
206
|
+
{ stepType: 'workflow_step', name: 'validate' },
|
|
207
|
+
{ stepType: 'workflow_step', name: 'process' },
|
|
208
|
+
{ stepType: 'workflow_step', name: 'output' },
|
|
209
|
+
],
|
|
210
|
+
},
|
|
211
|
+
},
|
|
212
|
+
],
|
|
213
|
+
scorers: {
|
|
214
|
+
workflow: [outputQualityScorer],
|
|
215
|
+
steps: {
|
|
216
|
+
validate: [validationScorer],
|
|
217
|
+
},
|
|
218
|
+
trajectory: [trajectoryScorer],
|
|
219
|
+
},
|
|
220
|
+
})
|
|
221
|
+
|
|
222
|
+
// workflowResult.scores.trajectory — workflow trajectory scores
|
|
223
|
+
```
|
|
224
|
+
|
|
152
225
|
### Workflow with per-item `startOptions`
|
|
153
226
|
|
|
154
227
|
Use `startOptions` on individual data items to customize each workflow run. Per-item values take precedence over `targetOptions`:
|
|
@@ -175,5 +248,7 @@ const result = await runEvals({
|
|
|
175
248
|
|
|
176
249
|
- [createScorer()](https://mastra.ai/reference/evals/create-scorer) - Create custom scorers for experiments
|
|
177
250
|
- [MastraScorer](https://mastra.ai/reference/evals/mastra-scorer) - Learn about scorer structure and methods
|
|
251
|
+
- [Trajectory Accuracy](https://mastra.ai/reference/evals/trajectory-accuracy) - Built-in trajectory evaluation scorers
|
|
252
|
+
- [Scorer Utilities](https://mastra.ai/reference/evals/scorer-utils) - Helper functions for extracting trajectory data
|
|
178
253
|
- [Custom Scorers](https://mastra.ai/docs/evals/custom-scorers) - Guide to building evaluation logic
|
|
179
254
|
- [Scorers Overview](https://mastra.ai/docs/evals/overview) - Understanding scorer concepts
|
|
@@ -14,9 +14,21 @@ import {
|
|
|
14
14
|
extractToolCalls,
|
|
15
15
|
extractInputMessages,
|
|
16
16
|
extractAgentResponseMessages,
|
|
17
|
+
compareTrajectories,
|
|
18
|
+
createTrajectoryTestRun,
|
|
17
19
|
} from '@mastra/evals/scorers/utils'
|
|
18
20
|
```
|
|
19
21
|
|
|
22
|
+
Trajectory extraction functions are available from `@mastra/core/evals`:
|
|
23
|
+
|
|
24
|
+
```typescript
|
|
25
|
+
import {
|
|
26
|
+
extractTrajectory,
|
|
27
|
+
extractWorkflowTrajectory,
|
|
28
|
+
extractTrajectoryFromTrace,
|
|
29
|
+
} from '@mastra/core/evals'
|
|
30
|
+
```
|
|
31
|
+
|
|
20
32
|
## Message extraction
|
|
21
33
|
|
|
22
34
|
### `getAssistantMessageFromRunOutput`
|
|
@@ -266,6 +278,182 @@ const result = await myScorer.run({
|
|
|
266
278
|
})
|
|
267
279
|
```
|
|
268
280
|
|
|
281
|
+
## Trajectory utilities
|
|
282
|
+
|
|
283
|
+
### `extractTrajectory`
|
|
284
|
+
|
|
285
|
+
Extracts a `Trajectory` from agent output messages (`MastraDBMessage[]`). Converts tool invocations into `ToolCallStep` objects. The `runEvals` pipeline calls this automatically for trajectory scorers — you only need it for direct testing.
|
|
286
|
+
|
|
287
|
+
Available from `@mastra/core/evals`.
|
|
288
|
+
|
|
289
|
+
```typescript
|
|
290
|
+
import { extractTrajectory } from '@mastra/core/evals'
|
|
291
|
+
|
|
292
|
+
const trajectory = extractTrajectory(agentOutputMessages)
|
|
293
|
+
// trajectory.steps — ToolCallStep[] extracted from toolInvocations
|
|
294
|
+
// trajectory.rawOutput — the original MastraDBMessage[] array
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawOutput`.
|
|
298
|
+
|
|
299
|
+
### `extractWorkflowTrajectory`
|
|
300
|
+
|
|
301
|
+
Extracts a `Trajectory` from workflow step results. Converts `StepResult` records into `WorkflowStepStep` objects, respecting the execution path ordering.
|
|
302
|
+
|
|
303
|
+
Available from `@mastra/core/evals`.
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { extractWorkflowTrajectory } from '@mastra/core/evals'
|
|
307
|
+
|
|
308
|
+
const trajectory = extractWorkflowTrajectory(
|
|
309
|
+
workflowResult.steps, // Record<string, StepResult>
|
|
310
|
+
workflowResult.stepExecutionPath, // string[] (optional)
|
|
311
|
+
)
|
|
312
|
+
// trajectory.steps — WorkflowStepStep[] in execution order
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]`, `totalDurationMs`, and `rawWorkflowResult`.
|
|
316
|
+
|
|
317
|
+
### `extractTrajectoryFromTrace`
|
|
318
|
+
|
|
319
|
+
Builds a hierarchical `Trajectory` from observability trace spans (`SpanRecord[]`). Reconstructs the parent-child span tree and maps each span to the appropriate `TrajectoryStep` discriminated union type with nested `children`.
|
|
320
|
+
|
|
321
|
+
This is the preferred extraction method when storage is available. The `runEvals` pipeline calls this automatically when the target's `Mastra` instance has a configured storage backend. It produces richer trajectories than `extractTrajectory` or `extractWorkflowTrajectory` because it captures the full execution tree, including nested agent runs, tool calls, and model generations.
|
|
322
|
+
|
|
323
|
+
Available from `@mastra/core/evals`.
|
|
324
|
+
|
|
325
|
+
```typescript
|
|
326
|
+
import { extractTrajectoryFromTrace } from '@mastra/core/evals'
|
|
327
|
+
|
|
328
|
+
// After fetching a trace from the observability store
|
|
329
|
+
const traceData = await observabilityStore.getTrace({ traceId })
|
|
330
|
+
const trajectory = extractTrajectoryFromTrace(traceData.spans, rootSpanId)
|
|
331
|
+
// trajectory.steps — hierarchical TrajectoryStep[] with children
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
**Parameters:**
|
|
335
|
+
|
|
336
|
+
- `spans` (`SpanRecord[]`) — Array of span records from a trace query.
|
|
337
|
+
- `rootSpanId` (`string`, optional) — Span ID to use as the starting point. When omitted, uses spans with no parent.
|
|
338
|
+
|
|
339
|
+
**Returns:** `Trajectory` — Contains `steps: TrajectoryStep[]` with recursive `children` and `totalDurationMs`.
|
|
340
|
+
|
|
341
|
+
#### Span type mapping
|
|
342
|
+
|
|
343
|
+
| Span type | Trajectory step type | Key fields extracted |
|
|
344
|
+
| ---------------------- | ---------------------- | ------------------------------------------------------------- |
|
|
345
|
+
| `TOOL_CALL` | `tool_call` | `toolArgs`, `toolResult`, `success` |
|
|
346
|
+
| `MCP_TOOL_CALL` | `mcp_tool_call` | `toolArgs`, `toolResult`, `mcpServer`, `success` |
|
|
347
|
+
| `MODEL_GENERATION` | `model_generation` | `modelId`, `promptTokens`, `completionTokens`, `finishReason` |
|
|
348
|
+
| `AGENT_RUN` | `agent_run` | `agentId` (from entity ID) |
|
|
349
|
+
| `WORKFLOW_RUN` | `workflow_run` | `workflowId` (from entity ID) |
|
|
350
|
+
| `WORKFLOW_STEP` | `workflow_step` | `output` |
|
|
351
|
+
| `WORKFLOW_CONDITIONAL` | `workflow_conditional` | `conditionCount`, `selectedSteps` |
|
|
352
|
+
| `WORKFLOW_PARALLEL` | `workflow_parallel` | `branchCount`, `parallelSteps` |
|
|
353
|
+
| `WORKFLOW_LOOP` | `workflow_loop` | `loopType`, `totalIterations` |
|
|
354
|
+
| `WORKFLOW_SLEEP` | `workflow_sleep` | `sleepDurationMs`, `sleepType` |
|
|
355
|
+
| `WORKFLOW_WAIT_EVENT` | `workflow_wait_event` | `eventName`, `eventReceived` |
|
|
356
|
+
| `PROCESSOR_RUN` | `processor_run` | `processorId` |
|
|
357
|
+
|
|
358
|
+
Spans with types `GENERIC`, `MODEL_STEP`, `MODEL_CHUNK`, and `WORKFLOW_CONDITIONAL_EVAL` are skipped as noise.
|
|
359
|
+
|
|
360
|
+
### `compareTrajectories`
|
|
361
|
+
|
|
362
|
+
Compares an actual trajectory against an expected trajectory and returns a detailed comparison result. Used internally by `createTrajectoryAccuracyScorerCode`.
|
|
363
|
+
|
|
364
|
+
The `expected` parameter accepts either a full `Trajectory` (the same shape as the actual trajectory) or `{ steps: ExpectedStep[] }`. When using `ExpectedStep[]`, you can match by name only, by name and `stepType`, or include data fields for comparison. See [Expected steps](https://mastra.ai/reference/evals/trajectory-accuracy) for details.
|
|
365
|
+
|
|
366
|
+
```typescript
|
|
367
|
+
import { compareTrajectories } from '@mastra/evals/scorers/utils'
|
|
368
|
+
|
|
369
|
+
// Using ExpectedStep[] (recommended for expectations)
|
|
370
|
+
// Data fields (e.g. toolArgs) are auto-compared when present on expected steps
|
|
371
|
+
const result = compareTrajectories(
|
|
372
|
+
actualTrajectory,
|
|
373
|
+
{ steps: [{ name: 'search' }, { name: 'summarize', stepType: 'tool_call' }] },
|
|
374
|
+
{ allowRepeatedSteps: true },
|
|
375
|
+
)
|
|
376
|
+
// result.score — 0.0 to 1.0
|
|
377
|
+
// result.missingSteps — step names not found
|
|
378
|
+
// result.extraSteps — unexpected step names
|
|
379
|
+
// result.outOfOrderSteps — steps found but in wrong order
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
**Returns:** `TrajectoryComparisonResult`
|
|
383
|
+
|
|
384
|
+
### `createTrajectoryTestRun`
|
|
385
|
+
|
|
386
|
+
Creates a test run object for trajectory scorers. Wraps a `Trajectory` into the expected `ScorerRun` format.
|
|
387
|
+
|
|
388
|
+
```typescript
|
|
389
|
+
import { createTrajectoryTestRun } from '@mastra/evals/scorers/utils'
|
|
390
|
+
|
|
391
|
+
const run = createTrajectoryTestRun({
|
|
392
|
+
steps: [
|
|
393
|
+
{ stepType: 'tool_call', name: 'search', toolArgs: { q: 'test' } },
|
|
394
|
+
{ stepType: 'tool_call', name: 'summarize' },
|
|
395
|
+
],
|
|
396
|
+
})
|
|
397
|
+
|
|
398
|
+
const result = await trajectoryScorer.run(run)
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
### `checkTrajectoryEfficiency`
|
|
402
|
+
|
|
403
|
+
Evaluates trajectory efficiency against step, token, and duration budgets. Also detects redundant calls (same tool with same arguments).
|
|
404
|
+
|
|
405
|
+
```typescript
|
|
406
|
+
import { checkTrajectoryEfficiency } from '@mastra/evals/scorers/utils'
|
|
407
|
+
|
|
408
|
+
const result = checkTrajectoryEfficiency(trajectory, {
|
|
409
|
+
maxSteps: 5,
|
|
410
|
+
maxTotalTokens: 2000,
|
|
411
|
+
maxTotalDurationMs: 5000,
|
|
412
|
+
noRedundantCalls: true,
|
|
413
|
+
})
|
|
414
|
+
// result.score — 1.0 if within all budgets, lower with penalties
|
|
415
|
+
// result.redundantCalls — duplicate tool+args combos
|
|
416
|
+
// result.overStepBudget — true if maxSteps exceeded
|
|
417
|
+
// result.overTokenBudget — true if maxTotalTokens exceeded
|
|
418
|
+
// result.overDurationBudget — true if maxTotalDurationMs exceeded
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
**Returns:** `TrajectoryEfficiencyResult`
|
|
422
|
+
|
|
423
|
+
### `checkTrajectoryBlacklist`
|
|
424
|
+
|
|
425
|
+
Checks whether a trajectory contains forbidden tools or tool call sequences.
|
|
426
|
+
|
|
427
|
+
```typescript
|
|
428
|
+
import { checkTrajectoryBlacklist } from '@mastra/evals/scorers/utils'
|
|
429
|
+
|
|
430
|
+
const result = checkTrajectoryBlacklist(trajectory, {
|
|
431
|
+
blacklistedTools: ['deleteAll', 'admin-override'],
|
|
432
|
+
blacklistedSequences: [['escalate', 'admin-override']],
|
|
433
|
+
})
|
|
434
|
+
// result.score — 1.0 if no violations, 0.0 if any found
|
|
435
|
+
// result.violatedTools — blacklisted tools that were called
|
|
436
|
+
// result.violatedSequences — blacklisted sequences that were detected
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
**Returns:** `TrajectoryBlacklistResult`
|
|
440
|
+
|
|
441
|
+
### `analyzeToolFailures`
|
|
442
|
+
|
|
443
|
+
Detects tool failure patterns including retries, fallbacks, and argument corrections.
|
|
444
|
+
|
|
445
|
+
```typescript
|
|
446
|
+
import { analyzeToolFailures } from '@mastra/evals/scorers/utils'
|
|
447
|
+
|
|
448
|
+
const result = analyzeToolFailures(trajectory, {
|
|
449
|
+
maxRetriesPerTool: 2,
|
|
450
|
+
})
|
|
451
|
+
// result.score — 1.0 if no failure patterns, lower if patterns detected
|
|
452
|
+
// result.patterns — detected patterns (retry, fallback, arg_correction)
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
**Returns:** `ToolFailureAnalysisResult`
|
|
456
|
+
|
|
269
457
|
## Complete example
|
|
270
458
|
|
|
271
459
|
Here's a complete example showing how to use multiple utilities together:
|