@mastra/evals 1.1.2 → 1.2.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +78 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-AY4K3J4R.cjs +581 -0
- package/dist/chunk-AY4K3J4R.cjs.map +1 -0
- package/dist/chunk-X4MKZ735.js +555 -0
- package/dist/chunk-X4MKZ735.js.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +289 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +627 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +164 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +627 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +567 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +168 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,81 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 1.2.0-alpha.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- **Configurable weights**: Add `weights` option to `createTrajectoryScorerCode` for controlling how dimension scores are combined. Defaults to `{ accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }`. ([#14740](https://github.com/mastra-ai/mastra/pull/14740))
|
|
8
|
+
|
|
9
|
+
```ts
|
|
10
|
+
const scorer = createTrajectoryScorerCode({
|
|
11
|
+
defaults: { steps: [{ name: 'search' }], maxSteps: 5 },
|
|
12
|
+
weights: { accuracy: 0.6, efficiency: 0.2, toolFailures: 0.1, blacklist: 0.1 },
|
|
13
|
+
});
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
**ExpectedStep redesign**: `ExpectedStep` is now a discriminated union mirroring `TrajectoryStep`. When you specify a `stepType`, you get autocomplete for that variant's fields (e.g., `toolArgs` for `tool_call`, `modelId` for `model_generation`). The old `data: Record<string, unknown>` field is replaced by direct variant fields.
|
|
17
|
+
|
|
18
|
+
```ts
|
|
19
|
+
// Before: { name: 'search', stepType: 'tool_call', data: { input: { query: 'weather' } } }
|
|
20
|
+
// After:
|
|
21
|
+
{ name: 'search', stepType: 'tool_call', toolArgs: { query: 'weather' } }
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
**Remove `compareStepData`**: The `compareStepData` option is removed from `compareTrajectories`, `TrajectoryExpectation`, and all scorers. Data fields are now auto-compared when present on expected steps — if you specify `toolArgs` on an `ExpectedStep`, it will be compared against the actual step. If you omit it, only name and stepType are matched.
|
|
25
|
+
|
|
26
|
+
Also fixes documentation inaccuracies in `trajectory-accuracy.mdx` and `scorer-utils.mdx`.
|
|
27
|
+
|
|
28
|
+
- Updated dependencies [[`e333b77`](https://github.com/mastra-ai/mastra/commit/e333b77e2d76ba57ccec1818e08cebc1993469ff), [`60a224d`](https://github.com/mastra-ai/mastra/commit/60a224dd497240e83698cfa5bfd02e3d1d854844), [`949b7bf`](https://github.com/mastra-ai/mastra/commit/949b7bfd4e40f2b2cba7fef5eb3f108a02cfe938), [`d084b66`](https://github.com/mastra-ai/mastra/commit/d084b6692396057e83c086b954c1857d20b58a14), [`79c699a`](https://github.com/mastra-ai/mastra/commit/79c699acf3cd8a77e11c55530431f48eb48456e9), [`62757b6`](https://github.com/mastra-ai/mastra/commit/62757b6db6e8bb86569d23ad0b514178f57053f8), [`3d70b0b`](https://github.com/mastra-ai/mastra/commit/3d70b0b3524d817173ad870768f259c06d61bd23), [`3b45a13`](https://github.com/mastra-ai/mastra/commit/3b45a138d09d040779c0aba1edbbfc1b57442d23), [`8127d96`](https://github.com/mastra-ai/mastra/commit/8127d96280492e335d49b244501088dfdd59a8f1)]:
|
|
29
|
+
- @mastra/core@1.18.0-alpha.3
|
|
30
|
+
|
|
31
|
+
## 1.2.0-alpha.0
|
|
32
|
+
|
|
33
|
+
### Minor Changes
|
|
34
|
+
|
|
35
|
+
- **Trajectory scorers**: Added scorers for evaluating agent and workflow execution paths. ([#14697](https://github.com/mastra-ai/mastra/pull/14697))
|
|
36
|
+
- `createTrajectoryScorerCode` — unified scorer that evaluates accuracy, efficiency, blacklist violations, and tool failure patterns in a single pass. Supports per-item expectations from datasets with static defaults. Nested `ExpectedStep.children` configs allow recursive evaluation with different rules per hierarchy level.
|
|
37
|
+
- `createTrajectoryAccuracyScorerCode` — deterministic accuracy scorer with strict, relaxed, and unordered ordering modes.
|
|
38
|
+
- `createTrajectoryAccuracyScorerLLM` — LLM-based scorer for semantic trajectory evaluation.
|
|
39
|
+
|
|
40
|
+
**Utility functions:**
|
|
41
|
+
- `extractTrajectory` / `extractWorkflowTrajectory` — Convert agent runs and workflow executions into structured trajectories
|
|
42
|
+
- `extractTrajectoryFromTrace` — Build hierarchical trajectories from observability trace spans, including nested agent/tool calls
|
|
43
|
+
- `compareTrajectories` — Compare actual vs. expected trajectories with configurable ordering and data matching. Accepts `ExpectedStep[]` for simpler expected step definitions
|
|
44
|
+
- `checkTrajectoryEfficiency` — Evaluate step counts, token usage, and duration against budgets
|
|
45
|
+
- `checkTrajectoryBlacklist` — Detect forbidden tools or tool sequences
|
|
46
|
+
- `analyzeToolFailures` — Detect retry patterns, fallbacks, and argument corrections
|
|
47
|
+
|
|
48
|
+
**Example — unified scorer with defaults:**
|
|
49
|
+
|
|
50
|
+
```ts
|
|
51
|
+
import { createTrajectoryScorerCode } from '@mastra/evals/scorers';
|
|
52
|
+
|
|
53
|
+
const scorer = createTrajectoryScorerCode({
|
|
54
|
+
defaults: {
|
|
55
|
+
ordering: 'strict',
|
|
56
|
+
steps: [
|
|
57
|
+
{ name: 'validate-input' },
|
|
58
|
+
{
|
|
59
|
+
name: 'research-agent',
|
|
60
|
+
stepType: 'agent_run',
|
|
61
|
+
children: {
|
|
62
|
+
ordering: 'unordered',
|
|
63
|
+
steps: [{ name: 'search' }, { name: 'summarize' }],
|
|
64
|
+
},
|
|
65
|
+
},
|
|
66
|
+
{ name: 'save-result' },
|
|
67
|
+
],
|
|
68
|
+
maxSteps: 10,
|
|
69
|
+
blacklistedTools: ['deleteAll'],
|
|
70
|
+
},
|
|
71
|
+
});
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Patch Changes
|
|
75
|
+
|
|
76
|
+
- Updated dependencies [[`dc9fc19`](https://github.com/mastra-ai/mastra/commit/dc9fc19da4437f6b508cc355f346a8856746a76b), [`260fe12`](https://github.com/mastra-ai/mastra/commit/260fe1295fe7354e39d6def2775e0797a7a277f0)]:
|
|
77
|
+
- @mastra/core@1.18.0-alpha.1
|
|
78
|
+
|
|
3
79
|
## 1.1.2
|
|
4
80
|
|
|
5
81
|
### Patch Changes
|
|
@@ -2369,7 +2445,7 @@
|
|
|
2369
2445
|
- 876b8a2: Rename difference metric to textual difference metric
|
|
2370
2446
|
- 1bbec77: Reorganized evals into nlp and llm
|
|
2371
2447
|
- 35764f4: Added workflow for eval tests
|
|
2372
|
-
- 8769a62: Split core into
|
|
2448
|
+
- 8769a62: Split core into separate entry files
|
|
2373
2449
|
- aea3c13: Fix evals export for llm and nlp
|
|
2374
2450
|
- 4f1d1a1: Enforce types and clean up package.json
|
|
2375
2451
|
- 202d404: Added instructions when generating evals
|
|
@@ -2783,7 +2859,7 @@
|
|
|
2783
2859
|
### Patch Changes
|
|
2784
2860
|
|
|
2785
2861
|
- 9625602: Use mastra core split bundles in other packages
|
|
2786
|
-
- 8769a62: Split core into
|
|
2862
|
+
- 8769a62: Split core into separate entry files
|
|
2787
2863
|
- Updated dependencies [30322ce]
|
|
2788
2864
|
- Updated dependencies [78eec7c]
|
|
2789
2865
|
- Updated dependencies [9625602]
|
package/LICENSE.md
CHANGED
|
@@ -1,3 +1,18 @@
|
|
|
1
|
+
Portions of this software are licensed as follows:
|
|
2
|
+
|
|
3
|
+
- All content that resides under any directory named "ee/" within this
|
|
4
|
+
repository, including but not limited to:
|
|
5
|
+
- `packages/core/src/auth/ee/`
|
|
6
|
+
- `packages/server/src/server/auth/ee/`
|
|
7
|
+
is licensed under the license defined in `ee/LICENSE`.
|
|
8
|
+
|
|
9
|
+
- All third-party components incorporated into the Mastra Software are
|
|
10
|
+
licensed under the original license provided by the owner of the
|
|
11
|
+
applicable component.
|
|
12
|
+
|
|
13
|
+
- Content outside of the above-mentioned directories or restrictions is
|
|
14
|
+
available under the "Apache License 2.0" as defined below.
|
|
15
|
+
|
|
1
16
|
# Apache License 2.0
|
|
2
17
|
|
|
3
18
|
Copyright (c) 2025 Kepler Software, Inc.
|
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var requestContext = require('@mastra/core/request-context');
|
|
4
|
+
var evals = require('@mastra/core/evals');
|
|
5
|
+
|
|
6
|
+
// src/scorers/utils.ts
|
|
7
|
+
function getTextContentFromMastraDBMessage(message) {
|
|
8
|
+
if (typeof message.content.content === "string" && message.content.content !== "") {
|
|
9
|
+
return message.content.content;
|
|
10
|
+
}
|
|
11
|
+
if (message.content.parts && Array.isArray(message.content.parts)) {
|
|
12
|
+
const textParts = message.content.parts.filter((p) => p.type === "text");
|
|
13
|
+
return textParts.length > 0 ? textParts[textParts.length - 1]?.text || "" : "";
|
|
14
|
+
}
|
|
15
|
+
return "";
|
|
16
|
+
}
|
|
17
|
+
var roundToTwoDecimals = (num) => {
|
|
18
|
+
return Math.round((num + Number.EPSILON) * 100) / 100;
|
|
19
|
+
};
|
|
20
|
+
function isCloserTo(value, target1, target2) {
|
|
21
|
+
return Math.abs(value - target1) < Math.abs(value - target2);
|
|
22
|
+
}
|
|
23
|
+
var createTestRun = (input, output, additionalContext, requestContext) => {
|
|
24
|
+
return {
|
|
25
|
+
input: [{ role: "user", content: input }],
|
|
26
|
+
output: { role: "assistant", text: output },
|
|
27
|
+
additionalContext: additionalContext ?? {},
|
|
28
|
+
requestContext: requestContext ?? {}
|
|
29
|
+
};
|
|
30
|
+
};
|
|
31
|
+
var getUserMessageFromRunInput = (input) => {
|
|
32
|
+
const message = input?.inputMessages.find(({ role }) => role === "user");
|
|
33
|
+
return message ? getTextContentFromMastraDBMessage(message) : void 0;
|
|
34
|
+
};
|
|
35
|
+
var getSystemMessagesFromRunInput = (input) => {
|
|
36
|
+
const systemMessages = [];
|
|
37
|
+
if (input?.systemMessages) {
|
|
38
|
+
systemMessages.push(
|
|
39
|
+
...input.systemMessages.map((msg) => {
|
|
40
|
+
if (typeof msg.content === "string") {
|
|
41
|
+
return msg.content;
|
|
42
|
+
} else if (Array.isArray(msg.content)) {
|
|
43
|
+
return msg.content.filter((part) => part.type === "text").map((part) => part.text || "").join(" ");
|
|
44
|
+
}
|
|
45
|
+
return "";
|
|
46
|
+
}).filter((content) => content)
|
|
47
|
+
);
|
|
48
|
+
}
|
|
49
|
+
if (input?.taggedSystemMessages) {
|
|
50
|
+
Object.values(input.taggedSystemMessages).forEach((messages) => {
|
|
51
|
+
messages.forEach((msg) => {
|
|
52
|
+
if (typeof msg.content === "string") {
|
|
53
|
+
systemMessages.push(msg.content);
|
|
54
|
+
}
|
|
55
|
+
});
|
|
56
|
+
});
|
|
57
|
+
}
|
|
58
|
+
return systemMessages;
|
|
59
|
+
};
|
|
60
|
+
var getCombinedSystemPrompt = (input) => {
|
|
61
|
+
const systemMessages = getSystemMessagesFromRunInput(input);
|
|
62
|
+
return systemMessages.join("\n\n");
|
|
63
|
+
};
|
|
64
|
+
var getAssistantMessageFromRunOutput = (output) => {
|
|
65
|
+
const message = output?.find(({ role }) => role === "assistant");
|
|
66
|
+
return message ? getTextContentFromMastraDBMessage(message) : void 0;
|
|
67
|
+
};
|
|
68
|
+
var getReasoningFromRunOutput = (output) => {
|
|
69
|
+
if (!output) return void 0;
|
|
70
|
+
const message = output.find(({ role }) => role === "assistant");
|
|
71
|
+
if (!message) return void 0;
|
|
72
|
+
if (message.content.reasoning) {
|
|
73
|
+
return message.content.reasoning;
|
|
74
|
+
}
|
|
75
|
+
const reasoningParts = message.content.parts?.filter((p) => p.type === "reasoning");
|
|
76
|
+
if (reasoningParts && reasoningParts.length > 0) {
|
|
77
|
+
const reasoningTexts = reasoningParts.map((p) => {
|
|
78
|
+
if (p.details && Array.isArray(p.details)) {
|
|
79
|
+
return p.details.filter((d) => d.type === "text").map((d) => d.text).join("");
|
|
80
|
+
}
|
|
81
|
+
return p.reasoning || "";
|
|
82
|
+
}).filter(Boolean);
|
|
83
|
+
return reasoningTexts.length > 0 ? reasoningTexts.join("\n") : void 0;
|
|
84
|
+
}
|
|
85
|
+
return void 0;
|
|
86
|
+
};
|
|
87
|
+
var createToolInvocation = ({
|
|
88
|
+
toolCallId,
|
|
89
|
+
toolName,
|
|
90
|
+
args,
|
|
91
|
+
result,
|
|
92
|
+
state = "result"
|
|
93
|
+
}) => {
|
|
94
|
+
return {
|
|
95
|
+
toolCallId,
|
|
96
|
+
toolName,
|
|
97
|
+
args,
|
|
98
|
+
result,
|
|
99
|
+
state
|
|
100
|
+
};
|
|
101
|
+
};
|
|
102
|
+
function createTestMessage({
|
|
103
|
+
content,
|
|
104
|
+
role,
|
|
105
|
+
id = "test-message",
|
|
106
|
+
toolInvocations = []
|
|
107
|
+
}) {
|
|
108
|
+
return {
|
|
109
|
+
id,
|
|
110
|
+
role,
|
|
111
|
+
content: {
|
|
112
|
+
format: 2,
|
|
113
|
+
parts: [{ type: "text", text: content }],
|
|
114
|
+
content,
|
|
115
|
+
...toolInvocations.length > 0 && {
|
|
116
|
+
toolInvocations: toolInvocations.map((ti) => ({
|
|
117
|
+
toolCallId: ti.toolCallId,
|
|
118
|
+
toolName: ti.toolName,
|
|
119
|
+
args: ti.args,
|
|
120
|
+
result: ti.result,
|
|
121
|
+
state: ti.state
|
|
122
|
+
}))
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
createdAt: /* @__PURE__ */ new Date()
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
var createAgentTestRun = ({
|
|
129
|
+
inputMessages = [],
|
|
130
|
+
output,
|
|
131
|
+
rememberedMessages = [],
|
|
132
|
+
systemMessages = [],
|
|
133
|
+
taggedSystemMessages = {},
|
|
134
|
+
requestContext: requestContext$1 = new requestContext.RequestContext(),
|
|
135
|
+
runId = crypto.randomUUID()
|
|
136
|
+
}) => {
|
|
137
|
+
return {
|
|
138
|
+
input: {
|
|
139
|
+
inputMessages,
|
|
140
|
+
rememberedMessages,
|
|
141
|
+
systemMessages,
|
|
142
|
+
taggedSystemMessages
|
|
143
|
+
},
|
|
144
|
+
output,
|
|
145
|
+
requestContext: requestContext$1,
|
|
146
|
+
runId
|
|
147
|
+
};
|
|
148
|
+
};
|
|
149
|
+
var createTrajectoryTestRun = ({
|
|
150
|
+
inputMessages = [],
|
|
151
|
+
trajectory,
|
|
152
|
+
rememberedMessages = [],
|
|
153
|
+
systemMessages = [],
|
|
154
|
+
taggedSystemMessages = {},
|
|
155
|
+
requestContext: requestContext$1 = new requestContext.RequestContext(),
|
|
156
|
+
runId = crypto.randomUUID(),
|
|
157
|
+
expectedTrajectory
|
|
158
|
+
}) => {
|
|
159
|
+
return {
|
|
160
|
+
input: {
|
|
161
|
+
inputMessages,
|
|
162
|
+
rememberedMessages,
|
|
163
|
+
systemMessages,
|
|
164
|
+
taggedSystemMessages
|
|
165
|
+
},
|
|
166
|
+
output: trajectory,
|
|
167
|
+
expectedTrajectory,
|
|
168
|
+
requestContext: requestContext$1,
|
|
169
|
+
runId
|
|
170
|
+
};
|
|
171
|
+
};
|
|
172
|
+
function extractToolCalls(output) {
|
|
173
|
+
const toolCalls = [];
|
|
174
|
+
const toolCallInfos = [];
|
|
175
|
+
for (let messageIndex = 0; messageIndex < output.length; messageIndex++) {
|
|
176
|
+
const message = output[messageIndex];
|
|
177
|
+
if (message?.content?.toolInvocations) {
|
|
178
|
+
for (let invocationIndex = 0; invocationIndex < message.content.toolInvocations.length; invocationIndex++) {
|
|
179
|
+
const invocation = message.content.toolInvocations[invocationIndex];
|
|
180
|
+
if (invocation && invocation.toolName && (invocation.state === "result" || invocation.state === "call")) {
|
|
181
|
+
toolCalls.push(invocation.toolName);
|
|
182
|
+
toolCallInfos.push({
|
|
183
|
+
toolName: invocation.toolName,
|
|
184
|
+
toolCallId: invocation.toolCallId || `${messageIndex}-${invocationIndex}`,
|
|
185
|
+
messageIndex,
|
|
186
|
+
invocationIndex
|
|
187
|
+
});
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
return { tools: toolCalls, toolCallInfos };
|
|
193
|
+
}
|
|
194
|
+
var extractInputMessages = (runInput) => {
|
|
195
|
+
return runInput?.inputMessages?.map((msg) => getTextContentFromMastraDBMessage(msg)) || [];
|
|
196
|
+
};
|
|
197
|
+
var extractAgentResponseMessages = (runOutput) => {
|
|
198
|
+
return runOutput.filter((msg) => msg.role === "assistant").map((msg) => getTextContentFromMastraDBMessage(msg));
|
|
199
|
+
};
|
|
200
|
+
function extractToolResults(output) {
|
|
201
|
+
const results = [];
|
|
202
|
+
for (const message of output) {
|
|
203
|
+
const toolInvocations = message?.content?.toolInvocations;
|
|
204
|
+
if (!toolInvocations) continue;
|
|
205
|
+
for (const invocation of toolInvocations) {
|
|
206
|
+
if (invocation.state === "result" && invocation.result !== void 0) {
|
|
207
|
+
results.push({
|
|
208
|
+
toolName: invocation.toolName,
|
|
209
|
+
toolCallId: invocation.toolCallId || "",
|
|
210
|
+
args: invocation.args || {},
|
|
211
|
+
result: invocation.result
|
|
212
|
+
});
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
return results;
|
|
217
|
+
}
|
|
218
|
+
function compareTrajectories(actual, expected, options = {}) {
|
|
219
|
+
const { allowRepeatedSteps = true, ordering = "relaxed" } = options;
|
|
220
|
+
const normalizedExpected = {
|
|
221
|
+
steps: expected.steps
|
|
222
|
+
};
|
|
223
|
+
if (normalizedExpected.steps.length === 0) {
|
|
224
|
+
return {
|
|
225
|
+
score: actual.steps.length === 0 ? 1 : 0,
|
|
226
|
+
matchedSteps: 0,
|
|
227
|
+
totalExpectedSteps: 0,
|
|
228
|
+
totalActualSteps: actual.steps.length,
|
|
229
|
+
missingSteps: [],
|
|
230
|
+
extraSteps: actual.steps.map((s) => s.name),
|
|
231
|
+
outOfOrderSteps: [],
|
|
232
|
+
repeatedSteps: []
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
const actualNames = actual.steps.map((s) => s.name);
|
|
236
|
+
const nameCounts = /* @__PURE__ */ new Map();
|
|
237
|
+
for (const name of actualNames) {
|
|
238
|
+
nameCounts.set(name, (nameCounts.get(name) || 0) + 1);
|
|
239
|
+
}
|
|
240
|
+
const repeatedSteps = [...nameCounts.entries()].filter(([_, count]) => count > 1).map(([name]) => name);
|
|
241
|
+
if (ordering === "strict") {
|
|
242
|
+
return compareStrictOrder(actual, normalizedExpected, { allowRepeatedSteps, repeatedSteps });
|
|
243
|
+
}
|
|
244
|
+
if (ordering === "unordered") {
|
|
245
|
+
return compareUnorderedPresence(actual, normalizedExpected, { allowRepeatedSteps, repeatedSteps });
|
|
246
|
+
}
|
|
247
|
+
return compareRelaxedOrder(actual, normalizedExpected, { allowRepeatedSteps, repeatedSteps });
|
|
248
|
+
}
|
|
249
|
+
function compareStrictOrder(actual, expected, opts) {
|
|
250
|
+
const actualNames = actual.steps.map((s) => s.name);
|
|
251
|
+
const expectedNames = expected.steps.map((s) => s.name);
|
|
252
|
+
let matchedSteps = 0;
|
|
253
|
+
const outOfOrderSteps = [];
|
|
254
|
+
const matchedExpectedIndices = /* @__PURE__ */ new Set();
|
|
255
|
+
const maxLen = Math.max(actualNames.length, expectedNames.length);
|
|
256
|
+
for (let i = 0; i < maxLen; i++) {
|
|
257
|
+
const actualName = actualNames[i];
|
|
258
|
+
const expectedName = expectedNames[i];
|
|
259
|
+
if (actualName === expectedName) {
|
|
260
|
+
if (actual.steps[i] && expected.steps[i]) {
|
|
261
|
+
if (expectedStepMatches(actual.steps[i], expected.steps[i])) {
|
|
262
|
+
matchedSteps++;
|
|
263
|
+
matchedExpectedIndices.add(i);
|
|
264
|
+
}
|
|
265
|
+
} else {
|
|
266
|
+
matchedSteps++;
|
|
267
|
+
matchedExpectedIndices.add(i);
|
|
268
|
+
}
|
|
269
|
+
} else if (actualName && expectedNames.includes(actualName)) {
|
|
270
|
+
outOfOrderSteps.push(actualName);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
|
|
274
|
+
const extraSteps = actualNames.filter((name) => !expectedNames.includes(name));
|
|
275
|
+
let score = matchedSteps / expected.steps.length;
|
|
276
|
+
if (actualNames.length > expectedNames.length) {
|
|
277
|
+
const extraPenalty = (actualNames.length - expectedNames.length) / expectedNames.length;
|
|
278
|
+
score = Math.max(0, score - extraPenalty * 0.5);
|
|
279
|
+
}
|
|
280
|
+
if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
|
|
281
|
+
score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
|
|
282
|
+
}
|
|
283
|
+
return {
|
|
284
|
+
score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
|
|
285
|
+
matchedSteps,
|
|
286
|
+
totalExpectedSteps: expected.steps.length,
|
|
287
|
+
totalActualSteps: actual.steps.length,
|
|
288
|
+
missingSteps,
|
|
289
|
+
extraSteps,
|
|
290
|
+
outOfOrderSteps,
|
|
291
|
+
repeatedSteps: opts.repeatedSteps
|
|
292
|
+
};
|
|
293
|
+
}
|
|
294
|
+
function compareRelaxedOrder(actual, expected, opts) {
|
|
295
|
+
const actualNames = actual.steps.map((s) => s.name);
|
|
296
|
+
const expectedNames = expected.steps.map((s) => s.name);
|
|
297
|
+
let matchedSteps = 0;
|
|
298
|
+
let lastMatchedIndex = -1;
|
|
299
|
+
const outOfOrderSteps = [];
|
|
300
|
+
const matchedExpectedIndices = /* @__PURE__ */ new Set();
|
|
301
|
+
for (let i = 0; i < expectedNames.length; i++) {
|
|
302
|
+
const expectedName = expectedNames[i];
|
|
303
|
+
let found = false;
|
|
304
|
+
for (let j = lastMatchedIndex + 1; j < actualNames.length; j++) {
|
|
305
|
+
if (actualNames[j] === expectedName) {
|
|
306
|
+
if (actual.steps[j] && expected.steps[i]) {
|
|
307
|
+
if (expectedStepMatches(actual.steps[j], expected.steps[i])) {
|
|
308
|
+
matchedSteps++;
|
|
309
|
+
lastMatchedIndex = j;
|
|
310
|
+
matchedExpectedIndices.add(i);
|
|
311
|
+
found = true;
|
|
312
|
+
break;
|
|
313
|
+
}
|
|
314
|
+
} else {
|
|
315
|
+
matchedSteps++;
|
|
316
|
+
lastMatchedIndex = j;
|
|
317
|
+
matchedExpectedIndices.add(i);
|
|
318
|
+
found = true;
|
|
319
|
+
break;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
if (!found) {
|
|
324
|
+
if (actualNames.includes(expectedName)) {
|
|
325
|
+
outOfOrderSteps.push(expectedName);
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
|
|
330
|
+
const expectedSet = new Set(expectedNames);
|
|
331
|
+
const extraSteps = actualNames.filter((name) => !expectedSet.has(name));
|
|
332
|
+
let score = matchedSteps / expected.steps.length;
|
|
333
|
+
if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
|
|
334
|
+
score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
|
|
335
|
+
}
|
|
336
|
+
return {
|
|
337
|
+
score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
|
|
338
|
+
matchedSteps,
|
|
339
|
+
totalExpectedSteps: expected.steps.length,
|
|
340
|
+
totalActualSteps: actual.steps.length,
|
|
341
|
+
missingSteps,
|
|
342
|
+
extraSteps,
|
|
343
|
+
outOfOrderSteps,
|
|
344
|
+
repeatedSteps: opts.repeatedSteps
|
|
345
|
+
};
|
|
346
|
+
}
|
|
347
|
+
var COMPARABLE_FIELDS_BY_TYPE = {
|
|
348
|
+
tool_call: ["toolArgs", "toolResult", "success"],
|
|
349
|
+
mcp_tool_call: ["toolArgs", "toolResult", "mcpServer", "success"],
|
|
350
|
+
model_generation: ["modelId", "promptTokens", "completionTokens", "finishReason"],
|
|
351
|
+
agent_run: ["agentId"],
|
|
352
|
+
workflow_step: ["stepId", "status", "output"],
|
|
353
|
+
workflow_run: ["workflowId", "status"],
|
|
354
|
+
workflow_conditional: ["conditionCount", "selectedSteps"],
|
|
355
|
+
workflow_parallel: ["branchCount", "parallelSteps"],
|
|
356
|
+
workflow_loop: ["loopType", "totalIterations"],
|
|
357
|
+
workflow_sleep: ["sleepDurationMs", "sleepType"],
|
|
358
|
+
workflow_wait_event: ["eventName", "eventReceived"],
|
|
359
|
+
processor_run: ["processorId"]
|
|
360
|
+
};
|
|
361
|
+
function expectedStepMatches(actual, expected) {
|
|
362
|
+
if (actual.name !== expected.name) return false;
|
|
363
|
+
if (expected.stepType && actual.stepType !== expected.stepType) return false;
|
|
364
|
+
if (expected.stepType) {
|
|
365
|
+
const fields = COMPARABLE_FIELDS_BY_TYPE[expected.stepType] ?? [];
|
|
366
|
+
for (const field of fields) {
|
|
367
|
+
const expectedVal = expected[field];
|
|
368
|
+
if (expectedVal === void 0) continue;
|
|
369
|
+
const actualVal = actual[field];
|
|
370
|
+
if (actualVal === void 0) return false;
|
|
371
|
+
try {
|
|
372
|
+
if (JSON.stringify(actualVal) !== JSON.stringify(expectedVal)) return false;
|
|
373
|
+
} catch {
|
|
374
|
+
return false;
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
return true;
|
|
379
|
+
}
|
|
380
|
+
function compareUnorderedPresence(actual, expected, opts) {
|
|
381
|
+
const actualNames = actual.steps.map((s) => s.name);
|
|
382
|
+
const expectedNames = expected.steps.map((s) => s.name);
|
|
383
|
+
let matchedSteps = 0;
|
|
384
|
+
const matchedExpectedIndices = /* @__PURE__ */ new Set();
|
|
385
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
386
|
+
for (let i = 0; i < expected.steps.length; i++) {
|
|
387
|
+
const expectedStep = expected.steps[i];
|
|
388
|
+
for (let j = 0; j < actual.steps.length; j++) {
|
|
389
|
+
if (!usedIndices.has(j) && expectedStepMatches(actual.steps[j], expectedStep)) {
|
|
390
|
+
matchedSteps++;
|
|
391
|
+
matchedExpectedIndices.add(i);
|
|
392
|
+
usedIndices.add(j);
|
|
393
|
+
break;
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
|
|
398
|
+
const expectedSet = new Set(expectedNames);
|
|
399
|
+
const extraSteps = actualNames.filter((name) => !expectedSet.has(name));
|
|
400
|
+
let score = matchedSteps / expected.steps.length;
|
|
401
|
+
if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
|
|
402
|
+
score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
|
|
403
|
+
}
|
|
404
|
+
return {
|
|
405
|
+
score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
|
|
406
|
+
matchedSteps,
|
|
407
|
+
totalExpectedSteps: expected.steps.length,
|
|
408
|
+
totalActualSteps: actual.steps.length,
|
|
409
|
+
missingSteps,
|
|
410
|
+
extraSteps,
|
|
411
|
+
outOfOrderSteps: [],
|
|
412
|
+
// ordering not checked in unordered mode
|
|
413
|
+
repeatedSteps: opts.repeatedSteps
|
|
414
|
+
};
|
|
415
|
+
}
|
|
416
|
+
function checkTrajectoryEfficiency(trajectory, options = {}) {
|
|
417
|
+
const { maxSteps, maxTotalTokens, maxTotalDurationMs, noRedundantCalls = true } = options;
|
|
418
|
+
const totalSteps = trajectory.steps.length;
|
|
419
|
+
let totalTokens = 0;
|
|
420
|
+
for (const step of trajectory.steps) {
|
|
421
|
+
if (step.stepType === "model_generation") {
|
|
422
|
+
totalTokens += (step.promptTokens ?? 0) + (step.completionTokens ?? 0);
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
const totalDurationMs = trajectory.totalDurationMs ?? trajectory.steps.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
426
|
+
const redundantCalls = [];
|
|
427
|
+
if (noRedundantCalls) {
|
|
428
|
+
for (let i = 1; i < trajectory.steps.length; i++) {
|
|
429
|
+
const prev = trajectory.steps[i - 1];
|
|
430
|
+
const curr = trajectory.steps[i];
|
|
431
|
+
if (prev.name === curr.name && prev.stepType === curr.stepType && (prev.stepType === "tool_call" || prev.stepType === "mcp_tool_call")) {
|
|
432
|
+
const prevArgs = prev.toolArgs;
|
|
433
|
+
const currArgs = curr.toolArgs;
|
|
434
|
+
try {
|
|
435
|
+
if (JSON.stringify(prevArgs) === JSON.stringify(currArgs)) {
|
|
436
|
+
redundantCalls.push({ name: curr.name, index: i });
|
|
437
|
+
}
|
|
438
|
+
} catch {
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
const overStepBudget = maxSteps !== void 0 && totalSteps > maxSteps;
|
|
444
|
+
const overTokenBudget = maxTotalTokens !== void 0 && totalTokens > maxTotalTokens;
|
|
445
|
+
const overDurationBudget = maxTotalDurationMs !== void 0 && totalDurationMs > maxTotalDurationMs;
|
|
446
|
+
const dimensions = [];
|
|
447
|
+
if (maxSteps !== void 0) {
|
|
448
|
+
dimensions.push(overStepBudget ? Math.max(0, 1 - (totalSteps - maxSteps) / maxSteps) : 1);
|
|
449
|
+
}
|
|
450
|
+
if (maxTotalTokens !== void 0) {
|
|
451
|
+
dimensions.push(overTokenBudget ? Math.max(0, 1 - (totalTokens - maxTotalTokens) / maxTotalTokens) : 1);
|
|
452
|
+
}
|
|
453
|
+
if (maxTotalDurationMs !== void 0) {
|
|
454
|
+
dimensions.push(
|
|
455
|
+
overDurationBudget ? Math.max(0, 1 - (totalDurationMs - maxTotalDurationMs) / maxTotalDurationMs) : 1
|
|
456
|
+
);
|
|
457
|
+
}
|
|
458
|
+
if (noRedundantCalls) {
|
|
459
|
+
dimensions.push(redundantCalls.length === 0 ? 1 : Math.max(0, 1 - redundantCalls.length * 0.2));
|
|
460
|
+
}
|
|
461
|
+
const score = dimensions.length > 0 ? dimensions.reduce((a, b) => a + b, 0) / dimensions.length : 1;
|
|
462
|
+
return {
|
|
463
|
+
score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
|
|
464
|
+
totalSteps,
|
|
465
|
+
overStepBudget,
|
|
466
|
+
totalTokens,
|
|
467
|
+
overTokenBudget,
|
|
468
|
+
totalDurationMs,
|
|
469
|
+
overDurationBudget,
|
|
470
|
+
redundantCalls
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
/**
 * Checks a trajectory against forbidden tools and forbidden tool sequences.
 *
 * @param {{ steps: Array<{ name: string }> }} trajectory - Trajectory whose
 *   `steps` are inspected by their `name` only.
 * @param {object} [options]
 * @param {string[]} [options.blacklistedTools=[]] - Step names that must not
 *   appear anywhere in the trajectory.
 * @param {string[][]} [options.blacklistedSequences=[]] - Ordered runs of step
 *   names that must not appear contiguously. Empty sequences are ignored.
 * @returns {{ score: number, violatedTools: string[], violatedSequences: string[][] }}
 *   `score` is 0 when at least one tool or sequence violation was found and 1
 *   otherwise. Violations are listed in the order the options supplied them;
 *   each violated sequence is reported once, as the same array that was passed in.
 */
function checkTrajectoryBlacklist(trajectory, options = {}) {
  const { blacklistedTools = [], blacklistedSequences = [] } = options;
  const stepNames = trajectory.steps.map((step) => step.name);

  // Build the membership set once so each forbidden tool is a constant-time
  // lookup instead of a linear scan over every step.
  const usedNames = new Set(stepNames);
  const violatedTools = [];
  for (const forbidden of blacklistedTools) {
    if (usedNames.has(forbidden)) {
      violatedTools.push(forbidden);
    }
  }

  const violatedSequences = [];
  for (const sequence of blacklistedSequences) {
    if (sequence.length === 0) continue;
    // Last index at which the whole sequence still fits inside the trajectory.
    const lastStart = stepNames.length - sequence.length;
    for (let start = 0; start <= lastStart; start++) {
      const matchesHere = sequence.every((name, offset) => stepNames[start + offset] === name);
      if (matchesHere) {
        violatedSequences.push(sequence);
        // One occurrence is enough; do not report the same sequence twice.
        break;
      }
    }
  }

  const hasViolations = violatedTools.length > 0 || violatedSequences.length > 0;
  return {
    score: hasViolations ? 0 : 1,
    violatedTools,
    violatedSequences
  };
}
|
|
506
|
+
/**
 * Detects retry patterns among the tool calls of a trajectory and scores them.
 *
 * Only steps whose `stepType` is "tool_call" or "mcp_tool_call" are considered.
 * Consecutive tool-call steps with the same `name` form a run; within a run,
 * every call that directly follows a call with `success === false` counts as
 * one retry. Runs without any retry produce no pattern entry.
 *
 * @param {{ steps: Array<{ stepType: string, name: string, success?: boolean }> }} trajectory
 * @param {object} [options]
 * @param {number} [options.maxRetriesPerTool=2] - Retries tolerated per run
 *   before they start to lower the score.
 * @returns {{
 *   score: number,
 *   patterns: Array<{
 *     toolName: string,
 *     retryCount: number,
 *     fellBackToAlternative: boolean,
 *     alternativeTool: (string|undefined),
 *     eventuallySucceeded: boolean
 *   }>,
 *   totalRetries: number,
 *   excessiveRetryTools: string[]
 * }} `score` starts at 1 and loses 0.2 for every retry beyond
 *   `maxRetriesPerTool` (summed over all runs), floored at 0 and passed
 *   through `roundToTwoDecimals`. A trajectory without tool calls scores 1.
 */
function analyzeToolFailures(trajectory, options = {}) {
  const { maxRetriesPerTool = 2 } = options;
  const toolCallSteps = trajectory.steps.filter(
    (step) => step.stepType === "tool_call" || step.stepType === "mcp_tool_call"
  );
  if (toolCallSteps.length === 0) {
    return { score: 1, patterns: [], totalRetries: 0, excessiveRetryTools: [] };
  }

  const patterns = [];
  let totalRetries = 0;
  let runStart = 0;
  while (runStart < toolCallSteps.length) {
    const currentTool = toolCallSteps[runStart];
    let retryCount = 0;
    // Advance runEnd to the first tool call with a different name.
    let runEnd = runStart + 1;
    while (runEnd < toolCallSteps.length && toolCallSteps[runEnd].name === currentTool.name) {
      // A repeated call only counts as a retry when the call before it failed.
      if (toolCallSteps[runEnd - 1].success === false) {
        retryCount++;
      }
      runEnd++;
    }
    if (retryCount > 0) {
      const lastAttempt = toolCallSteps[runEnd - 1];
      // Anything other than an explicit `false` is treated as success.
      const eventuallySucceeded = lastAttempt.success !== false;
      // A fallback is the next, differently named tool call after a run that
      // ended in failure.
      const fallbackStep =
        !eventuallySucceeded && runEnd < toolCallSteps.length ? toolCallSteps[runEnd] : void 0;
      patterns.push({
        toolName: currentTool.name,
        retryCount,
        fellBackToAlternative: fallbackStep !== void 0,
        alternativeTool: fallbackStep !== void 0 ? fallbackStep.name : void 0,
        eventuallySucceeded
      });
      totalRetries += retryCount;
    }
    runStart = runEnd;
  }

  const excessiveRetryTools = patterns
    .filter((pattern) => pattern.retryCount > maxRetriesPerTool)
    .map((pattern) => pattern.toolName);
  // No emptiness guard is needed here: the empty case returned early above.
  const excessRetries = patterns.reduce(
    (sum, pattern) => sum + Math.max(0, pattern.retryCount - maxRetriesPerTool),
    0
  );
  const score = Math.max(0, Math.min(1, 1 - excessRetries * 0.2));
  return {
    score: roundToTwoDecimals(score),
    patterns,
    totalRetries,
    excessiveRetryTools
  };
}
|
|
554
|
+
|
|
555
|
+
// Public CommonJS surface of this chunk.
// `extractTrajectory` is forwarded through a getter that reads
// `evals.extractTrajectory` on every access (`evals` is bound elsewhere in
// this file), so consumers always see that object's current value.
Object.defineProperty(exports, "extractTrajectory", {
  enumerable: true,
  get: function () { return evals.extractTrajectory; }
});
// The remaining helpers are defined in this chunk and exported directly.
// NOTE(review): the plain `exports.name = name` form is kept on purpose;
// Node's static named-export detection for CommonJS presumably relies on it,
// so do not collapse these into `Object.assign` — confirm before changing.
exports.analyzeToolFailures = analyzeToolFailures;
exports.checkTrajectoryBlacklist = checkTrajectoryBlacklist;
exports.checkTrajectoryEfficiency = checkTrajectoryEfficiency;
exports.compareTrajectories = compareTrajectories;
exports.createAgentTestRun = createAgentTestRun;
exports.createTestMessage = createTestMessage;
exports.createTestRun = createTestRun;
exports.createToolInvocation = createToolInvocation;
exports.createTrajectoryTestRun = createTrajectoryTestRun;
exports.extractAgentResponseMessages = extractAgentResponseMessages;
exports.extractInputMessages = extractInputMessages;
exports.extractToolCalls = extractToolCalls;
exports.extractToolResults = extractToolResults;
exports.getAssistantMessageFromRunOutput = getAssistantMessageFromRunOutput;
exports.getCombinedSystemPrompt = getCombinedSystemPrompt;
exports.getReasoningFromRunOutput = getReasoningFromRunOutput;
exports.getSystemMessagesFromRunInput = getSystemMessagesFromRunInput;
exports.getTextContentFromMastraDBMessage = getTextContentFromMastraDBMessage;
exports.getUserMessageFromRunInput = getUserMessageFromRunInput;
exports.isCloserTo = isCloserTo;
exports.roundToTwoDecimals = roundToTwoDecimals;
//# sourceMappingURL=chunk-AY4K3J4R.cjs.map
|