@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -0,0 +1,632 @@
1
+ 'use strict';
2
+
3
+ var requestContext = require('@mastra/core/request-context');
4
+ var evals = require('@mastra/core/evals');
5
+
6
+ // src/scorers/utils.ts
7
/**
 * Extracts the text of a message.
 *
 * Prefers `message.content.content` when it is a non-empty string; otherwise
 * falls back to the text of the LAST part of type "text" found in
 * `message.content.parts`.
 *
 * @param {object} message - Message whose `content` object carries `content` and/or `parts`.
 * @returns {string} The extracted text, or "" when no text is available.
 */
function getTextContentFromMastraDBMessage(message) {
  const { content: text, parts } = message.content;
  if (typeof text === "string" && text !== "") {
    return text;
  }
  if (Array.isArray(parts)) {
    const textParts = parts.filter((part) => part.type === "text");
    // Only the final text part is used; earlier text parts are ignored.
    return textParts.at(-1)?.text || "";
  }
  return "";
}
17
/**
 * Rounds a number to two decimal places.
 *
 * `Number.EPSILON` is added before scaling so that values sitting just below
 * a rounding boundary because of binary representation (e.g. 1.005) round up.
 *
 * @param {number} num - Value to round.
 * @returns {number} The value rounded to two decimals.
 */
const roundToTwoDecimals = (num) => Math.round((num + Number.EPSILON) * 100) / 100;
20
/**
 * Tells whether `value` is strictly closer to `target1` than to `target2`.
 *
 * @param {number} value - Value being classified.
 * @param {number} target1 - First reference point.
 * @param {number} target2 - Second reference point.
 * @returns {boolean} True only when the distance to `target1` is strictly
 *   smaller; a tie returns false.
 */
function isCloserTo(value, target1, target2) {
  const distanceToFirst = Math.abs(value - target1);
  const distanceToSecond = Math.abs(value - target2);
  return distanceToFirst < distanceToSecond;
}
23
/**
 * Builds a minimal run object from a single input/output pair.
 *
 * @param {*} input - Stored as the content of a single "user" message.
 * @param {*} output - Stored as the `text` of the "assistant" output.
 * @param {object} [additionalContext] - Defaults to `{}` when null/undefined.
 * @param {object} [requestContext] - Defaults to `{}` when null/undefined.
 * @returns {{input: Array, output: object, additionalContext: object, requestContext: object}}
 */
const createTestRun = (input, output, additionalContext, requestContext) => ({
  input: [{ role: "user", content: input }],
  output: { role: "assistant", text: output },
  additionalContext: additionalContext ?? {},
  requestContext: requestContext ?? {}
});
31
/**
 * Returns the text of the first message with role "user" in
 * `input.inputMessages`.
 *
 * @param {object} [input] - Run input; may be undefined or lack `inputMessages`.
 * @returns {string|undefined} The message text, or undefined when there is no
 *   input, no `inputMessages`, or no user message.
 */
const getUserMessageFromRunInput = (input) => {
  // `inputMessages` is optional-chained too, so a run input without that field
  // yields undefined instead of throwing (same guard as extractInputMessages).
  const userMessage = input?.inputMessages?.find(({ role }) => role === "user");
  return userMessage ? getTextContentFromMastraDBMessage(userMessage) : void 0;
};
35
/**
 * Collects system-message texts from a run input.
 *
 * - `input.systemMessages`: string content is taken as-is; array content is
 *   reduced to its "text" parts joined with a single space; anything else
 *   becomes "". Empty results are dropped.
 * - `input.taggedSystemMessages`: for every tag, only messages whose content
 *   is a string are added (including empty strings); non-string content is
 *   skipped.
 *
 * @param {object} [input] - Run input; both fields are optional.
 * @returns {string[]} Texts from `systemMessages` followed by texts from
 *   `taggedSystemMessages`.
 */
const getSystemMessagesFromRunInput = (input) => {
  const collected = [];

  if (input?.systemMessages) {
    const texts = input.systemMessages.map(({ content }) => {
      if (typeof content === "string") {
        return content;
      }
      if (Array.isArray(content)) {
        return content
          .filter((part) => part.type === "text")
          .map((part) => part.text || "")
          .join(" ");
      }
      return "";
    });
    collected.push(...texts.filter((text) => text));
  }

  if (input?.taggedSystemMessages) {
    for (const messages of Object.values(input.taggedSystemMessages)) {
      messages.forEach(({ content }) => {
        if (typeof content === "string") {
          collected.push(content);
        }
      });
    }
  }

  return collected;
};
60
/**
 * Joins every system message returned by `getSystemMessagesFromRunInput`
 * into one prompt string, separated by a blank line.
 *
 * @param {object} [input] - Run input forwarded to `getSystemMessagesFromRunInput`.
 * @returns {string} The combined prompt ("" when there are no system messages).
 */
const getCombinedSystemPrompt = (input) => getSystemMessagesFromRunInput(input).join("\n\n");
64
/**
 * Returns the text of the first message with role "assistant" in `output`.
 *
 * @param {Array<object>} [output] - Run output messages; may be undefined.
 * @returns {string|undefined} The message text, or undefined when there is no
 *   output or no assistant message.
 */
const getAssistantMessageFromRunOutput = (output) => {
  const assistantMessage = output?.find(({ role }) => role === "assistant");
  if (!assistantMessage) {
    return void 0;
  }
  return getTextContentFromMastraDBMessage(assistantMessage);
};
68
/**
 * Extracts reasoning text from the first "assistant" message in `output`.
 *
 * Resolution order:
 *  1. `message.content.reasoning`, when truthy, is returned as-is.
 *  2. Otherwise every part of type "reasoning" contributes one text: the
 *     concatenation of its "text" `details` when `details` is an array, else
 *     its `reasoning` field. Empty texts are dropped and the rest are joined
 *     with newlines.
 *
 * @param {Array<object>} [output] - Run output messages.
 * @returns {*} The reasoning, or undefined when none is found.
 */
const getReasoningFromRunOutput = (output) => {
  if (!output) return void 0;

  const assistantMessage = output.find(({ role }) => role === "assistant");
  if (!assistantMessage) return void 0;

  const { reasoning, parts } = assistantMessage.content;
  if (reasoning) {
    return reasoning;
  }

  const reasoningParts = parts?.filter((part) => part.type === "reasoning");
  if (!reasoningParts || reasoningParts.length === 0) {
    return void 0;
  }

  const texts = reasoningParts
    .map((part) => {
      if (part.details && Array.isArray(part.details)) {
        return part.details
          .filter((detail) => detail.type === "text")
          .map((detail) => detail.text)
          .join("");
      }
      return part.reasoning || "";
    })
    .filter(Boolean);

  return texts.length > 0 ? texts.join("\n") : void 0;
};
87
/**
 * Builds a tool-invocation record from its parts.
 *
 * @param {object} params
 * @param {*} params.toolCallId - Identifier of the call.
 * @param {*} params.toolName - Name of the tool.
 * @param {*} params.args - Arguments passed to the tool.
 * @param {*} params.result - Result of the call.
 * @param {string} [params.state="result"] - Invocation state.
 * @returns {{toolCallId: *, toolName: *, args: *, result: *, state: string}}
 */
const createToolInvocation = ({ toolCallId, toolName, args, result, state = "result" }) => ({
  toolCallId,
  toolName,
  args,
  result,
  state
});
102
/**
 * Builds a test message whose `content` object has `format: 2`, a single
 * text part, and the raw `content` string.
 *
 * `content.toolInvocations` is only present when at least one invocation is
 * supplied; each invocation is copied field by field (toolCallId, toolName,
 * args, result, state), so extra properties are not carried over.
 *
 * @param {object} params
 * @param {string} params.content - Message text.
 * @param {string} params.role - Message role.
 * @param {string} [params.id="test-message"] - Message id.
 * @param {Array<object>} [params.toolInvocations=[]] - Tool invocations to attach.
 * @returns {object} Message with `id`, `role`, `content`, and a fresh `createdAt` Date.
 */
function createTestMessage({ content, role, id = "test-message", toolInvocations = [] }) {
  const messageContent = {
    format: 2,
    parts: [{ type: "text", text: content }],
    content
  };

  if (toolInvocations.length > 0) {
    messageContent.toolInvocations = toolInvocations.map(({ toolCallId, toolName, args, result, state }) => ({
      toolCallId,
      toolName,
      args,
      result,
      state
    }));
  }

  return {
    id,
    role,
    content: messageContent,
    createdAt: /* @__PURE__ */ new Date()
  };
}
128
/**
 * Builds an agent run object for tests.
 *
 * @param {object} params
 * @param {Array} [params.inputMessages=[]]
 * @param {*} params.output - Stored unchanged as the run output.
 * @param {Array} [params.rememberedMessages=[]]
 * @param {Array} [params.systemMessages=[]]
 * @param {object} [params.taggedSystemMessages={}]
 * @param {object} [params.requestContext] - Defaults to a new `RequestContext`
 *   from the module-level `requestContext` import.
 * @param {string} [params.runId] - Defaults to `crypto.randomUUID()`.
 * @returns {{input: object, output: *, requestContext: object, runId: string}}
 */
const createAgentTestRun = ({
  inputMessages = [],
  output,
  rememberedMessages = [],
  systemMessages = [],
  taggedSystemMessages = {},
  // Aliased because the parameter name shadows the module-level import.
  requestContext: requestContext$1 = new requestContext.RequestContext(),
  runId = crypto.randomUUID()
}) => ({
  input: { inputMessages, rememberedMessages, systemMessages, taggedSystemMessages },
  output,
  requestContext: requestContext$1,
  runId
});
149
/**
 * Builds a trajectory run object for tests; the trajectory is stored as the
 * run's `output`.
 *
 * @param {object} params
 * @param {Array} [params.inputMessages=[]]
 * @param {*} params.trajectory - Stored unchanged as `output`.
 * @param {Array} [params.rememberedMessages=[]]
 * @param {Array} [params.systemMessages=[]]
 * @param {object} [params.taggedSystemMessages={}]
 * @param {object} [params.requestContext] - Defaults to a new `RequestContext`
 *   from the module-level `requestContext` import.
 * @param {string} [params.runId] - Defaults to `crypto.randomUUID()`.
 * @param {*} [params.expectedTrajectory] - Stored unchanged.
 * @returns {{input: object, output: *, expectedTrajectory: *, requestContext: object, runId: string}}
 */
const createTrajectoryTestRun = ({
  inputMessages = [],
  trajectory,
  rememberedMessages = [],
  systemMessages = [],
  taggedSystemMessages = {},
  // Aliased because the parameter name shadows the module-level import.
  requestContext: requestContext$1 = new requestContext.RequestContext(),
  runId = crypto.randomUUID(),
  expectedTrajectory
}) => ({
  input: { inputMessages, rememberedMessages, systemMessages, taggedSystemMessages },
  output: trajectory,
  expectedTrajectory,
  requestContext: requestContext$1,
  runId
});
172
/**
 * Lists the tool calls found in `content.toolInvocations` of each message.
 *
 * Only invocations that have a `toolName` and whose state is "result" or
 * "call" are included. When an invocation has no `toolCallId`, the id falls
 * back to `"<messageIndex>-<invocationIndex>"`.
 *
 * @param {Array<object>} output - Run output messages.
 * @returns {{tools: string[], toolCallInfos: Array<{toolName: string, toolCallId: string, messageIndex: number, invocationIndex: number}>}}
 *   Tool names in encounter order plus positional details for each call.
 */
function extractToolCalls(output) {
  const tools = [];
  const toolCallInfos = [];

  for (let messageIndex = 0; messageIndex < output.length; messageIndex++) {
    const invocations = output[messageIndex]?.content?.toolInvocations;
    if (!invocations) continue;

    for (let invocationIndex = 0; invocationIndex < invocations.length; invocationIndex++) {
      const invocation = invocations[invocationIndex];
      const isCountedState = invocation?.state === "result" || invocation?.state === "call";
      if (!invocation || !invocation.toolName || !isCountedState) continue;

      tools.push(invocation.toolName);
      toolCallInfos.push({
        toolName: invocation.toolName,
        toolCallId: invocation.toolCallId || `${messageIndex}-${invocationIndex}`,
        messageIndex,
        invocationIndex
      });
    }
  }

  return { tools, toolCallInfos };
}
194
/**
 * Maps every entry of `runInput.inputMessages` to its text content.
 *
 * @param {object} [runInput] - Run input; may be undefined or lack `inputMessages`.
 * @returns {string[]} One text per input message, or [] when there are none.
 */
const extractInputMessages = (runInput) => {
  const texts = runInput?.inputMessages?.map((message) => getTextContentFromMastraDBMessage(message));
  return texts || [];
};
197
/**
 * Returns the text content of every message with role "assistant".
 *
 * @param {Array<object>} runOutput - Run output messages.
 * @returns {string[]} One text per assistant message, in order.
 */
const extractAgentResponseMessages = (runOutput) => {
  const assistantMessages = runOutput.filter((message) => message.role === "assistant");
  return assistantMessages.map((message) => getTextContentFromMastraDBMessage(message));
};
200
/**
 * Collects completed tool invocations from `content.toolInvocations` of each
 * message.
 *
 * An invocation is included only when its state is "result" and its `result`
 * is not undefined (null and other falsy results are kept).
 *
 * @param {Iterable<object>} output - Run output messages.
 * @returns {Array<{toolName: *, toolCallId: string, args: object, result: *}>}
 *   `toolCallId` falls back to "" and `args` to `{}` when missing.
 */
function extractToolResults(output) {
  const results = [];
  for (const message of output) {
    const invocations = message?.content?.toolInvocations;
    if (!invocations) continue;

    for (const invocation of invocations) {
      const isCompleted = invocation.state === "result" && invocation.result !== void 0;
      if (!isCompleted) continue;

      results.push({
        toolName: invocation.toolName,
        toolCallId: invocation.toolCallId || "",
        args: invocation.args || {},
        result: invocation.result
      });
    }
  }
  return results;
}
218
/**
 * Compares an actual trajectory against an expected one and returns a score
 * with diagnostic lists.
 *
 * Expected steps that carry any trajectory-step field (toolArgs, toolResult,
 * agentId, modelId, durationMs, success, promptTokens, completionTokens) are
 * first normalized to `{ name, stepType, data? }`, where `data.input` /
 * `data.output` come from `getStepData`.
 *
 * Ordering mode: `options.ordering` wins when set; otherwise
 * `options.strictOrder` selects "strict"; otherwise "relaxed".
 *
 * @param {{steps: Array<object>}} actual - Trajectory that was produced.
 * @param {{steps: Array<object>}} expected - Trajectory that was expected.
 * @param {object} [options]
 * @param {boolean} [options.compareStepData=false] - Also compare step data.
 * @param {boolean} [options.allowRepeatedSteps=true] - When false, repeated
 *   step names are penalized by the mode-specific comparer.
 * @param {string} [options.ordering] - "strict", "unordered", or anything else for relaxed.
 * @param {boolean} [options.strictOrder] - Selects strict mode when `ordering` is unset.
 * @returns {{score: number, matchedSteps: number, totalExpectedSteps: number,
 *   totalActualSteps: number, missingSteps: Array, extraSteps: Array,
 *   outOfOrderSteps: Array, repeatedSteps: Array}}
 */
function compareTrajectories(actual, expected, options = {}) {
  const { compareStepData = false, allowRepeatedSteps = true } = options;

  const trajectoryStepKeys = [
    "toolArgs",
    "toolResult",
    "agentId",
    "modelId",
    "durationMs",
    "success",
    "promptTokens",
    "completionTokens"
  ];
  const hasTrajectorySteps =
    expected.steps.length > 0 && expected.steps.some((step) => trajectoryStepKeys.some((key) => key in step));

  const normalizedExpected = hasTrajectorySteps
    ? {
        steps: expected.steps.map((step) => {
          const { input, output } = getStepData(step);
          const data = {};
          if (input !== void 0) data.input = input;
          if (output !== void 0) data.output = output;
          return {
            name: step.name,
            stepType: step.stepType,
            ...(Object.keys(data).length > 0 ? { data } : {})
          };
        })
      }
    : expected;

  const ordering = options.ordering || (options.strictOrder ? "strict" : "relaxed");

  // Nothing expected: perfect score only when nothing happened either.
  if (normalizedExpected.steps.length === 0) {
    return {
      score: actual.steps.length === 0 ? 1 : 0,
      matchedSteps: 0,
      totalExpectedSteps: 0,
      totalActualSteps: actual.steps.length,
      missingSteps: [],
      extraSteps: actual.steps.map((step) => step.name),
      outOfOrderSteps: [],
      repeatedSteps: []
    };
  }

  // Names that occur more than once in the actual trajectory.
  const nameCounts = /* @__PURE__ */ new Map();
  for (const { name } of actual.steps) {
    nameCounts.set(name, (nameCounts.get(name) || 0) + 1);
  }
  const repeatedSteps = [...nameCounts.entries()].filter(([, count]) => count > 1).map(([name]) => name);

  const comparerOptions = { compareStepData, allowRepeatedSteps, repeatedSteps };
  if (ordering === "strict") {
    return compareStrictOrder(actual, normalizedExpected, comparerOptions);
  }
  if (ordering === "unordered") {
    return compareUnorderedPresence(actual, normalizedExpected, comparerOptions);
  }
  return compareRelaxedOrder(actual, normalizedExpected, comparerOptions);
}
285
/**
 * Position-by-position comparison: expected step `i` must be matched by
 * actual step `i`.
 *
 * Scoring: matched / expected count; minus half of the relative surplus when
 * the actual trajectory is longer; minus 0.1 per repeated step name when
 * repeats are disallowed; clamped to [0, 1] and rounded to two decimals.
 *
 * Not exported. `compareTrajectories` returns early for an empty expected
 * trajectory, so the divisions by the expected length are non-zero there.
 *
 * @param {{steps: Array<object>}} actual
 * @param {{steps: Array<object>}} expected
 * @param {{compareStepData: boolean, allowRepeatedSteps: boolean, repeatedSteps: Array}} opts
 * @returns {object} Same result shape as `compareTrajectories`.
 */
function compareStrictOrder(actual, expected, opts) {
  const actualNames = actual.steps.map((step) => step.name);
  const expectedNames = expected.steps.map((step) => step.name);
  const expectedNameSet = new Set(expectedNames);
  const matchedExpectedIndices = /* @__PURE__ */ new Set();
  const outOfOrderSteps = [];
  let matchedSteps = 0;

  const maxLen = Math.max(actualNames.length, expectedNames.length);
  for (let i = 0; i < maxLen; i++) {
    const actualStep = actual.steps[i];
    const expectedStep = expected.steps[i];
    const actualName = actualNames[i];

    // A match needs a real step on BOTH sides; a missing position must never
    // count as matching an unnamed step.
    if (actualStep && expectedStep && actualName === expectedNames[i]) {
      if (expectedStepMatches(actualStep, expectedStep, opts.compareStepData)) {
        matchedSteps++;
        matchedExpectedIndices.add(i);
      }
    } else if (actualName && expectedNameSet.has(actualName)) {
      // Expected somewhere, just not at this position.
      outOfOrderSteps.push(actualName);
    }
  }

  const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
  const extraSteps = actualNames.filter((name) => !expectedNameSet.has(name));

  let score = matchedSteps / expected.steps.length;
  if (actualNames.length > expectedNames.length) {
    const extraPenalty = (actualNames.length - expectedNames.length) / expectedNames.length;
    score = Math.max(0, score - extraPenalty * 0.5);
  }
  if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
    score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
  }

  return {
    score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
    matchedSteps,
    totalExpectedSteps: expected.steps.length,
    totalActualSteps: actual.steps.length,
    missingSteps,
    extraSteps,
    outOfOrderSteps,
    repeatedSteps: opts.repeatedSteps
  };
}
335
/**
 * Subsequence comparison: expected steps must appear in the actual trajectory
 * in the same relative order, but other steps may sit between them.
 *
 * For each expected step the scan starts just after the previously matched
 * actual index. An expected name that exists in the actual trajectory but
 * could not be matched from that point on is reported as out of order.
 *
 * Scoring: matched / expected count; minus 0.1 per repeated step name when
 * repeats are disallowed; clamped to [0, 1] and rounded to two decimals.
 *
 * Not exported. `compareTrajectories` returns early for an empty expected
 * trajectory, so the division by the expected length is non-zero there.
 *
 * @param {{steps: Array<object>}} actual
 * @param {{steps: Array<object>}} expected
 * @param {{compareStepData: boolean, allowRepeatedSteps: boolean, repeatedSteps: Array}} opts
 * @returns {object} Same result shape as `compareTrajectories`.
 */
function compareRelaxedOrder(actual, expected, opts) {
  const actualNames = actual.steps.map((step) => step.name);
  const expectedNames = expected.steps.map((step) => step.name);
  const actualNameSet = new Set(actualNames);
  const expectedNameSet = new Set(expectedNames);
  const matchedExpectedIndices = /* @__PURE__ */ new Set();
  const outOfOrderSteps = [];
  let matchedSteps = 0;
  let lastMatchedIndex = -1;

  for (let i = 0; i < expectedNames.length; i++) {
    const expectedName = expectedNames[i];
    let found = false;

    for (let j = lastMatchedIndex + 1; j < actualNames.length; j++) {
      if (actualNames[j] !== expectedName) continue;
      // A same-named step that fails the type/data check does not stop the
      // scan; a later occurrence may still match.
      if (expectedStepMatches(actual.steps[j], expected.steps[i], opts.compareStepData)) {
        matchedSteps++;
        lastMatchedIndex = j;
        matchedExpectedIndices.add(i);
        found = true;
        break;
      }
    }

    if (!found && actualNameSet.has(expectedName)) {
      outOfOrderSteps.push(expectedName);
    }
  }

  const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
  const extraSteps = actualNames.filter((name) => !expectedNameSet.has(name));

  let score = matchedSteps / expected.steps.length;
  if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
    score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
  }

  return {
    score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
    matchedSteps,
    totalExpectedSteps: expected.steps.length,
    totalActualSteps: actual.steps.length,
    missingSteps,
    extraSteps,
    outOfOrderSteps,
    repeatedSteps: opts.repeatedSteps
  };
}
388
/**
 * Returns the comparable input/output payload of a trajectory step.
 *
 * @param {object} step - Step with a `stepType` discriminator.
 * @returns {{input?: *, output?: *}} `toolArgs`/`toolResult` for "tool_call"
 *   and "mcp_tool_call", `output` for "workflow_step", `{}` for any other type.
 */
function getStepData(step) {
  switch (step.stepType) {
    case "tool_call":
    case "mcp_tool_call":
      return { input: step.toolArgs, output: step.toolResult };
    case "workflow_step":
      return { output: step.output };
    default:
      return {};
  }
}

/**
 * Serializes a value to JSON with the keys of plain objects sorted, so two
 * structurally equal values produce the same string regardless of the order
 * in which their properties were created. Array order is preserved.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Whatever `JSON.stringify` returns for the value.
 * @throws {TypeError} On values `JSON.stringify` rejects (cycles, BigInt).
 */
function canonicalStepJson(value) {
  return JSON.stringify(value, (_key, current) => {
    if (current === null || typeof current !== "object" || Array.isArray(current)) {
      return current;
    }
    const proto = Object.getPrototypeOf(current);
    if (proto !== Object.prototype && proto !== null) {
      return current;
    }
    return Object.fromEntries(
      Object.keys(current)
        .sort()
        .map((key) => [key, current[key]])
    );
  });
}

/**
 * Decides whether an actual step satisfies an expected step.
 *
 * Names must be equal. `stepType` is checked only when the expected step
 * declares one. When `compareData` is truthy and the expected step has
 * `data`, every key in it must be "input" or "output", the actual step must
 * provide that field (see `getStepData`), and both values must serialize to
 * the same JSON. Object key order is ignored; array order is not.
 *
 * @param {object} actual - Step from the actual trajectory.
 * @param {{name: *, stepType?: string, data?: object}} expected - Expected step.
 * @param {boolean} compareData - Whether to compare `expected.data`.
 * @returns {boolean} True when the actual step matches.
 */
function expectedStepMatches(actual, expected, compareData) {
  if (actual.name !== expected.name) return false;
  if (expected.stepType && actual.stepType !== expected.stepType) return false;
  if (!compareData || !expected.data) return true;

  const actualData = getStepData(actual);
  for (const [key, expectedValue] of Object.entries(expected.data)) {
    const actualValue = key === "input" ? actualData.input : key === "output" ? actualData.output : void 0;
    if (actualValue === void 0) return false;
    try {
      if (canonicalStepJson(actualValue) !== canonicalStepJson(expectedValue)) return false;
    } catch {
      // Values that cannot be serialized cannot be compared: treat as a mismatch.
      return false;
    }
  }
  return true;
}
416
/**
 * Order-insensitive comparison: every expected step must be matched by some
 * not-yet-used actual step, wherever it occurs.
 *
 * Each actual step can satisfy at most one expected step. Expected steps are
 * processed in order and take the first unused actual step that matches.
 *
 * Scoring: matched / expected count; minus 0.1 per repeated step name when
 * repeats are disallowed; clamped to [0, 1] and rounded to two decimals.
 *
 * Not exported. `compareTrajectories` returns early for an empty expected
 * trajectory, so the division by the expected length is non-zero there.
 *
 * @param {{steps: Array<object>}} actual
 * @param {{steps: Array<object>}} expected
 * @param {{compareStepData: boolean, allowRepeatedSteps: boolean, repeatedSteps: Array}} opts
 * @returns {object} Same result shape as `compareTrajectories`;
 *   `outOfOrderSteps` is always empty in this mode.
 */
function compareUnorderedPresence(actual, expected, opts) {
  const actualNames = actual.steps.map((step) => step.name);
  const expectedNames = expected.steps.map((step) => step.name);
  const matchedExpectedIndices = /* @__PURE__ */ new Set();
  const usedActualIndices = /* @__PURE__ */ new Set();
  const compareData = opts.compareStepData ? true : false;
  let matchedSteps = 0;

  for (let i = 0; i < expected.steps.length; i++) {
    const expectedStep = expected.steps[i];
    for (let j = 0; j < actual.steps.length; j++) {
      if (usedActualIndices.has(j)) continue;
      if (!expectedStepMatches(actual.steps[j], expectedStep, compareData)) continue;
      matchedSteps++;
      matchedExpectedIndices.add(i);
      usedActualIndices.add(j);
      break;
    }
  }

  const missingSteps = expectedNames.filter((_, i) => !matchedExpectedIndices.has(i));
  const expectedNameSet = new Set(expectedNames);
  const extraSteps = actualNames.filter((name) => !expectedNameSet.has(name));

  let score = matchedSteps / expected.steps.length;
  if (!opts.allowRepeatedSteps && opts.repeatedSteps.length > 0) {
    score = Math.max(0, score - opts.repeatedSteps.length * 0.1);
  }

  return {
    score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
    matchedSteps,
    totalExpectedSteps: expected.steps.length,
    totalActualSteps: actual.steps.length,
    missingSteps,
    extraSteps,
    outOfOrderSteps: [],
    // ordering not checked in unordered mode
    repeatedSteps: opts.repeatedSteps
  };
}
467
/**
 * Scores a trajectory against optional step, token and duration budgets and
 * against redundant tool calls.
 *
 * - Tokens are summed over "model_generation" steps only
 *   (`promptTokens + completionTokens`, missing values count as 0).
 * - Duration is `trajectory.totalDurationMs` when set, otherwise the sum of
 *   the steps' `durationMs`.
 * - A redundant call is a "tool_call"/"mcp_tool_call" step directly preceded
 *   by a step with the same name, same type and identical serialized `toolArgs`.
 *
 * Each configured budget contributes a dimension: 1 when within budget,
 * otherwise `1 - overshoot / budget` floored at 0. Redundancy (when enabled)
 * contributes `1 - 0.2 * redundantCalls`, floored at 0. The score is the mean
 * of the dimensions (1 when there are none), clamped and rounded to two decimals.
 *
 * @param {{steps: Array<object>, totalDurationMs?: number}} trajectory
 * @param {object} [options]
 * @param {number} [options.maxSteps]
 * @param {number} [options.maxTotalTokens]
 * @param {number} [options.maxTotalDurationMs]
 * @param {boolean} [options.noRedundantCalls=true]
 * @returns {{score: number, totalSteps: number, overStepBudget: boolean,
 *   totalTokens: number, overTokenBudget: boolean, totalDurationMs: number,
 *   overDurationBudget: boolean, redundantCalls: Array<{name: *, index: number}>}}
 */
function checkTrajectoryEfficiency(trajectory, options = {}) {
  const { maxSteps, maxTotalTokens, maxTotalDurationMs, noRedundantCalls = true } = options;
  const { steps } = trajectory;
  const totalSteps = steps.length;

  let totalTokens = 0;
  for (const step of steps) {
    if (step.stepType === "model_generation") {
      totalTokens += (step.promptTokens ?? 0) + (step.completionTokens ?? 0);
    }
  }

  const totalDurationMs = trajectory.totalDurationMs ?? steps.reduce((sum, step) => sum + (step.durationMs ?? 0), 0);

  const redundantCalls = [];
  if (noRedundantCalls) {
    for (let i = 1; i < steps.length; i++) {
      const previous = steps[i - 1];
      const current = steps[i];
      const isToolCall = previous.stepType === "tool_call" || previous.stepType === "mcp_tool_call";
      if (previous.name !== current.name || previous.stepType !== current.stepType || !isToolCall) {
        continue;
      }
      try {
        if (JSON.stringify(previous.toolArgs) === JSON.stringify(current.toolArgs)) {
          redundantCalls.push({ name: current.name, index: i });
        }
      } catch {
        // Deliberate best-effort: args that cannot be serialized cannot be
        // compared, so the pair is not reported as redundant.
      }
    }
  }

  const overStepBudget = maxSteps !== void 0 && totalSteps > maxSteps;
  const overTokenBudget = maxTotalTokens !== void 0 && totalTokens > maxTotalTokens;
  const overDurationBudget = maxTotalDurationMs !== void 0 && totalDurationMs > maxTotalDurationMs;

  // 1 within budget; otherwise decreases linearly with the relative overshoot.
  const budgetScore = (isOver, total, limit) => (isOver ? Math.max(0, 1 - (total - limit) / limit) : 1);

  const dimensions = [];
  if (maxSteps !== void 0) {
    dimensions.push(budgetScore(overStepBudget, totalSteps, maxSteps));
  }
  if (maxTotalTokens !== void 0) {
    dimensions.push(budgetScore(overTokenBudget, totalTokens, maxTotalTokens));
  }
  if (maxTotalDurationMs !== void 0) {
    dimensions.push(budgetScore(overDurationBudget, totalDurationMs, maxTotalDurationMs));
  }
  if (noRedundantCalls) {
    dimensions.push(redundantCalls.length === 0 ? 1 : Math.max(0, 1 - redundantCalls.length * 0.2));
  }

  const score = dimensions.length > 0 ? dimensions.reduce((a, b) => a + b, 0) / dimensions.length : 1;

  return {
    score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
    totalSteps,
    overStepBudget,
    totalTokens,
    overTokenBudget,
    totalDurationMs,
    overDurationBudget,
    redundantCalls
  };
}
524
/**
 * Checks a trajectory for forbidden step names and forbidden contiguous
 * sequences of step names.
 *
 * @param {{steps: Array<{name: *}>}} trajectory
 * @param {object} [options]
 * @param {Array} [options.blacklistedTools=[]] - Names that must not appear.
 * @param {Array<Array>} [options.blacklistedSequences=[]] - Name sequences
 *   that must not appear as consecutive steps; empty sequences are ignored.
 * @returns {{score: number, violatedTools: Array, violatedSequences: Array<Array>}}
 *   `score` is 0 when anything was violated, otherwise 1. Each violated
 *   sequence is reported once, however many times it occurs.
 */
function checkTrajectoryBlacklist(trajectory, options = {}) {
  const { blacklistedTools = [], blacklistedSequences = [] } = options;
  const stepNames = trajectory.steps.map((step) => step.name);
  const stepNameSet = new Set(stepNames);

  const violatedTools = blacklistedTools.filter((forbidden) => stepNameSet.has(forbidden));

  const violatedSequences = [];
  for (const sequence of blacklistedSequences) {
    if (sequence.length === 0) continue;

    const occursAt = (start) => {
      for (let offset = 0; offset < sequence.length; offset++) {
        if (stepNames[start + offset] !== sequence[offset]) return false;
      }
      return true;
    };

    const lastStart = stepNames.length - sequence.length;
    for (let start = 0; start <= lastStart; start++) {
      if (occursAt(start)) {
        violatedSequences.push(sequence);
        break;
      }
    }
  }

  const hasViolations = violatedTools.length > 0 || violatedSequences.length > 0;
  return {
    score: hasViolations ? 0 : 1,
    violatedTools,
    violatedSequences
  };
}
557
/**
 * Detects retry patterns among the tool-call steps of a trajectory.
 *
 * Only "tool_call" and "mcp_tool_call" steps are considered. They are grouped
 * into runs of consecutive calls with the same name; within a run, a call
 * counts as a retry when the call right before it has `success === false`.
 * Runs with at least one retry produce a pattern entry.
 *
 * Scoring: 1 minus 0.2 for every retry beyond `maxRetriesPerTool` (summed
 * over patterns), floored at 0 and rounded to two decimals.
 *
 * @param {{steps: Array<object>}} trajectory
 * @param {object} [options]
 * @param {number} [options.maxRetriesPerTool=2] - Retries tolerated per pattern.
 * @returns {{score: number,
 *   patterns: Array<{toolName: *, retryCount: number, fellBackToAlternative: boolean,
 *     alternativeTool: *, eventuallySucceeded: boolean}>,
 *   totalRetries: number, excessiveRetryTools: Array}}
 */
function analyzeToolFailures(trajectory, options = {}) {
  const { maxRetriesPerTool = 2 } = options;
  const toolCallSteps = trajectory.steps.filter(
    (step) => step.stepType === "tool_call" || step.stepType === "mcp_tool_call"
  );
  if (toolCallSteps.length === 0) {
    return { score: 1, patterns: [], totalRetries: 0, excessiveRetryTools: [] };
  }

  const patterns = [];
  let totalRetries = 0;
  let runStart = 0;

  while (runStart < toolCallSteps.length) {
    const toolName = toolCallSteps[runStart].name;
    let retryCount = 0;
    let runEnd = runStart + 1;

    // Extend the run over consecutive calls to the same tool.
    while (runEnd < toolCallSteps.length && toolCallSteps[runEnd].name === toolName) {
      if (toolCallSteps[runEnd - 1].success === false) {
        retryCount++;
      }
      runEnd++;
    }

    if (retryCount > 0) {
      const nextDifferentTool = runEnd < toolCallSteps.length ? toolCallSteps[runEnd] : void 0;
      // Anything other than an explicit `false` counts as success.
      const lastSucceeded = toolCallSteps[runEnd - 1].success !== false;
      const fellBack = nextDifferentTool !== void 0 && !lastSucceeded;
      patterns.push({
        toolName,
        retryCount,
        fellBackToAlternative: fellBack,
        alternativeTool: fellBack ? nextDifferentTool.name : void 0,
        eventuallySucceeded: lastSucceeded
      });
      totalRetries += retryCount;
    }

    runStart = runEnd;
  }

  const excessiveRetryTools = patterns
    .filter((pattern) => pattern.retryCount > maxRetriesPerTool)
    .map((pattern) => pattern.toolName);
  const excessRetries = patterns.reduce(
    (sum, pattern) => sum + Math.max(0, pattern.retryCount - maxRetriesPerTool),
    0
  );
  const score = Math.max(0, 1 - excessRetries * 0.2);

  return {
    score: roundToTwoDecimals(Math.max(0, Math.min(1, score))),
    patterns,
    totalRetries,
    excessiveRetryTools
  };
}
605
+
606
// `extractTrajectory` is re-exported from the `@mastra/core/evals` import
// through a getter, so each access reads the current binding.
Object.defineProperty(exports, "extractTrajectory", {
  enumerable: true,
  get: function () { return evals.extractTrajectory; }
});
exports.analyzeToolFailures = analyzeToolFailures;
exports.checkTrajectoryBlacklist = checkTrajectoryBlacklist;
exports.checkTrajectoryEfficiency = checkTrajectoryEfficiency;
exports.compareTrajectories = compareTrajectories;
exports.createAgentTestRun = createAgentTestRun;
exports.createTestMessage = createTestMessage;
exports.createTestRun = createTestRun;
exports.createToolInvocation = createToolInvocation;
exports.createTrajectoryTestRun = createTrajectoryTestRun;
exports.extractAgentResponseMessages = extractAgentResponseMessages;
exports.extractInputMessages = extractInputMessages;
exports.extractToolCalls = extractToolCalls;
exports.extractToolResults = extractToolResults;
exports.getAssistantMessageFromRunOutput = getAssistantMessageFromRunOutput;
exports.getCombinedSystemPrompt = getCombinedSystemPrompt;
exports.getReasoningFromRunOutput = getReasoningFromRunOutput;
exports.getSystemMessagesFromRunInput = getSystemMessagesFromRunInput;
exports.getTextContentFromMastraDBMessage = getTextContentFromMastraDBMessage;
exports.getUserMessageFromRunInput = getUserMessageFromRunInput;
exports.isCloserTo = isCloserTo;
exports.roundToTwoDecimals = roundToTwoDecimals;
//# sourceMappingURL=chunk-XRUR5PBK.cjs.map