@mastra/evals 1.2.0-alpha.0 → 1.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-EVBNIL5M.js';
1
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
2
2
  import { createScorer } from '@mastra/core/evals';
3
3
  import { z } from 'zod';
4
4
  import nlp from 'compromise';
@@ -2735,7 +2735,8 @@ function formatExpectedSteps(steps, indent = 0) {
2735
2735
  const prefix = " ".repeat(indent);
2736
2736
  return steps.map((step, i) => {
2737
2737
  const typeStr = step.stepType ? `[${step.stepType}] ` : "";
2738
- const dataStr = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
2738
+ const { name: _, stepType: _t, children: _c, ...fields } = step;
2739
+ const dataStr = Object.keys(fields).length > 0 ? ` (${JSON.stringify(fields)})` : "";
2739
2740
  let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
2740
2741
  if (step.children?.steps && step.children.steps.length > 0) {
2741
2742
  line += `
@@ -2764,22 +2765,15 @@ function createTrajectoryAccuracyScorerLLM({
2764
2765
  if (Array.isArray(staticExpectedTrajectory)) {
2765
2766
  expectedSteps = staticExpectedTrajectory;
2766
2767
  } else {
2767
- expectedSteps = staticExpectedTrajectory.steps.map((s) => {
2768
- const result = { name: s.name, stepType: s.stepType };
2769
- const data = {};
2770
- if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolArgs !== void 0)
2771
- data.input = s.toolArgs;
2772
- if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
2773
- data.output = s.toolResult;
2774
- if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
2775
- if (Object.keys(data).length > 0) result.data = data;
2776
- if (s.children && s.children.length > 0) {
2777
- result.children = {
2778
- steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
2779
- };
2768
+ const toExpectedStep = (s) => {
2769
+ const { durationMs: _, metadata: _m, children, ...rest } = s;
2770
+ const result = rest;
2771
+ if (children && children.length > 0) {
2772
+ result.children = { steps: children.map(toExpectedStep) };
2780
2773
  }
2781
2774
  return result;
2782
- });
2775
+ };
2776
+ expectedSteps = staticExpectedTrajectory.steps.map(toExpectedStep);
2783
2777
  }
2784
2778
  } else if (run.expectedTrajectory) {
2785
2779
  const expectation = run.expectedTrajectory;
@@ -3226,18 +3220,11 @@ function createToolCallAccuracyScorerCode(options) {
3226
3220
  });
3227
3221
  }
3228
3222
  function trajectoryStepToExpectedStep(step) {
3229
- const result = { name: step.name, stepType: step.stepType };
3230
- const data = {};
3231
- if (step.stepType === "tool_call" || step.stepType === "mcp_tool_call") {
3232
- if (step.toolArgs !== void 0) data.input = step.toolArgs;
3233
- if (step.toolResult !== void 0) data.output = step.toolResult;
3234
- } else if (step.stepType === "workflow_step") {
3235
- if (step.output !== void 0) data.output = step.output;
3236
- }
3237
- if (Object.keys(data).length > 0) result.data = data;
3238
- if (step.children && step.children.length > 0) {
3223
+ const { durationMs: _, metadata: _m, children, ...rest } = step;
3224
+ const result = rest;
3225
+ if (children && children.length > 0) {
3239
3226
  result.children = {
3240
- steps: step.children.map(trajectoryStepToExpectedStep)
3227
+ steps: children.map(trajectoryStepToExpectedStep)
3241
3228
  };
3242
3229
  }
3243
3230
  return result;
@@ -3248,15 +3235,14 @@ function expectationToExpectedSteps(expectation) {
3248
3235
  }
3249
3236
  function createTrajectoryAccuracyScorerCode(options = {}) {
3250
3237
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
3251
- const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
3252
- const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
3238
+ const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;
3253
3239
  const staticExpectedSteps = staticExpectedTrajectory ? Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0] || false) ? staticExpectedTrajectory : "steps" in staticExpectedTrajectory ? staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep) : void 0 : void 0;
3254
3240
  const getDescription = () => {
3255
3241
  if (staticExpectedSteps) {
3256
3242
  const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
3257
- return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
3243
+ return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${ordering} ordering)`;
3258
3244
  }
3259
- return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
3245
+ return `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;
3260
3246
  };
3261
3247
  return createScorer({
3262
3248
  id: "code-trajectory-accuracy-scorer",
@@ -3281,15 +3267,13 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
3281
3267
  };
3282
3268
  }
3283
3269
  const itemExpectation = run.expectedTrajectory;
3284
- const effectiveOrdering = itemExpectation?.ordering ?? resolvedOrdering;
3285
- const effectiveCompareData = itemExpectation?.compareStepData ?? compareStepData;
3270
+ const effectiveOrdering = itemExpectation?.ordering ?? ordering;
3286
3271
  const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
3287
3272
  const comparison = compareTrajectories(
3288
3273
  actualTrajectory,
3289
3274
  { steps: resolvedExpectedSteps },
3290
3275
  {
3291
3276
  ordering: effectiveOrdering,
3292
- compareStepData: effectiveCompareData,
3293
3277
  allowRepeatedSteps: effectiveAllowRepeated
3294
3278
  }
3295
3279
  );
@@ -3308,7 +3292,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
3308
3292
  return preprocessResult.comparison.score;
3309
3293
  });
3310
3294
  }
3311
- function evaluateNestedExpectations(expectedSteps, actualSteps) {
3295
+ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
3312
3296
  const results = [];
3313
3297
  const matchedIndices = /* @__PURE__ */ new Set();
3314
3298
  for (const expectedStep of expectedSteps) {
@@ -3348,7 +3332,6 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
3348
3332
  { steps: childConfig.steps },
3349
3333
  {
3350
3334
  ordering: childConfig.ordering ?? "relaxed",
3351
- compareStepData: childConfig.compareStepData ?? false,
3352
3335
  allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
3353
3336
  }
3354
3337
  );
@@ -3368,22 +3351,23 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
3368
3351
  const toolFailures = analyzeToolFailures(childTrajectory, {
3369
3352
  maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
3370
3353
  });
3371
- const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
3354
+ const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
3372
3355
  const scores = [];
3373
- if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
3374
- if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
3375
- if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
3356
+ if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
3357
+ if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
3358
+ if (toolFailures && toolFailures.patterns.length > 0)
3359
+ scores.push({ weight: weights.toolFailures, value: toolFailures.score });
3376
3360
  if (blacklist) {
3377
3361
  if (blacklist.score === 0) {
3378
3362
  results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
3379
3363
  continue;
3380
3364
  }
3381
- scores.push({ weight: 0.1, value: blacklist.score });
3365
+ scores.push({ weight: weights.blacklist, value: blacklist.score });
3382
3366
  }
3383
3367
  let levelScore = 1;
3384
3368
  if (scores.length > 0) {
3385
3369
  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
3386
- levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
3370
+ levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
3387
3371
  }
3388
3372
  let finalScore = levelScore;
3389
3373
  if (nested.length > 0) {
@@ -3408,7 +3392,13 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
3408
3392
  return results;
3409
3393
  }
3410
3394
  function createTrajectoryScorerCode(options = {}) {
3411
- const { defaults = {} } = options;
3395
+ const { defaults = {}, weights: userWeights = {} } = options;
3396
+ const w = {
3397
+ accuracy: Math.max(0, userWeights.accuracy ?? 0.4),
3398
+ efficiency: Math.max(0, userWeights.efficiency ?? 0.3),
3399
+ toolFailures: Math.max(0, userWeights.toolFailures ?? 0.2),
3400
+ blacklist: Math.max(0, userWeights.blacklist ?? 0.1)
3401
+ };
3412
3402
  return createScorer({
3413
3403
  id: "code-trajectory-scorer",
3414
3404
  name: "Trajectory Scorer",
@@ -3428,7 +3418,6 @@ function createTrajectoryScorerCode(options = {}) {
3428
3418
  { steps: config.steps },
3429
3419
  {
3430
3420
  ordering: config.ordering ?? "relaxed",
3431
- compareStepData: config.compareStepData ?? false,
3432
3421
  allowRepeatedSteps: config.allowRepeatedSteps ?? true
3433
3422
  }
3434
3423
  );
@@ -3448,7 +3437,7 @@ function createTrajectoryScorerCode(options = {}) {
3448
3437
  const toolFailures = analyzeToolFailures(actualTrajectory, {
3449
3438
  maxRetriesPerTool: config.maxRetriesPerTool ?? 2
3450
3439
  });
3451
- const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
3440
+ const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
3452
3441
  return {
3453
3442
  accuracy,
3454
3443
  efficiency,
@@ -3464,16 +3453,16 @@ function createTrajectoryScorerCode(options = {}) {
3464
3453
  }
3465
3454
  const scores = [];
3466
3455
  if (accuracy) {
3467
- scores.push({ weight: 0.4, value: accuracy.score });
3456
+ scores.push({ weight: w.accuracy, value: accuracy.score });
3468
3457
  }
3469
3458
  if (efficiency) {
3470
- scores.push({ weight: 0.3, value: efficiency.score });
3459
+ scores.push({ weight: w.efficiency, value: efficiency.score });
3471
3460
  }
3472
3461
  if (toolFailures && toolFailures.patterns.length > 0) {
3473
- scores.push({ weight: 0.2, value: toolFailures.score });
3462
+ scores.push({ weight: w.toolFailures, value: toolFailures.score });
3474
3463
  }
3475
3464
  if (blacklist) {
3476
- scores.push({ weight: 0.1, value: blacklist.score });
3465
+ scores.push({ weight: w.blacklist, value: blacklist.score });
3477
3466
  }
3478
3467
  if (scores.length === 0 && !nested) {
3479
3468
  return 1;
@@ -3481,7 +3470,7 @@ function createTrajectoryScorerCode(options = {}) {
3481
3470
  let levelScore = 1;
3482
3471
  if (scores.length > 0) {
3483
3472
  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
3484
- levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
3473
+ levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
3485
3474
  }
3486
3475
  if (nested && nested.length > 0) {
3487
3476
  const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);