@mastra/evals 1.2.0-alpha.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +97 -0
- package/dist/{chunk-XRUR5PBK.cjs → chunk-AY4K3J4R.cjs} +44 -95
- package/dist/chunk-AY4K3J4R.cjs.map +1 -0
- package/dist/{chunk-EVBNIL5M.js → chunk-X4MKZ735.js} +44 -95
- package/dist/chunk-X4MKZ735.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/reference-evals-scorer-utils.md +9 -5
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +29 -15
- package/dist/scorers/code/trajectory/index.d.ts +18 -1
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +110 -121
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +39 -50
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +23 -23
- package/dist/scorers/utils.d.ts +1 -4
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +7 -7
- package/dist/chunk-EVBNIL5M.js.map +0 -1
- package/dist/chunk-XRUR5PBK.cjs.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-EVBNIL5M.js';
|
|
1
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import nlp from 'compromise';
|
|
@@ -2735,7 +2735,8 @@ function formatExpectedSteps(steps, indent = 0) {
|
|
|
2735
2735
|
const prefix = " ".repeat(indent);
|
|
2736
2736
|
return steps.map((step, i) => {
|
|
2737
2737
|
const typeStr = step.stepType ? `[${step.stepType}] ` : "";
|
|
2738
|
-
const
|
|
2738
|
+
const { name: _, stepType: _t, children: _c, ...fields } = step;
|
|
2739
|
+
const dataStr = Object.keys(fields).length > 0 ? ` (${JSON.stringify(fields)})` : "";
|
|
2739
2740
|
let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
|
|
2740
2741
|
if (step.children?.steps && step.children.steps.length > 0) {
|
|
2741
2742
|
line += `
|
|
@@ -2764,22 +2765,15 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2764
2765
|
if (Array.isArray(staticExpectedTrajectory)) {
|
|
2765
2766
|
expectedSteps = staticExpectedTrajectory;
|
|
2766
2767
|
} else {
|
|
2767
|
-
|
|
2768
|
-
const
|
|
2769
|
-
const
|
|
2770
|
-
if (
|
|
2771
|
-
|
|
2772
|
-
if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
|
|
2773
|
-
data.output = s.toolResult;
|
|
2774
|
-
if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
|
|
2775
|
-
if (Object.keys(data).length > 0) result.data = data;
|
|
2776
|
-
if (s.children && s.children.length > 0) {
|
|
2777
|
-
result.children = {
|
|
2778
|
-
steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
|
|
2779
|
-
};
|
|
2768
|
+
const toExpectedStep = (s) => {
|
|
2769
|
+
const { durationMs: _, metadata: _m, children, ...rest } = s;
|
|
2770
|
+
const result = rest;
|
|
2771
|
+
if (children && children.length > 0) {
|
|
2772
|
+
result.children = { steps: children.map(toExpectedStep) };
|
|
2780
2773
|
}
|
|
2781
2774
|
return result;
|
|
2782
|
-
}
|
|
2775
|
+
};
|
|
2776
|
+
expectedSteps = staticExpectedTrajectory.steps.map(toExpectedStep);
|
|
2783
2777
|
}
|
|
2784
2778
|
} else if (run.expectedTrajectory) {
|
|
2785
2779
|
const expectation = run.expectedTrajectory;
|
|
@@ -3226,18 +3220,11 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3226
3220
|
});
|
|
3227
3221
|
}
|
|
3228
3222
|
function trajectoryStepToExpectedStep(step) {
|
|
3229
|
-
const
|
|
3230
|
-
const
|
|
3231
|
-
if (
|
|
3232
|
-
if (step.toolArgs !== void 0) data.input = step.toolArgs;
|
|
3233
|
-
if (step.toolResult !== void 0) data.output = step.toolResult;
|
|
3234
|
-
} else if (step.stepType === "workflow_step") {
|
|
3235
|
-
if (step.output !== void 0) data.output = step.output;
|
|
3236
|
-
}
|
|
3237
|
-
if (Object.keys(data).length > 0) result.data = data;
|
|
3238
|
-
if (step.children && step.children.length > 0) {
|
|
3223
|
+
const { durationMs: _, metadata: _m, children, ...rest } = step;
|
|
3224
|
+
const result = rest;
|
|
3225
|
+
if (children && children.length > 0) {
|
|
3239
3226
|
result.children = {
|
|
3240
|
-
steps:
|
|
3227
|
+
steps: children.map(trajectoryStepToExpectedStep)
|
|
3241
3228
|
};
|
|
3242
3229
|
}
|
|
3243
3230
|
return result;
|
|
@@ -3248,15 +3235,14 @@ function expectationToExpectedSteps(expectation) {
|
|
|
3248
3235
|
}
|
|
3249
3236
|
function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
3250
3237
|
const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
|
|
3251
|
-
const { ordering
|
|
3252
|
-
const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
|
|
3238
|
+
const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;
|
|
3253
3239
|
const staticExpectedSteps = staticExpectedTrajectory ? Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0] || false) ? staticExpectedTrajectory : "steps" in staticExpectedTrajectory ? staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep) : void 0 : void 0;
|
|
3254
3240
|
const getDescription = () => {
|
|
3255
3241
|
if (staticExpectedSteps) {
|
|
3256
3242
|
const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
|
|
3257
|
-
return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
|
|
3243
|
+
return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${ordering} ordering)`;
|
|
3258
3244
|
}
|
|
3259
|
-
return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
|
|
3245
|
+
return `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;
|
|
3260
3246
|
};
|
|
3261
3247
|
return createScorer({
|
|
3262
3248
|
id: "code-trajectory-accuracy-scorer",
|
|
@@ -3281,15 +3267,13 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3281
3267
|
};
|
|
3282
3268
|
}
|
|
3283
3269
|
const itemExpectation = run.expectedTrajectory;
|
|
3284
|
-
const effectiveOrdering = itemExpectation?.ordering ?? resolvedOrdering;
|
|
3285
|
-
const effectiveCompareData = itemExpectation?.compareStepData ?? compareStepData;
|
|
3270
|
+
const effectiveOrdering = itemExpectation?.ordering ?? ordering;
|
|
3286
3271
|
const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
|
|
3287
3272
|
const comparison = compareTrajectories(
|
|
3288
3273
|
actualTrajectory,
|
|
3289
3274
|
{ steps: resolvedExpectedSteps },
|
|
3290
3275
|
{
|
|
3291
3276
|
ordering: effectiveOrdering,
|
|
3292
|
-
compareStepData: effectiveCompareData,
|
|
3293
3277
|
allowRepeatedSteps: effectiveAllowRepeated
|
|
3294
3278
|
}
|
|
3295
3279
|
);
|
|
@@ -3308,7 +3292,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3308
3292
|
return preprocessResult.comparison.score;
|
|
3309
3293
|
});
|
|
3310
3294
|
}
|
|
3311
|
-
function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
3295
|
+
function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
|
|
3312
3296
|
const results = [];
|
|
3313
3297
|
const matchedIndices = /* @__PURE__ */ new Set();
|
|
3314
3298
|
for (const expectedStep of expectedSteps) {
|
|
@@ -3348,7 +3332,6 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
|
3348
3332
|
{ steps: childConfig.steps },
|
|
3349
3333
|
{
|
|
3350
3334
|
ordering: childConfig.ordering ?? "relaxed",
|
|
3351
|
-
compareStepData: childConfig.compareStepData ?? false,
|
|
3352
3335
|
allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
|
|
3353
3336
|
}
|
|
3354
3337
|
);
|
|
@@ -3368,22 +3351,23 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
|
3368
3351
|
const toolFailures = analyzeToolFailures(childTrajectory, {
|
|
3369
3352
|
maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
|
|
3370
3353
|
});
|
|
3371
|
-
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
|
|
3354
|
+
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
|
|
3372
3355
|
const scores = [];
|
|
3373
|
-
if (accuracy) scores.push({ weight:
|
|
3374
|
-
if (efficiency) scores.push({ weight:
|
|
3375
|
-
if (toolFailures && toolFailures.patterns.length > 0)
|
|
3356
|
+
if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
|
|
3357
|
+
if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
|
|
3358
|
+
if (toolFailures && toolFailures.patterns.length > 0)
|
|
3359
|
+
scores.push({ weight: weights.toolFailures, value: toolFailures.score });
|
|
3376
3360
|
if (blacklist) {
|
|
3377
3361
|
if (blacklist.score === 0) {
|
|
3378
3362
|
results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
|
|
3379
3363
|
continue;
|
|
3380
3364
|
}
|
|
3381
|
-
scores.push({ weight:
|
|
3365
|
+
scores.push({ weight: weights.blacklist, value: blacklist.score });
|
|
3382
3366
|
}
|
|
3383
3367
|
let levelScore = 1;
|
|
3384
3368
|
if (scores.length > 0) {
|
|
3385
3369
|
const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
|
|
3386
|
-
levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
|
|
3370
|
+
levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
|
|
3387
3371
|
}
|
|
3388
3372
|
let finalScore = levelScore;
|
|
3389
3373
|
if (nested.length > 0) {
|
|
@@ -3408,7 +3392,13 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
|
3408
3392
|
return results;
|
|
3409
3393
|
}
|
|
3410
3394
|
function createTrajectoryScorerCode(options = {}) {
|
|
3411
|
-
const { defaults = {} } = options;
|
|
3395
|
+
const { defaults = {}, weights: userWeights = {} } = options;
|
|
3396
|
+
const w = {
|
|
3397
|
+
accuracy: Math.max(0, userWeights.accuracy ?? 0.4),
|
|
3398
|
+
efficiency: Math.max(0, userWeights.efficiency ?? 0.3),
|
|
3399
|
+
toolFailures: Math.max(0, userWeights.toolFailures ?? 0.2),
|
|
3400
|
+
blacklist: Math.max(0, userWeights.blacklist ?? 0.1)
|
|
3401
|
+
};
|
|
3412
3402
|
return createScorer({
|
|
3413
3403
|
id: "code-trajectory-scorer",
|
|
3414
3404
|
name: "Trajectory Scorer",
|
|
@@ -3428,7 +3418,6 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3428
3418
|
{ steps: config.steps },
|
|
3429
3419
|
{
|
|
3430
3420
|
ordering: config.ordering ?? "relaxed",
|
|
3431
|
-
compareStepData: config.compareStepData ?? false,
|
|
3432
3421
|
allowRepeatedSteps: config.allowRepeatedSteps ?? true
|
|
3433
3422
|
}
|
|
3434
3423
|
);
|
|
@@ -3448,7 +3437,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3448
3437
|
const toolFailures = analyzeToolFailures(actualTrajectory, {
|
|
3449
3438
|
maxRetriesPerTool: config.maxRetriesPerTool ?? 2
|
|
3450
3439
|
});
|
|
3451
|
-
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
|
|
3440
|
+
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
|
|
3452
3441
|
return {
|
|
3453
3442
|
accuracy,
|
|
3454
3443
|
efficiency,
|
|
@@ -3464,16 +3453,16 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3464
3453
|
}
|
|
3465
3454
|
const scores = [];
|
|
3466
3455
|
if (accuracy) {
|
|
3467
|
-
scores.push({ weight:
|
|
3456
|
+
scores.push({ weight: w.accuracy, value: accuracy.score });
|
|
3468
3457
|
}
|
|
3469
3458
|
if (efficiency) {
|
|
3470
|
-
scores.push({ weight:
|
|
3459
|
+
scores.push({ weight: w.efficiency, value: efficiency.score });
|
|
3471
3460
|
}
|
|
3472
3461
|
if (toolFailures && toolFailures.patterns.length > 0) {
|
|
3473
|
-
scores.push({ weight:
|
|
3462
|
+
scores.push({ weight: w.toolFailures, value: toolFailures.score });
|
|
3474
3463
|
}
|
|
3475
3464
|
if (blacklist) {
|
|
3476
|
-
scores.push({ weight:
|
|
3465
|
+
scores.push({ weight: w.blacklist, value: blacklist.score });
|
|
3477
3466
|
}
|
|
3478
3467
|
if (scores.length === 0 && !nested) {
|
|
3479
3468
|
return 1;
|
|
@@ -3481,7 +3470,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3481
3470
|
let levelScore = 1;
|
|
3482
3471
|
if (scores.length > 0) {
|
|
3483
3472
|
const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
|
|
3484
|
-
levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
|
|
3473
|
+
levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
|
|
3485
3474
|
}
|
|
3486
3475
|
if (nested && nested.length > 0) {
|
|
3487
3476
|
const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
|