@mastra/evals 1.2.4 → 1.3.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
- package/dist/chunk-BE5F2OUQ.js.map +1 -0
- package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
- package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
- package/dist/docs/SKILL.md +2 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/reference-evals-rubric.md +113 -0
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/rubric/index.d.ts +71 -0
- package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
- package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
- package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +276 -78
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +203 -6
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +25 -25
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +9 -8
- package/dist/chunk-BULMCHKJ.cjs.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-
|
|
1
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-BE5F2OUQ.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
3
|
import nlp from 'compromise';
|
|
4
4
|
import keyword_extractor from 'keyword-extractor';
|
|
@@ -3100,6 +3100,203 @@ function createPromptAlignmentScorerLLM({
|
|
|
3100
3100
|
});
|
|
3101
3101
|
}
|
|
3102
3102
|
|
|
3103
|
+
// src/scorers/llm/rubric/prompts.ts
|
|
3104
|
+
/**
 * Judge instructions for the rubric scorer.
 *
 * Tells the grading model to evaluate every rubric criterion independently
 * and strictly, treating doubt as "not satisfied". Passed as the `instructions`
 * of the scorer's judge configuration.
 */
const RUBRIC_INSTRUCTIONS = [
  "You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.",
  "",
  "A rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.",
  "",
  "Grading guidelines:",
  "- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.",
  '- A criterion is "satisfied" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.',
  "- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.",
  "- Be concise but specific in your reasoning: say what is present or missing.",
  "- Do not reward effort, intent, or partial progress. Only the actual output counts.",
].join("\n");
|
|
3114
|
+
/**
 * Builds the grading prompt sent to the judge model.
 *
 * @param {object} params
 * @param {string} params.originalTask - Task text shown for context; a
 *   placeholder is used when it is empty.
 * @param {string} params.output - Agent output to grade; a placeholder is used
 *   when it is empty.
 * @param {Array<{criterion: string, required: boolean}>} params.criteria -
 *   Rubric entries, rendered as a numbered list tagged required/optional.
 * @returns {string} The complete prompt, including the expected JSON shape.
 */
function createAnalyzePrompt6({ originalTask, output, criteria }) {
  // One numbered line per criterion, e.g. "1. [required] Mentions cost".
  const describeCriterion = (entry, position) => {
    const tag = entry.required ? "required" : "optional";
    return `${position + 1}. [${tag}] ${entry.criterion}`;
  };
  const renderedCriteria = criteria.map(describeCriterion).join("\n");

  const promptLines = [
    "Grade the agent's output against the rubric below.",
    "",
    "Original task:",
    `${originalTask || "(no task provided)"}`,
    "",
    "Rubric criteria:",
    `${renderedCriteria}`,
    "",
    "Agent output to grade:",
    `${output || "(empty output)"}`,
    "",
    "For every criterion, decide whether the output satisfies it. Preserve the exact criterion text and its required/optional designation in your answer.",
    "",
    "Return your judgement as JSON in this shape:",
    "{",
    '"criteria": [',
    "{",
    '"criterion": "exact criterion text",',
    '"satisfied": true,',
    '"required": true,',
    '"reasoning": "why it is or is not satisfied"',
    "}",
    "],",
    '"overallAssessment": "one or two sentence summary of what passed and what is missing"',
    "}",
  ];
  return promptLines.join("\n");
}
|
|
3146
|
+
/**
 * Renders a human-readable explanation of a rubric verdict.
 *
 * The pass/fail verdict is derived from the analysis itself rather than from
 * the numeric score: the score may have been multiplied by a caller-supplied
 * scale, so comparing it against 1 misreports a passing run whenever the scale
 * is below 1. The gating rule mirrors the scorer: required criteria decide the
 * outcome, and when no criterion is marked required, every criterion does.
 *
 * @param {object} params
 * @param {{criteria: Array<{criterion: string, satisfied: boolean, required: boolean, reasoning: string}>, overallAssessment?: string}} params.analysis -
 *   Judge output to describe. A `score` property may also be passed (existing
 *   callers do); it is ignored for the reason given above.
 * @returns {string} Header, one two-line entry per criterion, the optional
 *   overall assessment, and, when failing, a footer counting what is left.
 */
function formatRubricReason({ analysis }) {
  const { criteria, overallAssessment } = analysis;

  const requiredCriteria = criteria.filter((c) => c.required);
  const hasRequired = requiredCriteria.length > 0;
  const gating = hasRequired ? requiredCriteria : criteria;
  const unmet = gating.filter((c) => !c.satisfied);
  const complete = unmet.length === 0;

  const header = complete
    ? "\u2705 Rubric satisfied: every required criterion is met."
    : "\u274C Rubric not yet satisfied.";

  const lines = criteria.map((c) => {
    const mark = c.satisfied ? "\u2705" : "\u274C";
    const tag = c.required ? "required" : "optional";
    return `${mark} [${tag}] ${c.criterion}\n\u2192 ${c.reasoning}`;
  });

  const assessment = overallAssessment ? `\n\n${overallAssessment}` : "";

  let footer = "";
  if (!complete) {
    // Only say "required" when required criteria are what is gating the result.
    const qualifier = hasRequired ? "unmet required" : "unmet";
    const noun = unmet.length === 1 ? "criterion" : "criteria";
    footer = `\n\nTo finish, address the ${unmet.length} ${qualifier} ${noun} above.`;
  }

  return `${header}\n\n${lines.join("\n")}${assessment}${footer}`;
}
|
|
3166
|
+
|
|
3167
|
+
// src/scorers/llm/rubric/index.ts
|
|
3168
|
+
/**
 * JSON Schema (draft 2020-12) for the judge's structured answer: one verdict
 * object per rubric criterion plus a free-text overall assessment.
 */
const analyzeOutputSchema6 = (() => {
  // Verdict for a single criterion; all four fields are mandatory.
  const criterionVerdict = {
    type: "object",
    properties: {
      criterion: { type: "string" },
      satisfied: { type: "boolean" },
      required: { type: "boolean" },
      reasoning: { type: "string" },
    },
    required: ["criterion", "satisfied", "required", "reasoning"],
  };

  return {
    $schema: "https://json-schema.org/draft/2020-12/schema",
    type: "object",
    properties: {
      criteria: { type: "array", items: criterionVerdict },
      overallAssessment: { type: "string" },
    },
    required: ["criteria", "overallAssessment"],
  };
})();
|
|
3207
|
+
/**
 * Splits a free-text rubric into one required criterion per non-empty line.
 *
 * A leading list marker (`-`, `*`, `•`, `1.`, `1)`) is stripped from each line.
 * The marker is NOT stripped when it is immediately followed by a digit, so
 * criteria that begin with a number keep their text intact: "3.5 seconds max"
 * and "-1 is never returned" are criteria, not list items "5 seconds max" and
 * "1 is never returned".
 *
 * @param {string} rubric - Newline-separated rubric text.
 * @returns {Array<{description: string, required: true}>} Parsed criteria, in
 *   input order; blank lines and marker-only lines are dropped.
 */
function parseRubricString(rubric) {
  // `(?!\d)` keeps decimals ("3.5") and negative numbers ("-1") from being
  // mistaken for a numbered or dashed list marker.
  const listMarker = /^\s*(?:[*•]|-(?!\d)|\d+[.)](?!\d))\s*/;
  return rubric
    .split("\n")
    .map((line) => line.replace(listMarker, "").trim())
    .filter((line) => line.length > 0)
    .map((description) => ({ description, required: true }));
}
|
|
3210
|
+
/**
 * Coerces any accepted rubric form into an array of criterion objects.
 *
 * - Falsy input yields an empty rubric.
 * - A string is parsed line by line via `parseRubricString`.
 * - An array may mix criterion objects and plain strings. Strings become
 *   required criteria; blank strings and non-object entries are dropped, since
 *   downstream code reads `.description` from each entry and would otherwise
 *   render "undefined" or throw on `null`.
 * - Anything else is treated as "no rubric".
 *
 * @param {string | Array<string | {description: string, required?: boolean}> | null | undefined} rubric
 * @returns {Array<{description: string, required?: boolean}>} Normalized criteria.
 */
function normalizeRubric(rubric) {
  if (!rubric) return [];
  if (typeof rubric === "string") return parseRubricString(rubric);
  if (!Array.isArray(rubric)) return [];

  return rubric.flatMap((entry) => {
    if (typeof entry === "string") {
      const description = entry.trim();
      return description.length > 0 ? [{ description, required: true }] : [];
    }
    return entry !== null && typeof entry === "object" ? [entry] : [];
  });
}
|
|
3215
|
+
/**
 * Chooses the rubric to grade a run against.
 *
 * A non-empty rubric configured on the scorer always wins. Otherwise the run
 * is probed for a `rubric` value, first on `requestContext`, then on
 * `additionalContext`, then on `input`; the first value found is normalized.
 *
 * @param {object} params
 * @param {Array} params.staticRubric - Rubric fixed when the scorer was created.
 * @param {object} params.run - Scorer run to probe for a dynamic rubric.
 * @returns {Array} Criteria to grade against; empty when none was found.
 */
function resolveRubric({ staticRubric, run }) {
  if (staticRubric.length > 0) {
    return staticRubric;
  }

  // Probe lazily, in priority order, stopping at the first hit.
  for (const key of ["requestContext", "additionalContext", "input"]) {
    const candidate = pickRubric(run[key]);
    if (candidate != null) {
      return normalizeRubric(candidate);
    }
  }
  return normalizeRubric(undefined);
}
|
|
3223
|
+
/**
 * Reads a `rubric` value from a container.
 *
 * Containers exposing a `get` function (for example a `Map`) are queried with
 * `get("rubric")`; anything else is read through its `rubric` property.
 *
 * @param {unknown} source - Candidate container; non-objects are ignored.
 * @returns {string | Array | undefined} The rubric when it is a string or an
 *   array, otherwise `undefined`.
 */
function pickRubric(source) {
  const isContainer = Boolean(source) && typeof source === "object";
  if (!isContainer) {
    return undefined;
  }

  const { get: accessor } = source;
  const candidate = typeof accessor === "function" ? accessor.call(source, "rubric") : source.rubric;

  const isUsable = typeof candidate === "string" || Array.isArray(candidate);
  return isUsable ? candidate : undefined;
}
|
|
3236
|
+
/**
 * Converts rubric criteria into the shape the prompt builder expects.
 *
 * @param {Array<{description: string, required?: boolean}>} criteria
 * @returns {Array<{criterion: string, required: boolean}>} One entry per
 *   criterion; a criterion is required unless `required` is exactly `false`.
 */
function toCriterionInputs(criteria) {
  const toPromptInput = ({ description, required }) => {
    const isRequired = required !== false;
    return { criterion: description, required: isRequired };
  };
  return criteria.map(toPromptInput);
}
|
|
3239
|
+
/**
 * Extracts the text to grade from a scorer run.
 *
 * Tried in order: the assistant message found in `run.output`, then
 * `run.input.currentText` when it is a string, then `run.output` itself when
 * it is a string.
 *
 * @param {object} run - Scorer run.
 * @returns {string} The text to grade, or an empty string when none is found.
 */
function getOutputText(run) {
  const assistantText = getAssistantMessageFromRunOutput(run.output);
  if (assistantText) {
    return assistantText;
  }

  const input = run.input;
  const hasCurrentText = Boolean(input) && typeof input === "object" && typeof input.currentText === "string";
  if (hasCurrentText) {
    return input.currentText;
  }

  if (typeof run.output === "string") {
    return run.output;
  }
  return "";
}
|
|
3247
|
+
/**
 * Extracts the original task text from a scorer run.
 *
 * A string `run.input.originalTask` is returned as-is (even when empty);
 * otherwise the user message found in `run.input` is used.
 *
 * @param {object} run - Scorer run.
 * @returns {string} The task text, or an empty string when none is found.
 */
function getTaskText(run) {
  const input = run.input;
  const explicitTask = input && typeof input === "object" ? input.originalTask : undefined;
  if (typeof explicitTask === "string") {
    return explicitTask;
  }
  return getUserMessageFromRunInput(run.input) ?? "";
}
|
|
3253
|
+
/**
 * Creates an LLM-judged scorer that grades an output against a rubric.
 *
 * The rubric comes from `criteria` when it is non-empty; otherwise it is looked
 * up on the run (see `resolveRubric`). The score is all-or-nothing: the passing
 * value when every gating criterion is satisfied, `0` otherwise. Required
 * criteria gate the result; when the judge marks none as required, every
 * criterion gates. When there is nothing to grade the run passes by default.
 *
 * Every passing result, including the pass-by-default case, is multiplied by
 * `options.scale`, so the scorer only ever emits `0` or `scale`.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, forwarded to `createScorer`.
 * @param {string | Array<{description: string, required?: boolean}>} [params.criteria] -
 *   Static rubric, as newline-separated text or as criterion objects.
 * @param {{scale?: number}} [params.options] - `scale` multiplies the score
 *   (default 1).
 * @returns The scorer built by `createScorer` with analyze, score and reason
 *   steps attached.
 */
function createRubricScorer({
  model,
  criteria,
  options
}) {
  const scale = options?.scale ?? 1;
  const staticRubric = normalizeRubric(criteria);
  // Single place where a verdict becomes a number, so every branch is scaled alike.
  const toScore = (passed) => (passed ? 1 : 0) * scale;
  return createScorer({
    id: "rubric-scorer",
    name: "Rubric (LLM)",
    description: "Grades an agent output against a rubric of criteria, returning 1 only when every required criterion is satisfied",
    judge: {
      model,
      instructions: RUBRIC_INSTRUCTIONS
    }
  }).analyze({
    description: "Judge the output against each rubric criterion",
    outputSchema: analyzeOutputSchema6,
    createPrompt: ({ run }) => {
      const rubric = resolveRubric({ staticRubric, run });
      if (rubric.length === 0) {
        // No rubric: ask the judge for an empty verdict so later steps can detect it.
        return `No rubric was provided. Return exactly: {"criteria": [], "overallAssessment": "No rubric provided; nothing to grade."}`;
      }
      return createAnalyzePrompt6({
        originalTask: getTaskText(run),
        output: getOutputText(run),
        criteria: toCriterionInputs(rubric)
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis || analysis.criteria.length === 0) {
      // Nothing to grade is a pass, and must carry the same value as any other pass.
      return toScore(true);
    }
    const requiredCriteria = analysis.criteria.filter((c) => c.required);
    // With no required criteria, fall back to gating on all of them.
    const gating = requiredCriteria.length > 0 ? requiredCriteria : analysis.criteria;
    return toScore(gating.every((c) => c.satisfied));
  }).generateReason(({ results, score }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis || analysis.criteria.length === 0) {
      return "No rubric was provided, so the rubric check passed by default.";
    }
    return formatRubricReason({ score, analysis });
  });
}
|
|
3299
|
+
|
|
3103
3300
|
// src/scorers/llm/trajectory/prompts.ts
|
|
3104
3301
|
var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
|
|
3105
3302
|
You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
|
|
@@ -3122,7 +3319,7 @@ OUTPUT REQUIREMENTS:
|
|
|
3122
3319
|
- Use provided JSON schema exactly as specified
|
|
3123
3320
|
- Be consistent in your evaluation standards
|
|
3124
3321
|
`;
|
|
3125
|
-
var
|
|
3322
|
+
var createAnalyzePrompt7 = ({
|
|
3126
3323
|
userInput,
|
|
3127
3324
|
agentResponse,
|
|
3128
3325
|
actualTrajectory,
|
|
@@ -3189,7 +3386,7 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
3189
3386
|
};
|
|
3190
3387
|
|
|
3191
3388
|
// src/scorers/llm/trajectory/index.ts
|
|
3192
|
-
var
|
|
3389
|
+
var analyzeOutputSchema7 = {
|
|
3193
3390
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3194
3391
|
"type": "object",
|
|
3195
3392
|
"properties": {
|
|
@@ -3331,11 +3528,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
3331
3528
|
};
|
|
3332
3529
|
}).analyze({
|
|
3333
3530
|
description: "Analyze the quality and appropriateness of the agent trajectory",
|
|
3334
|
-
outputSchema:
|
|
3531
|
+
outputSchema: analyzeOutputSchema7,
|
|
3335
3532
|
createPrompt: ({ run, results }) => {
|
|
3336
3533
|
const userInput = getUserMessageFromRunInput(run.input) ?? "";
|
|
3337
3534
|
const agentResponse = getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
|
|
3338
|
-
return
|
|
3535
|
+
return createAnalyzePrompt7({
|
|
3339
3536
|
userInput,
|
|
3340
3537
|
agentResponse,
|
|
3341
3538
|
actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
|
|
@@ -4096,6 +4293,6 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
4096
4293
|
});
|
|
4097
4294
|
}
|
|
4098
4295
|
|
|
4099
|
-
export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
|
|
4296
|
+
export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createRubricScorer, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
|
|
4100
4297
|
//# sourceMappingURL=index.js.map
|
|
4101
4298
|
//# sourceMappingURL=index.js.map
|