npm - @mastra/evals - Versions diffs - 1.2.4 → 1.3.0-alpha.0 - Mend

@mastra/evals 1.2.4 → 1.3.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/CHANGELOG.md +32 -0
package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
package/dist/chunk-BE5F2OUQ.js.map +1 -0
package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
package/dist/docs/SKILL.md +2 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/reference-evals-rubric.md +113 -0
package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
package/dist/scorers/llm/index.d.ts +1 -0
package/dist/scorers/llm/index.d.ts.map +1 -1
package/dist/scorers/llm/rubric/index.d.ts +71 -0
package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
package/dist/scorers/prebuilt/index.cjs +276 -78
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +203 -6
package/dist/scorers/prebuilt/index.js.map +1 -1
package/dist/scorers/utils.cjs +25 -25
package/dist/scorers/utils.d.ts.map +1 -1
package/dist/scorers/utils.js +1 -1
package/package.json +9 -8
package/dist/chunk-BULMCHKJ.cjs.map +0 -1

package/dist/scorers/prebuilt/index.js (changed)

@@ -1,4 +1,4 @@
-import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-XOXUFZEG.js';
+import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-BE5F2OUQ.js';
 import { createScorer } from '@mastra/core/evals';
 import nlp from 'compromise';
 import keyword_extractor from 'keyword-extractor';
@@ -3100,6 +3100,203 @@ function createPromptAlignmentScorerLLM({
   });
 }
+// src/scorers/llm/rubric/prompts.ts -- RUBRIC_INSTRUCTIONS is the grading-policy text that createRubricScorer passes to the judge as judge.instructions (strict, per-criterion, evidence-only grading).
+var RUBRIC_INSTRUCTIONS = `You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.
+A rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.
+Grading guidelines:
+- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.
+- A criterion is "satisfied" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.
+- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.
+- Be concise but specific in your reasoning: say what is present or missing.
+- Do not reward effort, intent, or partial progress. Only the actual output counts.`;
+function createAnalyzePrompt6({ // Builds the prompt that asks the judge to grade one output against a rubric.
+  originalTask, // task text shown for context; a falsy value is rendered as the placeholder (no task provided)
+  output, // text being graded; a falsy value is rendered as the placeholder (empty output)
+  criteria // array of objects read as c.criterion (text) and c.required (truthy gives the required tag, otherwise optional)
+}) { // returns one prompt string: task, numbered criteria, output, then the JSON shape the judge must answer in
+  const renderedCriteria = criteria.map((c, i) => `${i + 1}. [${c.required ? "required" : "optional"}] ${c.criterion}`).join("\n");
+  return `Grade the agent's output against the rubric below.
+Original task:
+${originalTask || "(no task provided)"}
+Rubric criteria:
+${renderedCriteria}
+Agent output to grade:
+${output || "(empty output)"}
+For every criterion, decide whether the output satisfies it. Preserve the exact criterion text and its required/optional designation in your answer.
+Return your judgement as JSON in this shape:
+{
+  "criteria": [
+    {
+      "criterion": "exact criterion text",
+      "satisfied": true,
+      "required": true,
+      "reasoning": "why it is or is not satisfied"
+    }
+  ],
+  "overallAssessment": "one or two sentence summary of what passed and what is missing"
+}`;
+}
+function formatRubricReason({ score, analysis }) { // Renders the human-readable reason: header, one entry per criterion, then the optional assessment and footer.
+  const complete = score >= 1; // NOTE(review): createRubricScorer returns 1 * scale on a pass, so with a scale below 1 this is never true -- confirm intended
+  const header = complete ? "\u2705 Rubric satisfied: every required criterion is met." : "\u274C Rubric not yet satisfied.";
+  const lines = analysis.criteria.map((c) => { // each entry: verdict mark, required/optional tag, criterion text, then the reasoning on an indented arrow line
+    const mark = c.satisfied ? "\u2705" : "\u274C";
+    const tag = c.required ? "required" : "optional";
+    return `${mark} [${tag}] ${c.criterion}
+   \u2192 ${c.reasoning}`;
+  });
+  const unmetRequired = analysis.criteria.filter((c) => c.required && !c.satisfied); // used only for the footer count; NOTE(review): failing criteria without a truthy required flag are not counted, so the footer can report 0 -- verify
+  const footer = complete ? "" : `
+To finish, address the ${unmetRequired.length} unmet required ${unmetRequired.length === 1 ? "criterion" : "criteria"} above.`;
+  const assessment = analysis.overallAssessment ? `
+${analysis.overallAssessment}` : "";
+  return `${header}
+${lines.join("\n")}${assessment}${footer}`;
+}
+// src/scorers/llm/rubric/index.ts
+var analyzeOutputSchema6 = { // JSON Schema (draft 2020-12) for the analyze step result; passed as outputSchema in createRubricScorer
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "criteria": { // one verdict object per graded criterion
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "criterion": {
+            "type": "string"
+          },
+          "satisfied": {
+            "type": "boolean"
+          },
+          "required": {
+            "type": "boolean"
+          },
+          "reasoning": {
+            "type": "string"
+          }
+        },
+        "required": [ // all four verdict fields are mandatory
+          "criterion",
+          "satisfied",
+          "required",
+          "reasoning"
+        ]
+      }
+    },
+    "overallAssessment": { // free-text summary string
+      "type": "string"
+    }
+  },
+  "required": [ // both top-level fields are mandatory
+    "criteria",
+    "overallAssessment"
+  ]
+};
+function parseRubricString(rubric) { // Splits a newline-separated rubric into criteria: strips one leading bullet or number marker per line, trims, drops blank lines; every entry is marked required.
+  return rubric.split("\n").map((line) => line.replace(/^\s*(?:[-*•]|\d+[.)])\s*/, "").trim()).filter((line) => line.length > 0).map((description) => ({ description, required: true }));
+} // NOTE(review): the marker pattern does not require whitespace after it, so a line starting with 3.5 or -5 loses its leading 3. or - ; confirm that is acceptable
+function normalizeRubric(rubric) { // Coerces a rubric (string, array, or nothing) into a criteria array.
+  if (!rubric) return []; // any falsy rubric, including the empty string, means no criteria
+  if (typeof rubric === "string") return parseRubricString(rubric); // strings are parsed line by line
+  return rubric; // anything else is returned as-is; NOTE(review): items are not validated, so an array of plain strings would reach toCriterionInputs without a description field -- verify callers
+}
+function resolveRubric({ // Chooses the rubric for one run: the static rubric if present, otherwise one found on the run.
+  staticRubric, // already-normalized criteria array fixed at scorer creation
+  run // scorer run; requestContext, additionalContext and input are probed for a rubric
+}) {
+  if (staticRubric.length > 0) return staticRubric; // a non-empty static rubric always wins
+  const dynamic = pickRubric(run.requestContext) ?? pickRubric(run.additionalContext) ?? pickRubric(run.input); // first source that yields a rubric wins, in this order
+  return normalizeRubric(dynamic); // yields an empty array when no source provided a rubric
+}
+function pickRubric(source) { // Reads a rubric entry from a source object; returns it only if it is a string or an array, otherwise undefined.
+  if (!source || typeof source !== "object") return void 0; // null, undefined and primitives carry no rubric
+  let value;
+  const getter = source.get;
+  if (typeof getter === "function") { // sources exposing a get method are queried through it (presumably Map-like containers -- verify)
+    value = getter.call(source, "rubric");
+  } else { // plain objects are read by property
+    value = source.rubric;
+  }
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) return value; // array contents are not inspected here
+  return void 0; // any other value type is ignored
+}
+function toCriterionInputs(criteria) { // Maps rubric criteria to the shape the prompt builder reads: description becomes criterion, and required is true unless it is exactly false.
+  return criteria.map((c) => ({ criterion: c.description, required: c.required !== false }));
+}
+function getOutputText(run) { // Picks the text to grade for a run, trying three sources in order.
+  const fromOutput = getAssistantMessageFromRunOutput(run.output); // first choice: whatever the shared helper extracts from run.output
+  if (fromOutput) return fromOutput; // a falsy extraction falls through to the next source
+  if (run.input && typeof run.input === "object" && typeof run.input.currentText === "string") { // second choice: a string currentText on the input object
+    return run.input.currentText;
+  }
+  return typeof run.output === "string" ? run.output : ""; // last resort: a raw string output, else the empty string
+}
+function getTaskText(run) { // Picks the original-task text shown to the judge for context.
+  if (run.input && typeof run.input === "object" && typeof run.input.originalTask === "string") { // an explicit string originalTask on the input takes precedence
+    return run.input.originalTask;
+  }
+  return getUserMessageFromRunInput(run.input) ?? ""; // otherwise use the shared helper result, or the empty string when it is null or undefined
+}
+function createRubricScorer({ // Builds an LLM-judged scorer that passes only when every gating rubric criterion is satisfied.
+  model, // forwarded unchanged as judge.model
+  criteria, // static rubric: a string, an array of criteria, or nothing (then the rubric is looked up on each run)
+  options // optional; only options.scale is read here
+}) {
+  const scale = options?.scale ?? 1; // multiplier for a passing score; defaults to 1 when scale is null or undefined
+  const staticRubric = normalizeRubric(criteria); // normalized once at creation, not per run
+  return createScorer({
+    id: "rubric-scorer",
+    name: "Rubric (LLM)",
+    description: "Grades an agent output against a rubric of criteria, returning 1 only when every required criterion is satisfied",
+    judge: {
+      model,
+      instructions: RUBRIC_INSTRUCTIONS
+    }
+  }).analyze({
+    description: "Judge the output against each rubric criterion",
+    outputSchema: analyzeOutputSchema6,
+    createPrompt: ({ run }) => { // the rubric is resolved per run, so a run-supplied rubric can differ between runs
+      const rubric = resolveRubric({ staticRubric, run });
+      if (rubric.length === 0) { // nothing to grade: ask the judge for an empty criteria list instead of a grading prompt
+        return `No rubric was provided. Return exactly: {"criteria": [], "overallAssessment": "No rubric provided; nothing to grade."}`;
+      }
+      return createAnalyzePrompt6({
+        originalTask: getTaskText(run),
+        output: getOutputText(run),
+        criteria: toCriterionInputs(rubric)
+      });
+    }
+  }).generateScore(({ results }) => {
+    const analysis = results.analyzeStepResult;
+    if (!analysis || analysis.criteria.length === 0) { // a missing analysis or an empty criteria list counts as a pass
+      return 1; // NOTE(review): not multiplied by scale, unlike the graded path below -- confirm intended
+    }
+    const requiredCriteria = analysis.criteria.filter((c) => c.required); // required flags are read from the analyze result, not from the rubric that was sent
+    const gating = requiredCriteria.length > 0 ? requiredCriteria : analysis.criteria; // when nothing is flagged required, every criterion gates the score
+    const allSatisfied = gating.every((c) => c.satisfied);
+    return (allSatisfied ? 1 : 0) * scale; // all-or-nothing: scale on a full pass, 0 otherwise
+  }).generateReason(({ results, score }) => {
+    const analysis = results.analyzeStepResult;
+    if (!analysis || analysis.criteria.length === 0) { // mirrors the pass-by-default branch of generateScore
+      return "No rubric was provided, so the rubric check passed by default.";
+    }
+    return formatRubricReason({ score, analysis });
+  });
+}
 // src/scorers/llm/trajectory/prompts.ts
 var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
 You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
@@ -3122,7 +3319,7 @@ OUTPUT REQUIREMENTS:
 - Use provided JSON schema exactly as specified
 - Be consistent in your evaluation standards
 `;
-var createAnalyzePrompt6 = ({
+var createAnalyzePrompt7 = ({
   userInput,
   agentResponse,
   actualTrajectory,
@@ -3189,7 +3386,7 @@ Provide a single, concise sentence explaining why this score was given.
 };
 // src/scorers/llm/trajectory/index.ts
-var analyzeOutputSchema6 = {
+var analyzeOutputSchema7 = {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "type": "object",
   "properties": {
@@ -3331,11 +3528,11 @@ function createTrajectoryAccuracyScorerLLM({
     };
   }).analyze({
     description: "Analyze the quality and appropriateness of the agent trajectory",
-    outputSchema: analyzeOutputSchema6,
+    outputSchema: analyzeOutputSchema7,
     createPrompt: ({ run, results }) => {
       const userInput = getUserMessageFromRunInput(run.input) ?? "";
       const agentResponse = getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
-      return createAnalyzePrompt6({
+      return createAnalyzePrompt7({
         userInput,
         agentResponse,
         actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
@@ -4096,6 +4293,6 @@ function createTrajectoryScorerCode(options = {}) {
   });
 }
-export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
+export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createRubricScorer, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map