npm - @mastra/evals - Versions diffs - 1.2.4 → 1.3.0-alpha.0 - Mend

@mastra/evals 1.2.4 → 1.3.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/CHANGELOG.md +32 -0
package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
package/dist/chunk-BE5F2OUQ.js.map +1 -0
package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
package/dist/docs/SKILL.md +2 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/reference-evals-rubric.md +113 -0
package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
package/dist/scorers/llm/index.d.ts +1 -0
package/dist/scorers/llm/index.d.ts.map +1 -1
package/dist/scorers/llm/rubric/index.d.ts +71 -0
package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
package/dist/scorers/prebuilt/index.cjs +276 -78
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +203 -6
package/dist/scorers/prebuilt/index.js.map +1 -1
package/dist/scorers/utils.cjs +25 -25
package/dist/scorers/utils.d.ts.map +1 -1
package/dist/scorers/utils.js +1 -1
package/package.json +9 -8
package/dist/chunk-BULMCHKJ.cjs.map +0 -1

package/dist/scorers/prebuilt/index.cjs CHANGED

@@ -1,6 +1,6 @@
 'use strict';
-var chunkBULMCHKJ_cjs = require('../../chunk-BULMCHKJ.cjs');
+var chunkUNQXHPOD_cjs = require('../../chunk-UNQXHPOD.cjs');
 var evals = require('@mastra/core/evals');
 var nlp = require('compromise');
 var keyword_extractor = require('keyword-extractor');
@@ -250,7 +250,7 @@ function createAnswerRelevancyScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: extractOutputSchema,
     createPrompt: ({ run }) => {
-      const assistantMessage = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const assistantMessage = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       return createExtractPrompt(assistantMessage);
     }
   }).analyze({
@@ -283,7 +283,7 @@ function createAnswerRelevancyScorer({
       ]
     },
     createPrompt: ({ run, results }) => {
-      const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
       return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
     }
   }).generateScore(({ results }) => {
@@ -300,13 +300,13 @@ function createAnswerRelevancyScorer({
       }
     }
     const score = relevancyCount / numberOfResults;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * options.scale);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * options.scale);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       return createReasonPrompt({
-        input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         score,
         results: results.analyzeStepResult.results,
         scale: options.scale
@@ -581,7 +581,7 @@ function createAnswerSimilarityScorer({
           groundTruth: ""
         });
       }
-      const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createExtractPrompt2({
         output,
@@ -639,14 +639,14 @@ function createAnswerSimilarityScorer({
     );
     score -= extraInfoPenalty;
     score = Math.max(0, Math.min(1, score));
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * mergedOptions.scale);
   }).generateReason({
     description: "Generate explanation of similarity score",
     createPrompt: ({ run, results, score }) => {
       if (!run.groundTruth) {
         return "No ground truth was provided for comparison. Score is 0 by default.";
       }
-      const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createReasonPrompt2({
         output,
@@ -848,7 +848,7 @@ function createFaithfulnessScorer({
       ]
     },
     createPrompt: ({ run }) => {
-      const prompt = createFaithfulnessExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+      const prompt = createFaithfulnessExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -895,13 +895,13 @@ function createFaithfulnessScorer({
       return 0;
     }
     const score = supportedClaims / totalClaims * (options?.scale || 1);
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       const prompt = createFaithfulnessReasonPrompt({
-        input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context: options?.context ?? getToolInvocationContext(run.output),
         score,
         scale: options?.scale || 1,
@@ -1046,7 +1046,7 @@ function createBiasScorer({ model, options }) {
         "opinions"
       ]
     },
-    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
+    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: {
@@ -1078,7 +1078,7 @@ function createBiasScorer({ model, options }) {
     },
     createPrompt: ({ run, results }) => {
       const prompt = createBiasAnalyzePrompt({
-        output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         opinions: results.preprocessStepResult?.opinions || []
       });
       return prompt;
@@ -1089,7 +1089,7 @@ function createBiasScorer({ model, options }) {
     }
     const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
     const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ score, results }) => {
@@ -1320,7 +1320,7 @@ function createHallucinationScorer({
       ]
     },
     createPrompt: ({ run }) => {
-      const prompt = createHallucinationExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+      const prompt = createHallucinationExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -1376,7 +1376,7 @@ function createHallucinationScorer({
       return 0;
     }
     const score = contradictedStatements / totalStatements * (options?.scale || 1);
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: async ({ run, results, score }) => {
@@ -1387,8 +1387,8 @@ function createHallucinationScorer({
         context = options?.context ?? [];
       }
       const prompt = createHallucinationReasonPrompt({
-        input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context,
         score,
         scale: options?.scale || 1,
@@ -1528,8 +1528,8 @@ function createToxicityScorer({
     },
     createPrompt: ({ run }) => {
       const prompt = createToxicityAnalyzePrompt({
-        input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
+        input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
       });
       return prompt;
     }
@@ -1545,7 +1545,7 @@ function createToxicityScorer({
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ results, score }) => {
@@ -1706,7 +1706,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } = chunkBULMCHKJ_cjs.extractToolCalls(run.output);
+    const { tools: actualTools, toolCallInfos } = chunkUNQXHPOD_cjs.extractToolCalls(run.output);
     return {
       actualTools,
       hasToolCalls: actualTools.length > 0,
@@ -1716,8 +1716,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     description: "Analyze the appropriateness of tool selections",
     outputSchema: analyzeOutputSchema2,
     createPrompt: ({ run, results }) => {
-      const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const toolsCalled = results.preprocessStepResult?.actualTools || [];
       return createAnalyzePrompt2({
         userInput,
@@ -1734,11 +1734,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     }
     const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
     const totalToolCalls = evaluations.length;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
   }).generateReason({
     description: "Generate human-readable explanation of tool selection evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const evaluations = results.analyzeStepResult?.evaluations || [];
       const missingTools = results.analyzeStepResult?.missingTools || [];
       return createReasonPrompt3({
@@ -1968,7 +1968,7 @@ var getContext = ({
   output,
   options
 }) => {
-  if (options.contextExtractor && chunkBULMCHKJ_cjs.isScorerRunInputForAgent(input) && chunkBULMCHKJ_cjs.isScorerRunOutputForAgent(output)) {
+  if (options.contextExtractor && chunkUNQXHPOD_cjs.isScorerRunInputForAgent(input) && chunkUNQXHPOD_cjs.isScorerRunOutputForAgent(output)) {
     return options.contextExtractor(input, output);
   }
   return options.context ?? [];
@@ -1996,8 +1996,8 @@ function createContextRelevanceScorerLLM({
     description: "Analyze the relevance and utility of provided context",
     outputSchema: analyzeOutputSchema3,
     createPrompt: ({ run }) => {
-      const userQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = getContext({ input: run.input, output: run.output, options });
       if (context.length === 0) {
         return createAnalyzePrompt3({
@@ -2045,11 +2045,11 @@ function createContextRelevanceScorerLLM({
     const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
     const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
     const scaledScore = finalScore * (options.scale || 1);
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(scaledScore);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(scaledScore);
   }).generateReason({
     description: "Generate human-readable explanation of context relevance evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const context = getContext({ input: run.input, output: run.output, options });
       if (context.length === 0) {
         return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -2224,7 +2224,7 @@ var getContext2 = ({
   output,
   options
 }) => {
-  if (options.contextExtractor && chunkBULMCHKJ_cjs.isScorerRunInputForAgent(input) && chunkBULMCHKJ_cjs.isScorerRunOutputForAgent(output)) {
+  if (options.contextExtractor && chunkUNQXHPOD_cjs.isScorerRunInputForAgent(input) && chunkUNQXHPOD_cjs.isScorerRunOutputForAgent(output)) {
     return options.contextExtractor(input, output);
   }
   return options.context ?? [];
@@ -2252,8 +2252,8 @@ function createContextPrecisionScorer({
     description: "Evaluate the relevance of each context piece for generating the expected output",
     outputSchema: contextRelevanceOutputSchema,
     createPrompt: ({ run }) => {
-      const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = getContext2({ input: run.input, output: run.output, options });
       if (context.length === 0) {
         throw new Error("No context available for evaluation");
@@ -2286,12 +2286,12 @@ function createContextPrecisionScorer({
     }
     const map = sumPrecision / relevantCount;
     const score = map * (options.scale || 1);
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the context precision results",
     createPrompt: ({ run, results, score }) => {
-      const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = getContext2({ input: run.input, output: run.output, options });
       return createContextPrecisionReasonPrompt({
         input,
@@ -2589,8 +2589,8 @@ function createNoiseSensitivityScorerLLM({
     description: "Analyze the impact of noise on agent response quality",
     outputSchema: analyzeOutputSchema4,
     createPrompt: ({ run }) => {
-      const originalQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const noisyResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const originalQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const noisyResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (!originalQuery || !noisyResponse) {
         throw new Error("Both original query and noisy response are required for evaluation");
       }
@@ -2633,11 +2633,11 @@ function createNoiseSensitivityScorerLLM({
     const majorIssues = analysisResult.majorIssues || [];
     const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
     finalScore = Math.max(0, finalScore - issuesPenalty);
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(finalScore);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of noise sensitivity evaluation",
     createPrompt: ({ run, results, score }) => {
-      const originalQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const originalQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const analysisResult = results.analyzeStepResult;
       if (!analysisResult) {
         throw new Error("Analysis step failed to produce results for reason generation");
@@ -3049,9 +3049,9 @@ function createPromptAlignmentScorerLLM({
     description: "Analyze prompt-response alignment across multiple dimensions",
     outputSchema: analyzeOutputSchema5,
     createPrompt: ({ run }) => {
-      const userPrompt = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const systemPrompt = chunkBULMCHKJ_cjs.getCombinedSystemPrompt(run.input) ?? "";
-      const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userPrompt = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkUNQXHPOD_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (evaluationMode === "user" && !userPrompt) {
         throw new Error("User prompt is required for user prompt alignment scoring");
       }
@@ -3087,12 +3087,12 @@ function createPromptAlignmentScorerLLM({
       weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
     }
     const finalScore = weightedScore * scale;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(finalScore);
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of prompt alignment evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userPrompt = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const systemPrompt = chunkBULMCHKJ_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const userPrompt = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkUNQXHPOD_cjs.getCombinedSystemPrompt(run.input) ?? "";
       const analysis = results.analyzeStepResult;
       if (!analysis) {
         return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -3109,6 +3109,203 @@ function createPromptAlignmentScorerLLM({
   });
 }
+// src/scorers/llm/rubric/prompts.ts
+var RUBRIC_INSTRUCTIONS = `You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.
+A rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.
+Grading guidelines:
+- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.
+- A criterion is "satisfied" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.
+- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.
+- Be concise but specific in your reasoning: say what is present or missing.
+- Do not reward effort, intent, or partial progress. Only the actual output counts.`; // Grading instructions for the judge model; createRubricScorer passes this constant as judge.instructions.
+function createAnalyzePrompt6({ // Builds the analyze-step prompt that asks the judge to grade one output against every rubric criterion.
+  originalTask, // task text shown for context; a falsy value is rendered as (no task provided)
+  output, // text being graded; a falsy value is rendered as (empty output)
+  criteria // array of entries with a criterion text and a required flag, as read by the map below
+}) {
+  const renderedCriteria = criteria.map((c, i) => `${i + 1}. [${c.required ? "required" : "optional"}] ${c.criterion}`).join("\n"); // one numbered line per criterion, tagged [required] or [optional]
+  return `Grade the agent's output against the rubric below.
+Original task:
+${originalTask || "(no task provided)"}
+Rubric criteria:
+${renderedCriteria}
+Agent output to grade:
+${output || "(empty output)"}
+For every criterion, decide whether the output satisfies it. Preserve the exact criterion text and its required/optional designation in your answer.
+Return your judgement as JSON in this shape:
+{
+  "criteria": [
+    {
+      "criterion": "exact criterion text",
+      "satisfied": true,
+      "required": true,
+      "reasoning": "why it is or is not satisfied"
+    }
+  ],
+  "overallAssessment": "one or two sentence summary of what passed and what is missing"
+}`; // returns the complete prompt string, ending with the JSON shape the judge is asked to produce
+}
+function formatRubricReason({ score, analysis }) { // Renders the judge analysis as a human-readable, per-criterion pass/fail report string.
+  const complete = score >= 1; // NOTE(review): createRubricScorer multiplies its score by options.scale, so with a scale below 1 a fully satisfied rubric would presumably be reported as not satisfied here - confirm
+  const header = complete ? "\u2705 Rubric satisfied: every required criterion is met." : "\u274C Rubric not yet satisfied."; // first line of the report: overall verdict
+  const lines = analysis.criteria.map((c) => { // one two-line entry per criterion: verdict mark, tag and criterion text, then the reasoning on an indented arrow line
+    const mark = c.satisfied ? "\u2705" : "\u274C";
+    const tag = c.required ? "required" : "optional";
+    return `${mark} [${tag}] ${c.criterion}
+   \u2192 ${c.reasoning}`;
+  });
+  const unmetRequired = analysis.criteria.filter((c) => c.required && !c.satisfied); // only the count of these is used, in the footer
+  const footer = complete ? "" : `
+To finish, address the ${unmetRequired.length} unmet required ${unmetRequired.length === 1 ? "criterion" : "criteria"} above.`; // footer is added only when the rubric is not complete; the count can be 0 if only optional criteria failed
+  const assessment = analysis.overallAssessment ? `
+${analysis.overallAssessment}` : ""; // the overall assessment is appended on its own line when it is non-empty
+  return `${header}
+${lines.join("\n")}${assessment}${footer}`; // order: header, per-criterion entries, assessment, footer
+}
+// src/scorers/llm/rubric/index.ts
+var analyzeOutputSchema6 = { // JSON Schema for the analyze-step output of the rubric scorer (used as outputSchema in createRubricScorer).
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "criteria": { // one verdict object per rubric criterion
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "criterion": { // criterion text; the prompt asks the judge to preserve it exactly
+            "type": "string"
+          },
+          "satisfied": { // judge verdict for this criterion
+            "type": "boolean"
+          },
+          "required": { // required/optional designation, echoed back by the judge
+            "type": "boolean"
+          },
+          "reasoning": { // judge explanation for the verdict
+            "type": "string"
+          }
+        },
+        "required": [ // all four fields must be present on every verdict object
+          "criterion",
+          "satisfied",
+          "required",
+          "reasoning"
+        ]
+      }
+    },
+    "overallAssessment": { // short summary text; shown in the reason when non-empty
+      "type": "string"
+    }
+  },
+  "required": [ // both top-level fields are mandatory
+    "criteria",
+    "overallAssessment"
+  ]
+};
+function parseRubricString(rubric) { // Converts a multi-line rubric string into criterion objects, one per non-empty line.
+  return rubric.split("\n").map((line) => line.replace(/^\s*(?:[-*•]|\d+[.)])\s*/, "").trim()).filter((line) => line.length > 0).map((description) => ({ description, required: true })); // strips one leading bullet (-, *, or a round bullet) or a number followed by . or ), drops blank lines, and marks every parsed criterion as required
+}
+function normalizeRubric(rubric) { // Normalizes a rubric given as nothing, a string, or a ready-made list.
+  if (!rubric) return []; // any falsy value (including an empty string) means no rubric
+  if (typeof rubric === "string") return parseRubricString(rubric); // strings are split into one criterion per line
+  return rubric; // anything else is returned unchanged - presumably an array of criterion objects with a description field; not validated here
+}
+function resolveRubric({ // Chooses the rubric to grade against for a single run.
+  staticRubric, // already-normalized rubric from scorer construction; wins whenever it is non-empty
+  run // scorer run; requestContext, additionalContext and input are probed for a rubric
+}) {
+  if (staticRubric.length > 0) return staticRubric;
+  const dynamic = pickRubric(run.requestContext) ?? pickRubric(run.additionalContext) ?? pickRubric(run.input); // first source that yields a rubric wins, in this order
+  return normalizeRubric(dynamic); // yields an empty array when no source provided a rubric
+}
+function pickRubric(source) { // Reads a rubric value from a source object; returns it only when it is a string or an array, otherwise undefined.
+  if (!source || typeof source !== "object") return void 0; // null, undefined and primitives carry no rubric
+  let value;
+  const getter = source.get;
+  if (typeof getter === "function") { // sources exposing a get method are queried through it (presumably Map-like containers - confirm against callers)
+    value = getter.call(source, "rubric");
+  } else {
+    value = source.rubric; // plain objects are read by property
+  }
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) return value;
+  return void 0; // any other value type is ignored
+}
+function toCriterionInputs(criteria) { // Maps rubric entries to the shape consumed by createAnalyzePrompt6.
+  return criteria.map((c) => ({ criterion: c.description, required: c.required !== false })); // description becomes the criterion text; required defaults to true unless it is explicitly false
+}
+function getOutputText(run) { // Resolves the text to grade for a run, trying three sources in order.
+  const fromOutput = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output); // shared helper from the chunk module; its exact extraction rules are not visible here
+  if (fromOutput) return fromOutput;
+  if (run.input && typeof run.input === "object" && typeof run.input.currentText === "string") { // second choice: a currentText string carried on the input object
+    return run.input.currentText;
+  }
+  return typeof run.output === "string" ? run.output : ""; // last resort: a raw string output, otherwise the empty string
+}
+function getTaskText(run) { // Resolves the original task text for a run.
+  if (run.input && typeof run.input === "object" && typeof run.input.originalTask === "string") { // an explicit originalTask string on the input takes precedence
+    return run.input.originalTask;
+  }
+  return chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? ""; // otherwise use the shared helper result, or the empty string when it returns null or undefined
+}
+function createRubricScorer({ // Builds an LLM-judged scorer that passes only when every gating rubric criterion is satisfied.
+  model, // judge model, forwarded unchanged to evals.createScorer
+  criteria, // static rubric: a string, a list, or omitted (handled by normalizeRubric)
+  options // optional settings; only options.scale is read here
+}) {
+  const scale = options?.scale ?? 1; // multiplier applied to the 1/0 verdict; defaults to 1 when scale is null or undefined
+  const staticRubric = normalizeRubric(criteria); // normalized once, at scorer construction time
+  return evals.createScorer({
+    id: "rubric-scorer",
+    name: "Rubric (LLM)",
+    description: "Grades an agent output against a rubric of criteria, returning 1 only when every required criterion is satisfied",
+    judge: {
+      model,
+      instructions: RUBRIC_INSTRUCTIONS
+    }
+  }).analyze({
+    description: "Judge the output against each rubric criterion",
+    outputSchema: analyzeOutputSchema6,
+    createPrompt: ({ run }) => {
+      const rubric = resolveRubric({ staticRubric, run }); // static rubric first, otherwise one found on the run
+      if (rubric.length === 0) { // nothing to grade: ask the judge for an empty result so the later steps take their no-rubric branch
+        return `No rubric was provided. Return exactly: {"criteria": [], "overallAssessment": "No rubric provided; nothing to grade."}`;
+      }
+      return createAnalyzePrompt6({
+        originalTask: getTaskText(run),
+        output: getOutputText(run),
+        criteria: toCriterionInputs(rubric)
+      });
+    }
+  }).generateScore(({ results }) => {
+    const analysis = results.analyzeStepResult;
+    if (!analysis || analysis.criteria.length === 0) {
+      return 1; // passes by default when nothing was graded. NOTE(review): this value is not multiplied by scale, unlike the return below - confirm intended
+    }
+    const requiredCriteria = analysis.criteria.filter((c) => c.required); // NOTE(review): the required flag is read from the judge output, not from the rubric itself - confirm the echo is trusted
+    const gating = requiredCriteria.length > 0 ? requiredCriteria : analysis.criteria; // when nothing is flagged required, every criterion gates the result
+    const allSatisfied = gating.every((c) => c.satisfied);
+    return (allSatisfied ? 1 : 0) * scale; // all-or-nothing: scale when every gating criterion passes, otherwise 0
+  }).generateReason(({ results, score }) => {
+    const analysis = results.analyzeStepResult;
+    if (!analysis || analysis.criteria.length === 0) { // mirrors the no-rubric branch of generateScore
+      return "No rubric was provided, so the rubric check passed by default.";
+    }
+    return formatRubricReason({ score, analysis }); // per-criterion report built without a further LLM call
+  });
+}
 // src/scorers/llm/trajectory/prompts.ts
 var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
 You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
@@ -3131,7 +3328,7 @@ OUTPUT REQUIREMENTS:
 - Use provided JSON schema exactly as specified
 - Be consistent in your evaluation standards
 `;
-var createAnalyzePrompt6 = ({
+var createAnalyzePrompt7 = ({
   userInput,
   agentResponse,
   actualTrajectory,
@@ -3198,7 +3395,7 @@ Provide a single, concise sentence explaining why this score was given.
 };
 // src/scorers/llm/trajectory/index.ts
-var analyzeOutputSchema6 = {
+var analyzeOutputSchema7 = {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "type": "object",
   "properties": {
@@ -3340,11 +3537,11 @@ function createTrajectoryAccuracyScorerLLM({
     };
   }).analyze({
     description: "Analyze the quality and appropriateness of the agent trajectory",
-    outputSchema: analyzeOutputSchema6,
+    outputSchema: analyzeOutputSchema7,
     createPrompt: ({ run, results }) => {
-      const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
-      return createAnalyzePrompt6({
+      const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
+      return createAnalyzePrompt7({
         userInput,
         agentResponse,
         actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
@@ -3368,11 +3565,11 @@ function createTrajectoryAccuracyScorerLLM({
     const necessityScore = necessarySteps / totalSteps;
     const orderScore = orderedSteps / totalSteps;
     const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
-    return chunkBULMCHKJ_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
+    return chunkUNQXHPOD_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
   }).generateReason({
     description: "Generate human-readable explanation of trajectory evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
       const missingSteps = results.analyzeStepResult?.missingSteps || [];
       const extraSteps = results.analyzeStepResult?.extraSteps || [];
@@ -3435,18 +3632,18 @@ function createCompletenessScorer() {
     type: "agent"
   }).preprocess(async ({ run }) => {
     const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
-      const content = chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i);
+      const content = chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i);
       return content === null || content === void 0;
     });
     const isOutputInvalid = !run.output || run.output.some((i) => {
-      const content = chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i);
+      const content = chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i);
       return content === null || content === void 0;
     });
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Inputs cannot be null or undefined");
     }
-    const input = run.input?.inputMessages.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const inputToProcess = input;
     const outputToProcess = output;
     const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -3551,8 +3748,8 @@ function createTextualDifferenceScorer() {
     description: "Calculate textual difference between input and output using sequence matching algorithms.",
     type: "agent"
   }).preprocess(async ({ run }) => {
-    const input = run.input?.inputMessages?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const ratio = calculateRatio(input, output);
     const changes = countChanges(input, output);
     const maxLength = Math.max(input.length, output.length);
@@ -3575,8 +3772,8 @@ function createKeywordCoverageScorer() {
     description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
     type: "agent"
   }).preprocess(async ({ run }) => {
-    const input = run.input?.inputMessages?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     if (!input && !output) {
       return {
         result: {
@@ -3629,8 +3826,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
     description: "Calculates content similarity between input and output messages using string comparison algorithms.",
     type: "agent"
   }).preprocess(async ({ run }) => {
-    let processedInput = run.input?.inputMessages.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    let processedOutput = run.output.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedInput = run.input?.inputMessages.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedOutput = run.output.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     if (ignoreCase) {
       processedInput = processedInput.toLowerCase();
       processedOutput = processedOutput.toLowerCase();
@@ -3660,7 +3857,7 @@ function createToneScorer(config = {}) {
     type: "agent"
   }).preprocess(async ({ run }) => {
     const sentiment = new Sentiment__default.default();
-    const agentMessage = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const agentMessage = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const responseSentiment = sentiment.analyze(agentMessage);
     if (referenceTone) {
       const referenceSentiment = sentiment.analyze(referenceTone);
@@ -3747,7 +3944,7 @@ function createToolCallAccuracyScorerCode(options) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } = chunkBULMCHKJ_cjs.extractToolCalls(run.output);
+    const { tools: actualTools, toolCallInfos } = chunkUNQXHPOD_cjs.extractToolCalls(run.output);
     const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
     return {
       expectedTool,
@@ -3822,7 +4019,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
     const itemExpectation = run.expectedTrajectory;
     const effectiveOrdering = itemExpectation?.ordering ?? ordering;
     const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
-    const comparison = chunkBULMCHKJ_cjs.compareTrajectories(
+    const comparison = chunkUNQXHPOD_cjs.compareTrajectories(
       actualTrajectory,
       { steps: resolvedExpectedSteps },
       {
@@ -3880,7 +4077,7 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
     const childConfig = expectedStep.children;
     let accuracy;
     if (childConfig.steps && childConfig.steps.length > 0) {
-      accuracy = chunkBULMCHKJ_cjs.compareTrajectories(
+      accuracy = chunkUNQXHPOD_cjs.compareTrajectories(
         childTrajectory,
         { steps: childConfig.steps },
         {
@@ -3890,18 +4087,18 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
       );
     }
     const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
-    const efficiency = hasEfficiencyConfig ? chunkBULMCHKJ_cjs.checkTrajectoryEfficiency(childTrajectory, {
+    const efficiency = hasEfficiencyConfig ? chunkUNQXHPOD_cjs.checkTrajectoryEfficiency(childTrajectory, {
       maxSteps: childConfig.maxSteps,
       maxTotalTokens: childConfig.maxTotalTokens,
       maxTotalDurationMs: childConfig.maxTotalDurationMs,
       noRedundantCalls: childConfig.noRedundantCalls ?? true
     }) : void 0;
     const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
-    const blacklist = hasBlacklistConfig ? chunkBULMCHKJ_cjs.checkTrajectoryBlacklist(childTrajectory, {
+    const blacklist = hasBlacklistConfig ? chunkUNQXHPOD_cjs.checkTrajectoryBlacklist(childTrajectory, {
       blacklistedTools: childConfig.blacklistedTools,
       blacklistedSequences: childConfig.blacklistedSequences
     }) : void 0;
-    const toolFailures = chunkBULMCHKJ_cjs.analyzeToolFailures(childTrajectory, {
+    const toolFailures = chunkUNQXHPOD_cjs.analyzeToolFailures(childTrajectory, {
       maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
     });
     const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
@@ -3966,7 +4163,7 @@ function createTrajectoryScorerCode(options = {}) {
     }
     let accuracy;
     if (config.steps && config.steps.length > 0) {
-      accuracy = chunkBULMCHKJ_cjs.compareTrajectories(
+      accuracy = chunkUNQXHPOD_cjs.compareTrajectories(
         actualTrajectory,
         { steps: config.steps },
         {
@@ -3976,18 +4173,18 @@ function createTrajectoryScorerCode(options = {}) {
       );
     }
     const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
-    const efficiency = hasEfficiencyConfig ? chunkBULMCHKJ_cjs.checkTrajectoryEfficiency(actualTrajectory, {
+    const efficiency = hasEfficiencyConfig ? chunkUNQXHPOD_cjs.checkTrajectoryEfficiency(actualTrajectory, {
       maxSteps: config.maxSteps,
       maxTotalTokens: config.maxTotalTokens,
       maxTotalDurationMs: config.maxTotalDurationMs,
       noRedundantCalls: config.noRedundantCalls ?? true
     }) : void 0;
     const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
-    const blacklist = hasBlacklistConfig ? chunkBULMCHKJ_cjs.checkTrajectoryBlacklist(actualTrajectory, {
+    const blacklist = hasBlacklistConfig ? chunkUNQXHPOD_cjs.checkTrajectoryBlacklist(actualTrajectory, {
       blacklistedTools: config.blacklistedTools,
       blacklistedSequences: config.blacklistedSequences
     }) : void 0;
-    const toolFailures = chunkBULMCHKJ_cjs.analyzeToolFailures(actualTrajectory, {
+    const toolFailures = chunkUNQXHPOD_cjs.analyzeToolFailures(actualTrajectory, {
       maxRetriesPerTool: config.maxRetriesPerTool ?? 2
     });
     const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
@@ -4121,6 +4318,7 @@ exports.createHallucinationScorer = createHallucinationScorer;
 exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
 exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
 exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
+exports.createRubricScorer = createRubricScorer;
 exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
 exports.createToneScorer = createToneScorer;
 exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;