npm - @mastra/evals - Versions diffs - 0.0.1-alpha.0 → 0.0.1-alpha.1 - Mend

@mastra/evals 0.0.1-alpha.0 → 0.0.1-alpha.1

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (48) hide show

package/CHANGELOG.md +6 -0
package/dist/evals.cjs.development.js +52 -41
package/dist/evals.cjs.development.js.map +1 -1
package/dist/evals.cjs.production.min.js +1 -1
package/dist/evals.cjs.production.min.js.map +1 -1
package/dist/evals.esm.js +52 -41
package/dist/evals.esm.js.map +1 -1
package/dist/metrics/answer-relevancy/index.d.ts +5 -4
package/dist/metrics/answer-relevancy/index.d.ts.map +1 -1
package/dist/metrics/answer-relevancy/metricJudge.d.ts +1 -1
package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -1
package/dist/metrics/answer-relevancy/prompts.d.ts +3 -2
package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -1
package/dist/metrics/context-position/index.d.ts.map +1 -1
package/dist/metrics/context-position/metricJudge.d.ts +1 -1
package/dist/metrics/context-position/metricJudge.d.ts.map +1 -1
package/dist/metrics/context-position/prompts.d.ts +2 -1
package/dist/metrics/context-position/prompts.d.ts.map +1 -1
package/dist/metrics/context-precision/index.d.ts.map +1 -1
package/dist/metrics/context-precision/metricJudge.d.ts +1 -1
package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -1
package/dist/metrics/context-precision/prompts.d.ts +3 -2
package/dist/metrics/context-precision/prompts.d.ts.map +1 -1
package/dist/metrics/prompt-alignment/index.d.ts.map +1 -1
package/dist/metrics/prompt-alignment/metricJudge.d.ts +1 -1
package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -1
package/dist/metrics/prompt-alignment/prompts.d.ts +3 -2
package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -1
package/dist/utils.d.ts +2 -0
package/dist/utils.d.ts.map +1 -0
package/package.json +5 -2
package/src/evaluation.test.ts +2 -2
package/src/metrics/answer-relevancy/index.test.ts +12 -22
package/src/metrics/answer-relevancy/index.ts +10 -15
package/src/metrics/answer-relevancy/metricJudge.ts +8 -2
package/src/metrics/answer-relevancy/prompts.ts +56 -22
package/src/metrics/context-position/index.ts +4 -2
package/src/metrics/context-position/metricJudge.ts +2 -1
package/src/metrics/context-position/prompts.ts +31 -14
package/src/metrics/context-precision/index.test.ts +1 -1
package/src/metrics/context-precision/index.ts +4 -2
package/src/metrics/context-precision/metricJudge.ts +2 -1
package/src/metrics/context-precision/prompts.ts +33 -5
package/src/metrics/prompt-alignment/index.test.ts +137 -20
package/src/metrics/prompt-alignment/index.ts +4 -2
package/src/metrics/prompt-alignment/metricJudge.ts +8 -2
package/src/metrics/prompt-alignment/prompts.ts +26 -3
package/src/utils.ts +3 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,11 @@
 # @mastra/evals
+## 0.0.1-alpha.1
+### Patch Changes
+- 35764f4: Added workflow for eval tests
 ## 0.0.1-alpha.0
 ### Patch Changes

package/dist/evals.cjs.development.js CHANGED Viewed

@@ -419,6 +419,10 @@ function _evaluate() {
   return _evaluate.apply(this, arguments);
 }
+var roundToTwoDecimals = function roundToTwoDecimals(num) {
+  return Math.round((num + Number.EPSILON) * 100) / 100;
+};
 var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
   this.agent = void 0;
   this.agent = new core.Agent({
@@ -428,7 +432,7 @@ var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
   });
 };
-var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"";
+var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"\n6. Responses that discuss the type of information being asked show partial relevance";
 function generateEvaluationStatementsPrompt(_ref) {
   var output = _ref.output;
   return "Given the text, break it down into meaningful statements while preserving context and relationships.\nDon't split too aggressively.\n\nSplit compound statements particularly when they:\n- Are joined by \"and\"\n- Contain multiple distinct facts or claims\n- Have multiple descriptive elements about the subject\n\n\nHandle special cases:\n- A single word answer should be treated as a complete statement\n- Error messages should be treated as a single statement\n- Empty strings should return an empty list\n- When splitting text, keep related information together\n\nExample:\nExample text: Look! A bird! Birds are an interesting animal.\n\n{{\n    \"statements\": [\"Look!\", \"A bird!\", \"Birds are interesting animals.\"]\n}}\n\nPlease return only JSON format with \"statements\" array.\nReturn empty list for empty input.\n\nText:\n" + output + "\n\nJSON:\n";
@@ -436,14 +440,15 @@ function generateEvaluationStatementsPrompt(_ref) {
 function generateEvaluatePrompt$3(_ref2) {
   var input = _ref2.input,
     statements = _ref2.statements;
-  return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n    Return JSON with array of verdict objects. Each verdict must include:\n    - \"verdict\": \"yes\", \"no\", or \"unsure\"\n    - \"reason\": Clear explanation of the verdict\n    - Exact match between number of verdicts and statements\n\n    Verdict Guidelines:\n    - \"yes\": Statement explicitly and directly answers the input question\n        * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n        * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n        * Can stand alone as a complete answer\n        * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n\n    - \"unsure\": Statement shows partial relevance when it:\n        * Contains topic-related administrative/governance terms without direct answer\n        * Mentions locations or entities related to the answer without specifying their role\n        * References functions or characteristics typically associated with the answer\n        * Is incorrect but shows understanding of the question\n        * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n        * Includes indirect references to the answer (e.g., \"where the president works\")\n        * Contains multiple relevant concepts but lacks explicit relationship between them\n        * Demonstrates understanding of question domain without providing specific answer\n\n    - \"no\": Statement lacks meaningful connection to question when it:\n        * Contains no concepts related to the question type or domain\n        * Only mentions the broader topic without relevant details (e.g., \"the country has nice weather\")\n        * Provides general descriptions without addressing the specific question\n        * Contains purely tangential information about the 
subject\n        * Consists of empty or meaningless content\n        * Discusses characteristics unrelated to the question type (e.g., describing cuisine when asked about geography)\n        * Note: Assessment is about topical relationship, not factual accuracy\n\n    REMEMBER: A statmenent does not have to be correct, it just has to be relevant.\n    If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n    If the statement is a direct answer to the input, it is relevant.\n    If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n    DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n\n    Example:\n    Input: \"What color is the sky during daytime?\"\n    Statements: [\n      \"The sky is blue during daytime\",\n      \"The sky is full of clouds\", \n      \"I had breakfast today\",\n      \"Blue is a beautiful color\",\n      \"Many birds fly in the sky\",\n      \"\",\n      \"The sky is purple during daytime\",\n    ]\n    JSON:\n    {{\n        \"verdicts\": [\n            {{\n                \"verdict\": \"yes\",\n                \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement describes the sky but doesn't address its color\"\n            }},\n            {{\n                \"verdict\": \"no\",\n                \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement is about the sky but doesn't address its color\"\n            }},\n            {{\n          
      \"verdict\": \"no\",\n                \"reason\": \"This statement is empty\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement is incorrect but contains relevant information and still addresses the question\"\n            }}\n        ]\n    }}\n\n  Input:\n  " + input + "\n\n  Statements:\n  " + statements.join('\n') + "\n\n  JSON:\n  ";
+  return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n    Return JSON with array of verdict objects. Each verdict must include:\n    - \"verdict\": \"yes\", \"no\", or \"unsure\"\n    - \"reason\": Clear explanation of the verdict\n    - Exact match between number of verdicts and statements\n\n    Verdict Guidelines:\n    - \"yes\": Statement explicitly and directly answers the input question when it:\n        * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n        * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n        * Can stand alone as a complete answer\n        * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n        * Note: If statement is incorrect but directly addresses the question, mark as \"unsure\"\n\n    - \"unsure\": Statement shows partial relevance when it:\n        * Discusses the type of information being asked about (e.g., mentions temperatures when asked about temperature)\n        * Contains information about the answer without explicit statement\n        * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n        * Includes indirect references to the answer (e.g., \"where the president works\")\n        * Contains topic-related administrative/governance terms without direct answer\n        * References functions or characteristics typically associated with the answer\n        * Uses terms that match what's being asked about\n        * Mentions related entities without specifying their relationship to the answer\n        * Is incorrect but shows understanding of the question\n        * Contains the answer term but needs more context to be complete\n        * Contains measurement units or quantities relevant to the question type\n        * References locations or entities in the same category 
as what's being asked about\n        * Provides relevant information without using explicit question-type terminology\n        * Contains references to properties of the subject\n\n\n    - \"no\": Statement lacks meaningful connection to question when it:\n        * Contains neither the subject nor the type of information being requested\n        * Contains no terms related to what's being asked about\n        * Contains only general subject information without relating to what's being asked\n        * Consists of empty or meaningless content\n        * Contains purely tangential information with no mention of the subject or question type\n        * Note: Assessment is about connection to what's being asked, not factual accuracy\n        * Contains no connection to what's being asked about (neither the subject nor the type of information requested)\n\n    REMEMBER: \n    - If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n    - If the statement is a direct answer to the input, it is relevant.\n    - If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n    - DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n    STRICT RULES:\n    - If a statement mentions the type of information being requested, it should be marked as \"unsure\" ONLY if it's discussing that type meaningfully (not just mentioning it)\n    - Subject mentions alone are NOT enough for relevance - they must connect to what's being asked about\n    - Empty or meaningless statements are always \"no\"\n    - General facts about the subject without connection to the question type should be marked as \"no\"\n    - ALWAYS mark a statement as \"no\" if it discusses the topic without any connection to the question type\n    - Statements that mention neither the subject nor the type of information are always \"no\"\n    - Type-level relevance overrides topic-only content\n    - 
Measurement/quantity relevance counts as type-level relevance\n    - Administrative/governance terms are only relevant if they relate to the question type\n\n\n    Examples of \"no\" statements:\n        * \"Japan has beautiful seasons\" for \"What is Japan's largest city?\"\n        * \"Trees grow tall\" for \"How tall is Mount Everest?\"\n        * \"The weather is nice\" for \"Who is the president?\"\n\n    Example:\n    Input: \"What color is the sky during daytime?\"\n    Statements: [\n      \"The sky is blue during daytime\",\n      \"The sky is full of clouds\", \n      \"I had breakfast today\",\n      \"Blue is a beautiful color\",\n      \"Many birds fly in the sky\",\n      \"\",\n      \"The sky is purple during daytime\",\n      \"Daytime is when the sun is up\",\n    ]\n    JSON:\n    {{\n        \"verdicts\": [\n            {{\n                \"verdict\": \"yes\",\n                \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement describes the sky but doesn't address its color\"\n            }},\n            {{\n                \"verdict\": \"no\",\n                \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement is about the sky but doesn't address its color\"\n            }},\n            {{\n                \"verdict\": \"no\",\n                \"reason\": \"This statement is empty\"\n            }},\n            {{\n                \"verdict\": \"unsure\",\n                \"reason\": \"This statement is incorrect but contains relevant information and still 
addresses the question\"\n            }},\n            {{\n                \"verdict\": \"no\",\n                \"reason\": \"This statement is about daytime but doesn't address the sky\"\n            }}\n        ]\n    }}\n\n  Input:\n  " + input + "\n\n  Number of statements: " + (statements.length === 0 ? '1' : statements.length) + "\n\n  Statements:\n  " + statements + "\n\n  JSON:\n  ";
 }
 function generateReasonPrompt$3(_ref3) {
   var score = _ref3.score,
     reasons = _ref3.reasons,
     input = _ref3.input,
-    output = _ref3.output;
-  return "Explain the irrelevancy score (0-10) for the LLM's response using this context:\n  Context:\n  Input: " + input + "\n  Output: " + output + "\n  Score: " + score + "\n  Irrelevancy Reasons: " + reasons.join('\n') + "\n  \n  Rules:\n  - Explain score based on mix of direct answers and related context\n  - Consider both full and partial relevance\n  - Keep explanation concise and focused\n  - Use given score, don't recalculate\n  - Don't judge factual correctness\n  - Explain both relevant and irrelevant aspects\n  - For mixed responses, explain the balance\n\n    Format:\n    {\n        \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n    }\n\n    Example Responses:\n    {\n        \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n    }\n    {\n        \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n    }\n    ";
+    output = _ref3.output,
+    scale = _ref3.scale;
+  return "Explain the irrelevancy score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n  Context:\n  Input: " + input + "\n  Output: " + output + "\n  Score: " + score + "\n  Irrelevancy Reasons: " + reasons.join('\n') + "\n  \n  Rules:\n  - Explain score based on mix of direct answers and related context\n  - Consider both full and partial relevance\n  - Keep explanation concise and focused\n  - Use given score, don't recalculate\n  - Don't judge factual correctness\n  - Explain both relevant and irrelevant aspects\n  - For mixed responses, explain the balance\n\n    Format:\n    {\n        \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n    }\n\n    Example Responses:\n    {\n        \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n    }\n    {\n        \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n    }\n    ";
 }
 var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -497,7 +502,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
     return evaluate;
   }();
   _proto.getReason = /*#__PURE__*/function () {
-    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
+    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
       var prompt, result;
       return _regeneratorRuntime().wrap(function _callee2$(_context2) {
         while (1) switch (_context2.prev = _context2.next) {
@@ -506,7 +511,8 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
               input: input,
               output: actualOutput,
               reasons: reasons,
-              score: score
+              score: score,
+              scale: scale
             });
             _context2.next = 3;
             return this.agent.generate(prompt, {
@@ -523,7 +529,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
         }
       }, _callee2, this);
     }));
-    function getReason(_x3, _x4, _x5, _x6) {
+    function getReason(_x3, _x4, _x5, _x6, _x7) {
       return _getReason.apply(this, arguments);
     }
     return getReason;
@@ -534,12 +540,11 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
 var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
   function AnswerRelevancyMetric(model, _temp) {
     var _this;
-    var _ref = _temp === void 0 ? {
-        uncertaintyWeight: 0.3,
-        scale: 10
-      } : _temp,
-      uncertaintyWeight = _ref.uncertaintyWeight,
-      scale = _ref.scale;
+    var _ref = _temp === void 0 ? {} : _temp,
+      _ref$uncertaintyWeigh = _ref.uncertaintyWeight,
+      uncertaintyWeight = _ref$uncertaintyWeigh === void 0 ? 0.3 : _ref$uncertaintyWeigh,
+      _ref$scale = _ref.scale,
+      scale = _ref$scale === void 0 ? 10 : _ref$scale;
     _this = _Metric.call(this) || this;
     _this.judge = void 0;
     _this.uncertaintyWeight = void 0;
@@ -596,7 +601,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
               }
             }
             _context2.next = 4;
-            return this.judge.getReason(input, output, score, reasonsForVerdicts);
+            return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
           case 4:
             reason = _context2.sent;
             return _context2.abrupt("return", reason);
@@ -626,7 +631,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
       }
     }
     var score = relevancyCount / numberOfVerdicts;
-    return Math.round(score * this.scale);
+    return roundToTwoDecimals(score * this.scale);
   };
   return AnswerRelevancyMetric;
 }(core.Metric);
@@ -801,14 +806,15 @@ function generateEvaluatePrompt$2(_ref) {
   var input = _ref.input,
     output = _ref.output,
     context = _ref.context;
-  return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n- Demonstrates or validates characteristics mentioned in the output\n- Shows real-world applications or effects of the concept\n- Reinforces or provides evidence for any part of the output\n- Helps establish credibility or understanding of the subject\n- Describes the actions the subject can perform\n\nA context piece should be considered relevant if it contributes ANY supporting information or evidence, even if indirect.\n\nExample:\n{\n    \"verdicts\": [\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n        },\n        {\n            \"verdict\": \"no\",\n            \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n        }\n    ]  \n}\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + 
"\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
+  return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n    \"verdicts\": [\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n        },\n        {\n            \"verdict\": \"no\",\n            \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": 
\"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n        }\n    ]  \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
 }
 function generateReasonPrompt$2(_ref2) {
   var input = _ref2.input,
     output = _ref2.output,
     verdicts = _ref2.verdicts,
-    score = _ref2.score;
-  return "Given the input, output, verdicts, and position score, provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n  The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n    \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
+    score = _ref2.score,
+    scale = _ref2.scale;
+  return "Given the input, output, verdicts, and position score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n  The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n    \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
 }
 var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -852,7 +858,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
     return evaluate;
   }();
   _proto.getReason = /*#__PURE__*/function () {
-    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
+    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
       var prompt, result;
       return _regeneratorRuntime().wrap(function _callee2$(_context2) {
         while (1) switch (_context2.prev = _context2.next) {
@@ -861,7 +867,8 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
               input: input,
               output: actualOutput,
               verdicts: verdicts,
-              score: score
+              score: score,
+              scale: scale
             });
             _context2.next = 3;
             return this.agent.generate(prompt, {
@@ -878,7 +885,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
         }
       }, _callee2, this);
     }));
-    function getReason(_x4, _x5, _x6, _x7) {
+    function getReason(_x4, _x5, _x6, _x7, _x8) {
       return _getReason.apply(this, arguments);
     }
     return getReason;
@@ -914,7 +921,7 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
             verdicts = _context.sent;
             score = this.calculateScore(verdicts);
             _context.next = 7;
-            return this.judge.getReason(input, output, score, verdicts);
+            return this.judge.getReason(input, output, score, this.scale, verdicts);
           case 7:
             reason = _context.sent;
             return _context.abrupt("return", {
@@ -956,24 +963,25 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
     }
     // Normalize against the maximum possible score
     var finalScore = weightedSum / maxPossibleSum * this.scale;
-    return finalScore;
+    return roundToTwoDecimals(finalScore);
   };
   return ContextPositionMetric;
 }(core.Metric);
-var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider both direct and indirect relevance\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
+var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider all forms of relevance:\n   - Direct definitions or explanations\n   - Supporting evidence or examples\n   - Related characteristics or behaviors\n   - Real-world applications or effects\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
 function generateEvaluatePrompt$1(_ref) {
   var input = _ref.input,
     output = _ref.output,
     context = _ref.context;
-  return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nExample:\n{\n    \"verdicts\": [\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n        },\n        {\n            \"verdict\": \"no\",\n            \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n        }\n    ]  \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
+  return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n    \"verdicts\": [\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n        },\n        {\n            \"verdict\": \"no\",\n            \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n        },\n        {\n            \"verdict\": \"yes\",\n            \"reason\": 
\"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n        }\n    ]  \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
 }
 function generateReasonPrompt$1(_ref2) {
   var input = _ref2.input,
     output = _ref2.output,
     verdicts = _ref2.verdicts,
-    score = _ref2.score;
-  return "Given the input, output, verdicts, and precision score, provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n    \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
+    score = _ref2.score,
+    scale = _ref2.scale;
+  return "Given the input, output, verdicts, and precision score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n    \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
 }
 var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -1017,7 +1025,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
     return evaluate;
   }();
   _proto.getReason = /*#__PURE__*/function () {
-    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
+    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
       var prompt, result;
       return _regeneratorRuntime().wrap(function _callee2$(_context2) {
         while (1) switch (_context2.prev = _context2.next) {
@@ -1026,7 +1034,8 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
               input: input,
               output: actualOutput,
               verdicts: verdicts,
-              score: score
+              score: score,
+              scale: scale
             });
             _context2.next = 3;
             return this.agent.generate(prompt, {
@@ -1043,7 +1052,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
         }
       }, _callee2, this);
     }));
-    function getReason(_x4, _x5, _x6, _x7) {
+    function getReason(_x4, _x5, _x6, _x7, _x8) {
       return _getReason.apply(this, arguments);
     }
     return getReason;
@@ -1079,7 +1088,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
             verdicts = _context.sent;
             score = this.calculateScore(verdicts);
             _context.next = 7;
-            return this.judge.getReason(input, output, score, verdicts);
+            return this.judge.getReason(input, output, score, this.scale, verdicts);
           case 7:
             reason = _context.sent;
             return _context.abrupt("return", {
@@ -1120,7 +1129,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
       return 0;
     }
     var finalScore = weightedPrecisionSum / relevantCount;
-    return finalScore * this.scale;
+    return roundToTwoDecimals(finalScore * this.scale);
   };
   return ContextPrecisionMetric;
 }(core.Metric);
@@ -1237,19 +1246,20 @@ var KeywordCoverageMetric = /*#__PURE__*/function (_Metric) {
   return KeywordCoverageMetric;
 }(core.Metric);
-var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
+var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n6. Judge each instruction independently. Only check if the current instruction is followed. Do not let instructions be influenced by other instructions.\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
 function generateEvaluatePrompt(_ref) {
   var instructions = _ref.instructions,
     input = _ref.input,
     output = _ref.output;
-  return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n  \"verdicts\": [\n    {\n      \"verdict\": \"no\",\n      \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n    },\n    {\n      \"verdict\": \"no\",\n      \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n    }\n  ]\n}\n\nPrompt Instructions:\n" + instructions.join('\n') + "\n\nInput:\n" + input + "\n\nLLM Actual Output:\n" + output + "\n\nJSON:";
+  return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n  \"verdicts\": [\n    {\n      \"verdict\": \"no\",\n      \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n    },\n    {\n      \"verdict\": \"no\",\n      \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n    }\n  ]\n}\n\nExample 2:\nInput: \"describe the sky\"\nOutput: \"The sky is blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Talk about the color black\"]\n\n{\n  \"verdicts\": [\n    {\n      \"verdict\": \"yes\",\n      \"reason\": \"The output starts with a capital letter\"\n    },\n    {\n      \"verdict\": \"no\",\n      \"reason\": \"The output does not talk about the color black\"\n    }\n  ]\n}\n\nNumber of instructions: " + instructions.length + "\n\nPrompt Instructions:\n" + instructions + "\n\nInput:\n" + input + "\n\nLLM Actual Output:\n" + output + "\n\nJSON:";
 }
 function generateReasonPrompt(_ref2) {
   var input = _ref2.input,
     output = _ref2.output,
     score = _ref2.score,
-    reasons = _ref2.reasons;
-  return "Explain the instruction following score (0-10) for the LLM's response using this context:\n  Context:\n  Input: " + input + "\n  Output: " + output + "\n  Score: " + score + "\n  Failure Reasons: " + reasons.join('\n') + "\n\n  Rules (follow these rules exactly. do not deviate):\n  - Keep your response concise and to the point.\n  - Do not change score from what is given.\n  - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n  - If there are failure reasons given, explain why the score is not higher.\n  \n\n  Output format:\n  {\n    \"reason\": \"The score is {score} because {explanation of instruction following}\"\n  }\n    \n  Example Responses:\n  {\n    \"reason\": \"The score is 10 because the output follows the instructions exactly\"\n  }\n  {\n    \"reason\": \"The score is 0 because the output does not follow the instructions\"\n  }\n  ";
+    reasons = _ref2.reasons,
+    scale = _ref2.scale;
+  return "Explain the instruction following score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n  Context:\n  Input: " + input + "\n  Output: " + output + "\n  Score: " + score + "\n  Failure Reasons: " + reasons.join('\n') + "\n\n  Rules (follow these rules exactly. do not deviate):\n  - Keep your response concise and to the point.\n  - Do not change score from what is given.\n  - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n  - If there are failure reasons given, explain why the score is not higher.\n  \n\n  Output format:\n  {\n    \"reason\": \"The score is {score} because {explanation of instruction following}\"\n  }\n    \n  Example Responses:\n  {\n    \"reason\": \"The score is " + scale + " because the output follows the instructions exactly\"\n  }\n  {\n    \"reason\": \"The score is 0 because the output does not follow the instructions\"\n  }\n  ";
 }
 var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -1293,7 +1303,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
     return evaluate;
   }();
   _proto.getReason = /*#__PURE__*/function () {
-    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
+    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
       var prompt, result;
       return _regeneratorRuntime().wrap(function _callee2$(_context2) {
         while (1) switch (_context2.prev = _context2.next) {
@@ -1302,7 +1312,8 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
               input: input,
               output: actualOutput,
               reasons: reasons,
-              score: score
+              score: score,
+              scale: scale
             });
             _context2.next = 3;
             return this.agent.generate(prompt, {
@@ -1319,7 +1330,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
         }
       }, _callee2, this);
     }));
-    function getReason(_x4, _x5, _x6, _x7) {
+    function getReason(_x4, _x5, _x6, _x7, _x8) {
       return _getReason.apply(this, arguments);
     }
     return getReason;
@@ -1389,7 +1400,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
               }
             }
             _context2.next = 4;
-            return this.judge.getReason(input, output, score, reasonsForVerdicts);
+            return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
           case 4:
             reason = _context2.sent;
             return _context2.abrupt("return", reason);
@@ -1417,7 +1428,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
       }
     }
     var score = alignmentCount / numberOfVerdicts;
-    return score * this.scale;
+    return roundToTwoDecimals(score * this.scale);
   };
   return PromptAlignmentMetric;
 }(core.Metric);