@mastra/evals 0.0.1-alpha.0 → 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/evals.cjs.development.js +52 -41
  3. package/dist/evals.cjs.development.js.map +1 -1
  4. package/dist/evals.cjs.production.min.js +1 -1
  5. package/dist/evals.cjs.production.min.js.map +1 -1
  6. package/dist/evals.esm.js +52 -41
  7. package/dist/evals.esm.js.map +1 -1
  8. package/dist/metrics/answer-relevancy/index.d.ts +5 -4
  9. package/dist/metrics/answer-relevancy/index.d.ts.map +1 -1
  10. package/dist/metrics/answer-relevancy/metricJudge.d.ts +1 -1
  11. package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -1
  12. package/dist/metrics/answer-relevancy/prompts.d.ts +3 -2
  13. package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -1
  14. package/dist/metrics/context-position/index.d.ts.map +1 -1
  15. package/dist/metrics/context-position/metricJudge.d.ts +1 -1
  16. package/dist/metrics/context-position/metricJudge.d.ts.map +1 -1
  17. package/dist/metrics/context-position/prompts.d.ts +2 -1
  18. package/dist/metrics/context-position/prompts.d.ts.map +1 -1
  19. package/dist/metrics/context-precision/index.d.ts.map +1 -1
  20. package/dist/metrics/context-precision/metricJudge.d.ts +1 -1
  21. package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -1
  22. package/dist/metrics/context-precision/prompts.d.ts +3 -2
  23. package/dist/metrics/context-precision/prompts.d.ts.map +1 -1
  24. package/dist/metrics/prompt-alignment/index.d.ts.map +1 -1
  25. package/dist/metrics/prompt-alignment/metricJudge.d.ts +1 -1
  26. package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -1
  27. package/dist/metrics/prompt-alignment/prompts.d.ts +3 -2
  28. package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -1
  29. package/dist/utils.d.ts +2 -0
  30. package/dist/utils.d.ts.map +1 -0
  31. package/package.json +5 -2
  32. package/src/evaluation.test.ts +2 -2
  33. package/src/metrics/answer-relevancy/index.test.ts +12 -22
  34. package/src/metrics/answer-relevancy/index.ts +10 -15
  35. package/src/metrics/answer-relevancy/metricJudge.ts +8 -2
  36. package/src/metrics/answer-relevancy/prompts.ts +56 -22
  37. package/src/metrics/context-position/index.ts +4 -2
  38. package/src/metrics/context-position/metricJudge.ts +2 -1
  39. package/src/metrics/context-position/prompts.ts +31 -14
  40. package/src/metrics/context-precision/index.test.ts +1 -1
  41. package/src/metrics/context-precision/index.ts +4 -2
  42. package/src/metrics/context-precision/metricJudge.ts +2 -1
  43. package/src/metrics/context-precision/prompts.ts +33 -5
  44. package/src/metrics/prompt-alignment/index.test.ts +137 -20
  45. package/src/metrics/prompt-alignment/index.ts +4 -2
  46. package/src/metrics/prompt-alignment/metricJudge.ts +8 -2
  47. package/src/metrics/prompt-alignment/prompts.ts +26 -3
  48. package/src/utils.ts +3 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
  # @mastra/evals
 
+ ## 0.0.1-alpha.1
+
+ ### Patch Changes
+
+ - 35764f4: Added workflow for eval tests
+
  ## 0.0.1-alpha.0
 
  ### Patch Changes
package/dist/evals.cjs.development.js CHANGED
@@ -419,6 +419,10 @@ function _evaluate() {
  return _evaluate.apply(this, arguments);
  }
 
+ var roundToTwoDecimals = function roundToTwoDecimals(num) {
+ return Math.round((num + Number.EPSILON) * 100) / 100;
+ };
+
  var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
  this.agent = void 0;
  this.agent = new core.Agent({
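For reference, a minimal TypeScript sketch of the new helper (the compiled output suggests it originates in `package/src/utils.ts`, which gained 3 lines in this release). The `Number.EPSILON` nudge is a common guard against binary floating-point values that sit just below a decimal midpoint:

```ts
// Sketch of the rounding helper added in this release.
export const roundToTwoDecimals = (num: number): number =>
  Math.round((num + Number.EPSILON) * 100) / 100;

// Why the nudge matters: 1.005 is stored as ~1.00499999999999989,
// so the naive version rounds the wrong way.
Math.round(1.005 * 100) / 100; // 1    (naive)
roundToTwoDecimals(1.005);     // 1.01 (nudged past the midpoint)
```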
@@ -428,7 +432,7 @@ var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
  });
  };
 
- var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"";
+ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"\n6. Responses that discuss the type of information being asked show partial relevance";
  function generateEvaluationStatementsPrompt(_ref) {
  var output = _ref.output;
  return "Given the text, break it down into meaningful statements while preserving context and relationships.\nDon't split too aggressively.\n\nSplit compound statements particularly when they:\n- Are joined by \"and\"\n- Contain multiple distinct facts or claims\n- Have multiple descriptive elements about the subject\n\n\nHandle special cases:\n- A single word answer should be treated as a complete statement\n- Error messages should be treated as a single statement\n- Empty strings should return an empty list\n- When splitting text, keep related information together\n\nExample:\nExample text: Look! A bird! Birds are an interesting animal.\n\n{{\n \"statements\": [\"Look!\", \"A bird!\", \"Birds are interesting animals.\"]\n}}\n\nPlease return only JSON format with \"statements\" array.\nReturn empty list for empty input.\n\nText:\n" + output + "\n\nJSON:\n";
@@ -436,14 +440,15 @@ function generateEvaluationStatementsPrompt(_ref) {
  function generateEvaluatePrompt$3(_ref2) {
  var input = _ref2.input,
  statements = _ref2.statements;
- return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n Return JSON with array of verdict objects. Each verdict must include:\n - \"verdict\": \"yes\", \"no\", or \"unsure\"\n - \"reason\": Clear explanation of the verdict\n - Exact match between number of verdicts and statements\n\n Verdict Guidelines:\n - \"yes\": Statement explicitly and directly answers the input question\n * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n * Can stand alone as a complete answer\n * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n\n - \"unsure\": Statement shows partial relevance when it:\n * Contains topic-related administrative/governance terms without direct answer\n * Mentions locations or entities related to the answer without specifying their role\n * References functions or characteristics typically associated with the answer\n * Is incorrect but shows understanding of the question\n * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n * Includes indirect references to the answer (e.g., \"where the president works\")\n * Contains multiple relevant concepts but lacks explicit relationship between them\n * Demonstrates understanding of question domain without providing specific answer\n\n - \"no\": Statement lacks meaningful connection to question when it:\n * Contains no concepts related to the question type or domain\n * Only mentions the broader topic without relevant details (e.g., \"the country has nice weather\")\n * Provides general descriptions without addressing the specific question\n * Contains purely tangential information about the subject\n * Consists of empty or meaningless content\n * Discusses characteristics unrelated to the question type (e.g., describing cuisine when asked about geography)\n * Note: Assessment is about topical relationship, not factual accuracy\n\n REMEMBER: A statmenent does not have to be correct, it just has to be relevant.\n If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n If the statement is a direct answer to the input, it is relevant.\n If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n\n Example:\n Input: \"What color is the sky during daytime?\"\n Statements: [\n \"The sky is blue during daytime\",\n \"The sky is full of clouds\", \n \"I had breakfast today\",\n \"Blue is a beautiful color\",\n \"Many birds fly in the sky\",\n \"\",\n \"The sky is purple during daytime\",\n ]\n JSON:\n {{\n \"verdicts\": [\n {{\n \"verdict\": \"yes\",\n \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement describes the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is about the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is empty\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is incorrect but contains relevant information and still addresses the question\"\n }}\n ]\n }}\n\n Input:\n " + input + "\n\n Statements:\n " + statements.join('\n') + "\n\n JSON:\n ";
+ return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n Return JSON with array of verdict objects. Each verdict must include:\n - \"verdict\": \"yes\", \"no\", or \"unsure\"\n - \"reason\": Clear explanation of the verdict\n - Exact match between number of verdicts and statements\n\n Verdict Guidelines:\n - \"yes\": Statement explicitly and directly answers the input question when it:\n * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n * Can stand alone as a complete answer\n * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n * Note: If statement is incorrect but directly addresses the question, mark as \"unsure\"\n\n - \"unsure\": Statement shows partial relevance when it:\n * Discusses the type of information being asked about (e.g., mentions temperatures when asked about temperature)\n * Contains information about the answer without explicit statement\n * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n * Includes indirect references to the answer (e.g., \"where the president works\")\n * Contains topic-related administrative/governance terms without direct answer\n * References functions or characteristics typically associated with the answer\n * Uses terms that match what's being asked about\n * Mentions related entities without specifying their relationship to the answer\n * Is incorrect but shows understanding of the question\n * Contains the answer term but needs more context to be complete\n * Contains measurement units or quantities relevant to the question type\n * References locations or entities in the same category as what's being asked about\n * Provides relevant information without using explicit question-type terminology\n * Contains references to properties of the subject\n\n\n - \"no\": Statement lacks meaningful connection to question when it:\n * Contains neither the subject nor the type of information being requested\n * Contains no terms related to what's being asked about\n * Contains only general subject information without relating to what's being asked\n * Consists of empty or meaningless content\n * Contains purely tangential information with no mention of the subject or question type\n * Note: Assessment is about connection to what's being asked, not factual accuracy\n * Contains no connection to what's being asked about (neither the subject nor the type of information requested)\n\n REMEMBER: \n - If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n - If the statement is a direct answer to the input, it is relevant.\n - If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n - DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n STRICT RULES:\n - If a statement mentions the type of information being requested, it should be marked as \"unsure\" ONLY if it's discussing that type meaningfully (not just mentioning it)\n - Subject mentions alone are NOT enough for relevance - they must connect to what's being asked about\n - Empty or meaningless statements are always \"no\"\n - General facts about the subject without connection to the question type should be marked as \"no\"\n - ALWAYS mark a statement as \"no\" if it discusses the topic without any connection to the question type\n - Statements that mention neither the subject nor the type of information are always \"no\"\n - Type-level relevance overrides topic-only content\n - Measurement/quantity relevance counts as type-level relevance\n - Administrative/governance terms are only relevant if they relate to the question type\n\n\n Examples of \"no\" statements:\n * \"Japan has beautiful seasons\" for \"What is Japan's largest city?\"\n * \"Trees grow tall\" for \"How tall is Mount Everest?\"\n * \"The weather is nice\" for \"Who is the president?\"\n\n Example:\n Input: \"What color is the sky during daytime?\"\n Statements: [\n \"The sky is blue during daytime\",\n \"The sky is full of clouds\", \n \"I had breakfast today\",\n \"Blue is a beautiful color\",\n \"Many birds fly in the sky\",\n \"\",\n \"The sky is purple during daytime\",\n \"Daytime is when the sun is up\",\n ]\n JSON:\n {{\n \"verdicts\": [\n {{\n \"verdict\": \"yes\",\n \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement describes the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is about the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is empty\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is incorrect but contains relevant information and still addresses the question\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is about daytime but doesn't address the sky\"\n }}\n ]\n }}\n\n Input:\n " + input + "\n\n Number of statements: " + (statements.length === 0 ? '1' : statements.length) + "\n\n Statements:\n " + statements + "\n\n JSON:\n ";
  }
  function generateReasonPrompt$3(_ref3) {
  var score = _ref3.score,
  reasons = _ref3.reasons,
  input = _ref3.input,
- output = _ref3.output;
- return "Explain the irrelevancy score (0-10) for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Irrelevancy Reasons: " + reasons.join('\n') + "\n \n Rules:\n - Explain score based on mix of direct answers and related context\n - Consider both full and partial relevance\n - Keep explanation concise and focused\n - Use given score, don't recalculate\n - Don't judge factual correctness\n - Explain both relevant and irrelevant aspects\n - For mixed responses, explain the balance\n\n Format:\n {\n \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n }\n\n Example Responses:\n {\n \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n }\n {\n \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n }\n ";
+ output = _ref3.output,
+ scale = _ref3.scale;
+ return "Explain the irrelevancy score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Irrelevancy Reasons: " + reasons.join('\n') + "\n \n Rules:\n - Explain score based on mix of direct answers and related context\n - Consider both full and partial relevance\n - Keep explanation concise and focused\n - Use given score, don't recalculate\n - Don't judge factual correctness\n - Explain both relevant and irrelevant aspects\n - For mixed responses, explain the balance\n\n Format:\n {\n \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n }\n\n Example Responses:\n {\n \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n }\n {\n \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n }\n ";
  }
 
  var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -497,7 +502,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  return evaluate;
  }();
  _proto.getReason = /*#__PURE__*/function () {
- var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
+ var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
  var prompt, result;
  return _regeneratorRuntime().wrap(function _callee2$(_context2) {
  while (1) switch (_context2.prev = _context2.next) {
@@ -506,7 +511,8 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  input: input,
  output: actualOutput,
  reasons: reasons,
- score: score
+ score: score,
+ scale: scale
  });
  _context2.next = 3;
  return this.agent.generate(prompt, {
@@ -523,7 +529,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  }
  }, _callee2, this);
  }));
- function getReason(_x3, _x4, _x5, _x6) {
+ function getReason(_x3, _x4, _x5, _x6, _x7) {
  return _getReason.apply(this, arguments);
  }
  return getReason;
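The same signature change recurs for every judge in this diff: `getReason` gains a `scale` argument in the fourth position, which the metric passes as `this.scale`. A hedged TypeScript reconstruction of the pattern (the regenerator output above obscures it; parameter names come from the compiled code, and the prompt-builder declaration is a stand-in for the package's own):

```ts
// Stand-in for the package's prompt builder (see the diff above).
declare function generateReasonPrompt(args: {
  input: string; output: string; score: number; scale: number; reasons: string[];
}): string;

// Before: getReason(input, actualOutput, score, reasons).
// After: the metric's configured scale is threaded through so the reason
// prompt can state the real maximum instead of a hard-coded 0-10 range.
async function getReason(
  input: string,
  actualOutput: string,
  score: number,
  scale: number, // new fourth argument: highest possible score
  reasons: string[],
): Promise<string> {
  const prompt = generateReasonPrompt({ input, output: actualOutput, score, scale, reasons });
  // ...the real method sends `prompt` to the judge agent and parses the JSON reply
  return prompt;
}
```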
@@ -534,12 +540,11 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
  function AnswerRelevancyMetric(model, _temp) {
  var _this;
- var _ref = _temp === void 0 ? {
- uncertaintyWeight: 0.3,
- scale: 10
- } : _temp,
- uncertaintyWeight = _ref.uncertaintyWeight,
- scale = _ref.scale;
+ var _ref = _temp === void 0 ? {} : _temp,
+ _ref$uncertaintyWeigh = _ref.uncertaintyWeight,
+ uncertaintyWeight = _ref$uncertaintyWeigh === void 0 ? 0.3 : _ref$uncertaintyWeigh,
+ _ref$scale = _ref.scale,
+ scale = _ref$scale === void 0 ? 10 : _ref$scale;
  _this = _Metric.call(this) || this;
  _this.judge = void 0;
  _this.uncertaintyWeight = void 0;
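This transpiled constructor change is a real behavioral fix, not just churn: with a whole-object default, passing any options object discarded the defaults for every property you omitted, whereas per-property defaults survive partial objects. A hedged reconstruction of the TypeScript source (option names come from the compiled output; the model parameter type is a stand-in):

```ts
class AnswerRelevancyMetric {
  uncertaintyWeight: number;
  scale: number;

  // Before (roughly): (model, { uncertaintyWeight, scale } = { uncertaintyWeight: 0.3, scale: 10 }),
  // so `new AnswerRelevancyMetric(model, { scale: 5 })` left uncertaintyWeight undefined.
  // After: each option defaults independently.
  constructor(
    model: unknown, // stand-in for the package's model config type
    { uncertaintyWeight = 0.3, scale = 10 }: { uncertaintyWeight?: number; scale?: number } = {},
  ) {
    this.uncertaintyWeight = uncertaintyWeight;
    this.scale = scale; // passing { scale: 5 } now keeps uncertaintyWeight === 0.3
  }
}
```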
@@ -596,7 +601,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
  }
  }
  _context2.next = 4;
- return this.judge.getReason(input, output, score, reasonsForVerdicts);
+ return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
  case 4:
  reason = _context2.sent;
  return _context2.abrupt("return", reason);
@@ -626,7 +631,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
  }
  }
  var score = relevancyCount / numberOfVerdicts;
- return Math.round(score * this.scale);
+ return roundToTwoDecimals(score * this.scale);
  };
  return AnswerRelevancyMetric;
  }(core.Metric);
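The move from `Math.round` to `roundToTwoDecimals` means scores are no longer snapped to whole numbers, which changes results whenever the verdict ratio is not a clean fraction of the scale. A quick worked example, assuming 2 relevant verdicts out of 3 on the default scale of 10:

```ts
const score = 2 / 3;            // relevancyCount / numberOfVerdicts
Math.round(score * 10);         // 7    (old behavior: nearest integer)
roundToTwoDecimals(score * 10); // 6.67 (new behavior: two decimals)
```

The same rounding change is applied to the context-position, context-precision, and prompt-alignment scores further down in this diff.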
@@ -801,14 +806,15 @@ function generateEvaluatePrompt$2(_ref) {
  var input = _ref.input,
  output = _ref.output,
  context = _ref.context;
- return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n- Demonstrates or validates characteristics mentioned in the output\n- Shows real-world applications or effects of the concept\n- Reinforces or provides evidence for any part of the output\n- Helps establish credibility or understanding of the subject\n- Describes the actions the subject can perform\n\nA context piece should be considered relevant if it contributes ANY supporting information or evidence, even if indirect.\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n }\n ] \n}\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
+ return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n }\n ] \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
  }
  function generateReasonPrompt$2(_ref2) {
  var input = _ref2.input,
  output = _ref2.output,
  verdicts = _ref2.verdicts,
- score = _ref2.score;
- return "Given the input, output, verdicts, and position score, provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
+ score = _ref2.score,
+ scale = _ref2.scale;
+ return "Given the input, output, verdicts, and position score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
  }
 
  var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
@@ -852,7 +858,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  return evaluate;
  }();
  _proto.getReason = /*#__PURE__*/function () {
- var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
+ var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
  var prompt, result;
  return _regeneratorRuntime().wrap(function _callee2$(_context2) {
  while (1) switch (_context2.prev = _context2.next) {
@@ -861,7 +867,8 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  input: input,
  output: actualOutput,
  verdicts: verdicts,
- score: score
+ score: score,
+ scale: scale
  });
  _context2.next = 3;
  return this.agent.generate(prompt, {
@@ -878,7 +885,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  }
  }, _callee2, this);
  }));
- function getReason(_x4, _x5, _x6, _x7) {
+ function getReason(_x4, _x5, _x6, _x7, _x8) {
  return _getReason.apply(this, arguments);
  }
  return getReason;
@@ -914,7 +921,7 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
  verdicts = _context.sent;
  score = this.calculateScore(verdicts);
  _context.next = 7;
- return this.judge.getReason(input, output, score, verdicts);
+ return this.judge.getReason(input, output, score, this.scale, verdicts);
  case 7:
  reason = _context.sent;
  return _context.abrupt("return", {
@@ -956,24 +963,25 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
  }
  // Normalize against the maximum possible score
  var finalScore = weightedSum / maxPossibleSum * this.scale;
- return finalScore;
+ return roundToTwoDecimals(finalScore);
  };
  return ContextPositionMetric;
  }(core.Metric);
 
- var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider both direct and indirect relevance\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
+ var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider all forms of relevance:\n - Direct definitions or explanations\n - Supporting evidence or examples\n - Related characteristics or behaviors\n - Real-world applications or effects\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
  function generateEvaluatePrompt$1(_ref) {
  var input = _ref.input,
  output = _ref.output,
  context = _ref.context;
- return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n }\n ] \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
+ return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n }\n ] \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
  }
  function generateReasonPrompt$1(_ref2) {
  var input = _ref2.input,
  output = _ref2.output,
  verdicts = _ref2.verdicts,
- score = _ref2.score;
- return "Given the input, output, verdicts, and precision score, provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
+ score = _ref2.score,
+ scale = _ref2.scale;
+ return "Given the input, output, verdicts, and precision score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
  }
 
  var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
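Both rewritten evaluate prompts (context-position above and context-precision here) now tell the judge exactly how many verdicts to produce, with a guard that reports 1 when the context array is empty, presumably so the model returns a single verdict rather than an empty list. A minimal sketch of the pattern, assuming a `string[]` context:

```ts
function describeContext(context: string[]): string {
  // Empty context is reported as one piece so the judge still emits
  // one verdict (behavior inferred from the compiled prompt above).
  const count = context.length === 0 ? 1 : context.length;
  return `Number of context pieces: ${count}\n\nContext:\n${context}`;
}
```

Note that `${context}` on an array comma-joins its elements, matching the compiled `"\n\nContext:\n" + context` concatenation.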
@@ -1017,7 +1025,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  return evaluate;
  }();
  _proto.getReason = /*#__PURE__*/function () {
- var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
+ var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
  var prompt, result;
  return _regeneratorRuntime().wrap(function _callee2$(_context2) {
  while (1) switch (_context2.prev = _context2.next) {
@@ -1026,7 +1034,8 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  input: input,
  output: actualOutput,
  verdicts: verdicts,
- score: score
+ score: score,
+ scale: scale
  });
  _context2.next = 3;
  return this.agent.generate(prompt, {
@@ -1043,7 +1052,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  }
  }, _callee2, this);
  }));
- function getReason(_x4, _x5, _x6, _x7) {
+ function getReason(_x4, _x5, _x6, _x7, _x8) {
  return _getReason.apply(this, arguments);
  }
  return getReason;
@@ -1079,7 +1088,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
  verdicts = _context.sent;
  score = this.calculateScore(verdicts);
  _context.next = 7;
- return this.judge.getReason(input, output, score, verdicts);
+ return this.judge.getReason(input, output, score, this.scale, verdicts);
  case 7:
  reason = _context.sent;
  return _context.abrupt("return", {
@@ -1120,7 +1129,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
  return 0;
  }
  var finalScore = weightedPrecisionSum / relevantCount;
- return finalScore * this.scale;
+ return roundToTwoDecimals(finalScore * this.scale);
  };
  return ContextPrecisionMetric;
  }(core.Metric);
@@ -1237,19 +1246,20 @@ var KeywordCoverageMetric = /*#__PURE__*/function (_Metric) {
  return KeywordCoverageMetric;
  }(core.Metric);
 
- var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
+ var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n6. Judge each instruction independently. Only check if the current instruction is followed. Do not let instructions be influenced by other instructions.\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
  function generateEvaluatePrompt(_ref) {
  var instructions = _ref.instructions,
  input = _ref.input,
  output = _ref.output;
- return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"no\",\n \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n }\n ]\n}\n\nPrompt Instructions:\n" + instructions.join('\n') + "\n\nInput:\n" + input + "\n\nLLM Actual Output:\n" + output + "\n\nJSON:";
+ return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"no\",\n \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n }\n ]\n}\n\nExample 2:\nInput: \"describe the sky\"\nOutput: \"The sky is blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Talk about the color black\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The output starts with a capital letter\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The output does not talk about the color black\"\n }\n ]\n}\n\nNumber of instructions: " + instructions.length + "\n\nPrompt Instructions:\n" + instructions + "\n\nInput:\n" + input + "\n\nLLM Actual Output:\n" + output + "\n\nJSON:";
  }
  function generateReasonPrompt(_ref2) {
  var input = _ref2.input,
  output = _ref2.output,
  score = _ref2.score,
- reasons = _ref2.reasons;
- return "Explain the instruction following score (0-10) for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Failure Reasons: " + reasons.join('\n') + "\n\n Rules (follow these rules exactly. do not deviate):\n - Keep your response concise and to the point.\n - Do not change score from what is given.\n - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n - If there are failure reasons given, explain why the score is not higher.\n \n\n Output format:\n {\n \"reason\": \"The score is {score} because {explanation of instruction following}\"\n }\n \n Example Responses:\n {\n \"reason\": \"The score is 10 because the output follows the instructions exactly\"\n }\n {\n \"reason\": \"The score is 0 because the output does not follow the instructions\"\n }\n ";
+ reasons = _ref2.reasons,
+ scale = _ref2.scale;
+ return "Explain the instruction following score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Failure Reasons: " + reasons.join('\n') + "\n\n Rules (follow these rules exactly. do not deviate):\n - Keep your response concise and to the point.\n - Do not change score from what is given.\n - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n - If there are failure reasons given, explain why the score is not higher.\n \n\n Output format:\n {\n \"reason\": \"The score is {score} because {explanation of instruction following}\"\n }\n \n Example Responses:\n {\n \"reason\": \"The score is " + scale + " because the output follows the instructions exactly\"\n }\n {\n \"reason\": \"The score is 0 because the output does not follow the instructions\"\n }\n ";
  }
 
  var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
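Two details worth noting in the rewritten alignment prompts: the evaluate prompt now states `Number of instructions` and passes the array directly (comma-joined) instead of `instructions.join('\n')`, and the reason prompt's example response uses the configured `scale` rather than a literal 10. A hedged sketch of the source-level reason-prompt builder (names mirror the compiled `generateReasonPrompt` above; the template is abbreviated):

```ts
interface ReasonPromptArgs {
  input: string;
  output: string;
  score: number;
  reasons: string[];
  scale: number; // new: highest possible score for this metric instance
}

// Sketch only; the real template carries the full rule list shown in the diff.
function generateReasonPrompt({ input, output, score, reasons, scale }: ReasonPromptArgs): string {
  return `Explain the instruction following score where 0 is the lowest and ${scale} is the highest:
  Input: ${input}
  Output: ${output}
  Score: ${score}
  Failure Reasons: ${reasons.join('\n')}`;
}
```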
@@ -1293,7 +1303,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  return evaluate;
  }();
  _proto.getReason = /*#__PURE__*/function () {
- var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
+ var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
  var prompt, result;
  return _regeneratorRuntime().wrap(function _callee2$(_context2) {
  while (1) switch (_context2.prev = _context2.next) {
@@ -1302,7 +1312,8 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  input: input,
  output: actualOutput,
  reasons: reasons,
- score: score
+ score: score,
+ scale: scale
  });
  _context2.next = 3;
  return this.agent.generate(prompt, {
@@ -1319,7 +1330,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  }
  }, _callee2, this);
  }));
- function getReason(_x4, _x5, _x6, _x7) {
+ function getReason(_x4, _x5, _x6, _x7, _x8) {
  return _getReason.apply(this, arguments);
  }
  return getReason;
@@ -1389,7 +1400,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
  }
  }
  _context2.next = 4;
- return this.judge.getReason(input, output, score, reasonsForVerdicts);
+ return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
  case 4:
  reason = _context2.sent;
  return _context2.abrupt("return", reason);
@@ -1417,7 +1428,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
  }
  }
  var score = alignmentCount / numberOfVerdicts;
- return score * this.scale;
+ return roundToTwoDecimals(score * this.scale);
  };
  return PromptAlignmentMetric;
  }(core.Metric);