@mastra/evals 0.0.1-alpha.0 → 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/evals.cjs.development.js +52 -41
- package/dist/evals.cjs.development.js.map +1 -1
- package/dist/evals.cjs.production.min.js +1 -1
- package/dist/evals.cjs.production.min.js.map +1 -1
- package/dist/evals.esm.js +52 -41
- package/dist/evals.esm.js.map +1 -1
- package/dist/metrics/answer-relevancy/index.d.ts +5 -4
- package/dist/metrics/answer-relevancy/index.d.ts.map +1 -1
- package/dist/metrics/answer-relevancy/metricJudge.d.ts +1 -1
- package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -1
- package/dist/metrics/answer-relevancy/prompts.d.ts +3 -2
- package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -1
- package/dist/metrics/context-position/index.d.ts.map +1 -1
- package/dist/metrics/context-position/metricJudge.d.ts +1 -1
- package/dist/metrics/context-position/metricJudge.d.ts.map +1 -1
- package/dist/metrics/context-position/prompts.d.ts +2 -1
- package/dist/metrics/context-position/prompts.d.ts.map +1 -1
- package/dist/metrics/context-precision/index.d.ts.map +1 -1
- package/dist/metrics/context-precision/metricJudge.d.ts +1 -1
- package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -1
- package/dist/metrics/context-precision/prompts.d.ts +3 -2
- package/dist/metrics/context-precision/prompts.d.ts.map +1 -1
- package/dist/metrics/prompt-alignment/index.d.ts.map +1 -1
- package/dist/metrics/prompt-alignment/metricJudge.d.ts +1 -1
- package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -1
- package/dist/metrics/prompt-alignment/prompts.d.ts +3 -2
- package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -1
- package/dist/utils.d.ts +2 -0
- package/dist/utils.d.ts.map +1 -0
- package/package.json +5 -2
- package/src/evaluation.test.ts +2 -2
- package/src/metrics/answer-relevancy/index.test.ts +12 -22
- package/src/metrics/answer-relevancy/index.ts +10 -15
- package/src/metrics/answer-relevancy/metricJudge.ts +8 -2
- package/src/metrics/answer-relevancy/prompts.ts +56 -22
- package/src/metrics/context-position/index.ts +4 -2
- package/src/metrics/context-position/metricJudge.ts +2 -1
- package/src/metrics/context-position/prompts.ts +31 -14
- package/src/metrics/context-precision/index.test.ts +1 -1
- package/src/metrics/context-precision/index.ts +4 -2
- package/src/metrics/context-precision/metricJudge.ts +2 -1
- package/src/metrics/context-precision/prompts.ts +33 -5
- package/src/metrics/prompt-alignment/index.test.ts +137 -20
- package/src/metrics/prompt-alignment/index.ts +4 -2
- package/src/metrics/prompt-alignment/metricJudge.ts +8 -2
- package/src/metrics/prompt-alignment/prompts.ts +26 -3
- package/src/utils.ts +3 -0
package/CHANGELOG.md
CHANGED
package/dist/evals.cjs.development.js
CHANGED
|
@@ -419,6 +419,10 @@ function _evaluate() {
|
|
|
419
419
|
return _evaluate.apply(this, arguments);
|
|
420
420
|
}
|
|
421
421
|
|
|
422
|
+
var roundToTwoDecimals = function roundToTwoDecimals(num) {
|
|
423
|
+
return Math.round((num + Number.EPSILON) * 100) / 100;
|
|
424
|
+
};
|
|
425
|
+
|
|
422
426
|
var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
|
|
423
427
|
this.agent = void 0;
|
|
424
428
|
this.agent = new core.Agent({
|
|
@@ -428,7 +432,7 @@ var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
|
|
|
428
432
|
});
|
|
429
433
|
};
|
|
430
434
|
|
|
431
|
-
var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"";
|
|
435
|
+
var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"\n6. Responses that discuss the type of information being asked show partial relevance";
|
|
432
436
|
function generateEvaluationStatementsPrompt(_ref) {
|
|
433
437
|
var output = _ref.output;
|
|
434
438
|
return "Given the text, break it down into meaningful statements while preserving context and relationships.\nDon't split too aggressively.\n\nSplit compound statements particularly when they:\n- Are joined by \"and\"\n- Contain multiple distinct facts or claims\n- Have multiple descriptive elements about the subject\n\n\nHandle special cases:\n- A single word answer should be treated as a complete statement\n- Error messages should be treated as a single statement\n- Empty strings should return an empty list\n- When splitting text, keep related information together\n\nExample:\nExample text: Look! A bird! Birds are an interesting animal.\n\n{{\n \"statements\": [\"Look!\", \"A bird!\", \"Birds are interesting animals.\"]\n}}\n\nPlease return only JSON format with \"statements\" array.\nReturn empty list for empty input.\n\nText:\n" + output + "\n\nJSON:\n";
|
|
@@ -436,14 +440,15 @@ function generateEvaluationStatementsPrompt(_ref) {
|
|
|
436
440
|
function generateEvaluatePrompt$3(_ref2) {
|
|
437
441
|
var input = _ref2.input,
|
|
438
442
|
statements = _ref2.statements;
|
|
439
|
-
return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n Return JSON with array of verdict objects. Each verdict must include:\n - \"verdict\": \"yes\", \"no\", or \"unsure\"\n - \"reason\": Clear explanation of the verdict\n - Exact match between number of verdicts and statements\n\n Verdict Guidelines:\n - \"yes\": Statement explicitly and directly answers the input question
|
|
443
|
+
return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n Return JSON with array of verdict objects. Each verdict must include:\n - \"verdict\": \"yes\", \"no\", or \"unsure\"\n - \"reason\": Clear explanation of the verdict\n - Exact match between number of verdicts and statements\n\n Verdict Guidelines:\n - \"yes\": Statement explicitly and directly answers the input question when it:\n * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n * Can stand alone as a complete answer\n * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n * Note: If statement is incorrect but directly addresses the question, mark as \"unsure\"\n\n - \"unsure\": Statement shows partial relevance when it:\n * Discusses the type of information being asked about (e.g., mentions temperatures when asked about temperature)\n * Contains information about the answer without explicit statement\n * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n * Includes indirect references to the answer (e.g., \"where the president works\")\n * Contains topic-related administrative/governance terms without direct answer\n * References functions or characteristics typically associated with the answer\n * Uses terms that match what's being asked about\n * Mentions related entities without specifying their relationship to the answer\n * Is incorrect but shows understanding of the question\n * Contains the answer term but needs more context to be complete\n * Contains measurement units or quantities relevant to the question type\n * References locations or entities in the same category as what's being asked about\n * Provides relevant information without using explicit question-type terminology\n * Contains references to 
properties of the subject\n\n\n - \"no\": Statement lacks meaningful connection to question when it:\n * Contains neither the subject nor the type of information being requested\n * Contains no terms related to what's being asked about\n * Contains only general subject information without relating to what's being asked\n * Consists of empty or meaningless content\n * Contains purely tangential information with no mention of the subject or question type\n * Note: Assessment is about connection to what's being asked, not factual accuracy\n * Contains no connection to what's being asked about (neither the subject nor the type of information requested)\n\n REMEMBER: \n - If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n - If the statement is a direct answer to the input, it is relevant.\n - If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n - DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n STRICT RULES:\n - If a statement mentions the type of information being requested, it should be marked as \"unsure\" ONLY if it's discussing that type meaningfully (not just mentioning it)\n - Subject mentions alone are NOT enough for relevance - they must connect to what's being asked about\n - Empty or meaningless statements are always \"no\"\n - General facts about the subject without connection to the question type should be marked as \"no\"\n - ALWAYS mark a statement as \"no\" if it discusses the topic without any connection to the question type\n - Statements that mention neither the subject nor the type of information are always \"no\"\n - Type-level relevance overrides topic-only content\n - Measurement/quantity relevance counts as type-level relevance\n - Administrative/governance terms are only relevant if they relate to the question type\n\n\n Examples of \"no\" statements:\n * \"Japan has beautiful seasons\" for \"What is Japan's largest 
city?\"\n * \"Trees grow tall\" for \"How tall is Mount Everest?\"\n * \"The weather is nice\" for \"Who is the president?\"\n\n Example:\n Input: \"What color is the sky during daytime?\"\n Statements: [\n \"The sky is blue during daytime\",\n \"The sky is full of clouds\", \n \"I had breakfast today\",\n \"Blue is a beautiful color\",\n \"Many birds fly in the sky\",\n \"\",\n \"The sky is purple during daytime\",\n \"Daytime is when the sun is up\",\n ]\n JSON:\n {{\n \"verdicts\": [\n {{\n \"verdict\": \"yes\",\n \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement describes the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is about the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is empty\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is incorrect but contains relevant information and still addresses the question\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is about daytime but doesn't address the sky\"\n }}\n ]\n }}\n\n Input:\n " + input + "\n\n Number of statements: " + (statements.length === 0 ? '1' : statements.length) + "\n\n Statements:\n " + statements + "\n\n JSON:\n ";
|
|
440
444
|
}
|
|
441
445
|
function generateReasonPrompt$3(_ref3) {
|
|
442
446
|
var score = _ref3.score,
|
|
443
447
|
reasons = _ref3.reasons,
|
|
444
448
|
input = _ref3.input,
|
|
445
|
-
output = _ref3.output
|
|
446
|
-
|
|
449
|
+
output = _ref3.output,
|
|
450
|
+
scale = _ref3.scale;
|
|
451
|
+
return "Explain the irrelevancy score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Irrelevancy Reasons: " + reasons.join('\n') + "\n \n Rules:\n - Explain score based on mix of direct answers and related context\n - Consider both full and partial relevance\n - Keep explanation concise and focused\n - Use given score, don't recalculate\n - Don't judge factual correctness\n - Explain both relevant and irrelevant aspects\n - For mixed responses, explain the balance\n\n Format:\n {\n \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n }\n\n Example Responses:\n {\n \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n }\n {\n \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n }\n ";
|
|
447
452
|
}
|
|
448
453
|
|
|
449
454
|
var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
@@ -497,7 +502,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
497
502
|
return evaluate;
|
|
498
503
|
}();
|
|
499
504
|
_proto.getReason = /*#__PURE__*/function () {
|
|
500
|
-
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
|
|
505
|
+
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
|
|
501
506
|
var prompt, result;
|
|
502
507
|
return _regeneratorRuntime().wrap(function _callee2$(_context2) {
|
|
503
508
|
while (1) switch (_context2.prev = _context2.next) {
|
|
@@ -506,7 +511,8 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
506
511
|
input: input,
|
|
507
512
|
output: actualOutput,
|
|
508
513
|
reasons: reasons,
|
|
509
|
-
score: score
|
|
514
|
+
score: score,
|
|
515
|
+
scale: scale
|
|
510
516
|
});
|
|
511
517
|
_context2.next = 3;
|
|
512
518
|
return this.agent.generate(prompt, {
|
|
@@ -523,7 +529,7 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
523
529
|
}
|
|
524
530
|
}, _callee2, this);
|
|
525
531
|
}));
|
|
526
|
-
function getReason(_x3, _x4, _x5, _x6) {
|
|
532
|
+
function getReason(_x3, _x4, _x5, _x6, _x7) {
|
|
527
533
|
return _getReason.apply(this, arguments);
|
|
528
534
|
}
|
|
529
535
|
return getReason;
|
|
@@ -534,12 +540,11 @@ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
534
540
|
var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
|
|
535
541
|
function AnswerRelevancyMetric(model, _temp) {
|
|
536
542
|
var _this;
|
|
537
|
-
var _ref = _temp === void 0 ? {
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
scale = _ref.scale;
|
|
543
|
+
var _ref = _temp === void 0 ? {} : _temp,
|
|
544
|
+
_ref$uncertaintyWeigh = _ref.uncertaintyWeight,
|
|
545
|
+
uncertaintyWeight = _ref$uncertaintyWeigh === void 0 ? 0.3 : _ref$uncertaintyWeigh,
|
|
546
|
+
_ref$scale = _ref.scale,
|
|
547
|
+
scale = _ref$scale === void 0 ? 10 : _ref$scale;
|
|
543
548
|
_this = _Metric.call(this) || this;
|
|
544
549
|
_this.judge = void 0;
|
|
545
550
|
_this.uncertaintyWeight = void 0;
|
|
@@ -596,7 +601,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
|
|
|
596
601
|
}
|
|
597
602
|
}
|
|
598
603
|
_context2.next = 4;
|
|
599
|
-
return this.judge.getReason(input, output, score, reasonsForVerdicts);
|
|
604
|
+
return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
|
|
600
605
|
case 4:
|
|
601
606
|
reason = _context2.sent;
|
|
602
607
|
return _context2.abrupt("return", reason);
|
|
@@ -626,7 +631,7 @@ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
|
|
|
626
631
|
}
|
|
627
632
|
}
|
|
628
633
|
var score = relevancyCount / numberOfVerdicts;
|
|
629
|
-
return
|
|
634
|
+
return roundToTwoDecimals(score * this.scale);
|
|
630
635
|
};
|
|
631
636
|
return AnswerRelevancyMetric;
|
|
632
637
|
}(core.Metric);
|
|
@@ -801,14 +806,15 @@ function generateEvaluatePrompt$2(_ref) {
|
|
|
801
806
|
var input = _ref.input,
|
|
802
807
|
output = _ref.output,
|
|
803
808
|
context = _ref.context;
|
|
804
|
-
return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input\n-
|
|
809
|
+
return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n }\n ] \n}\n\nConsider 
context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
|
|
805
810
|
}
|
|
806
811
|
function generateReasonPrompt$2(_ref2) {
|
|
807
812
|
var input = _ref2.input,
|
|
808
813
|
output = _ref2.output,
|
|
809
814
|
verdicts = _ref2.verdicts,
|
|
810
|
-
score = _ref2.score
|
|
811
|
-
|
|
815
|
+
score = _ref2.score,
|
|
816
|
+
scale = _ref2.scale;
|
|
817
|
+
return "Given the input, output, verdicts, and position score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
|
|
812
818
|
}
|
|
813
819
|
|
|
814
820
|
var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
@@ -852,7 +858,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
852
858
|
return evaluate;
|
|
853
859
|
}();
|
|
854
860
|
_proto.getReason = /*#__PURE__*/function () {
|
|
855
|
-
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
|
|
861
|
+
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
|
|
856
862
|
var prompt, result;
|
|
857
863
|
return _regeneratorRuntime().wrap(function _callee2$(_context2) {
|
|
858
864
|
while (1) switch (_context2.prev = _context2.next) {
|
|
@@ -861,7 +867,8 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
861
867
|
input: input,
|
|
862
868
|
output: actualOutput,
|
|
863
869
|
verdicts: verdicts,
|
|
864
|
-
score: score
|
|
870
|
+
score: score,
|
|
871
|
+
scale: scale
|
|
865
872
|
});
|
|
866
873
|
_context2.next = 3;
|
|
867
874
|
return this.agent.generate(prompt, {
|
|
@@ -878,7 +885,7 @@ var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
878
885
|
}
|
|
879
886
|
}, _callee2, this);
|
|
880
887
|
}));
|
|
881
|
-
function getReason(_x4, _x5, _x6, _x7) {
|
|
888
|
+
function getReason(_x4, _x5, _x6, _x7, _x8) {
|
|
882
889
|
return _getReason.apply(this, arguments);
|
|
883
890
|
}
|
|
884
891
|
return getReason;
|
|
@@ -914,7 +921,7 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
|
|
|
914
921
|
verdicts = _context.sent;
|
|
915
922
|
score = this.calculateScore(verdicts);
|
|
916
923
|
_context.next = 7;
|
|
917
|
-
return this.judge.getReason(input, output, score, verdicts);
|
|
924
|
+
return this.judge.getReason(input, output, score, this.scale, verdicts);
|
|
918
925
|
case 7:
|
|
919
926
|
reason = _context.sent;
|
|
920
927
|
return _context.abrupt("return", {
|
|
@@ -956,24 +963,25 @@ var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
|
|
|
956
963
|
}
|
|
957
964
|
// Normalize against the maximum possible score
|
|
958
965
|
var finalScore = weightedSum / maxPossibleSum * this.scale;
|
|
959
|
-
return finalScore;
|
|
966
|
+
return roundToTwoDecimals(finalScore);
|
|
960
967
|
};
|
|
961
968
|
return ContextPositionMetric;
|
|
962
969
|
}(core.Metric);
|
|
963
970
|
|
|
964
|
-
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider
|
|
971
|
+
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider all forms of relevance:\n - Direct definitions or explanations\n - Supporting evidence or examples\n - Related characteristics or behaviors\n - Real-world applications or effects\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
|
|
965
972
|
function generateEvaluatePrompt$1(_ref) {
|
|
966
973
|
var input = _ref.input,
|
|
967
974
|
output = _ref.output,
|
|
968
975
|
context = _ref.context;
|
|
969
|
-
return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n }\n ] \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
|
|
976
|
+
return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nCRITICAL: Context should be marked as relevant if it:\n1. Directly helps define or explain the subject\n2. Demonstrates properties or behaviors mentioned in the output\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\", \"The Sun gives light to planets\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input question\n- Demonstrates properties mentioned in the output\n- Provides examples that validate the output\n- Contains information that helps define the subject\n\nMark as not relevant if the information:\n- Only describes other objects' behaviors\n- Has no connection to properties mentioned in output\n- Is completely unrelated to the subject\n- Contradicts the output\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is or how it produces light, as it only describes how another object interacts with sunlight.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun gives light to planets' demonstrates the light-producing property mentioned in the output.\"\n }\n ] \n}\n\nConsider 
context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nNumber of context pieces: " + (context.length === 0 ? '1' : context.length) + "\n\nContext:\n" + context + "\n\nJSON:\n";
|
|
970
977
|
}
|
|
971
978
|
function generateReasonPrompt$1(_ref2) {
|
|
972
979
|
var input = _ref2.input,
|
|
973
980
|
output = _ref2.output,
|
|
974
981
|
verdicts = _ref2.verdicts,
|
|
975
|
-
score = _ref2.score
|
|
976
|
-
|
|
982
|
+
score = _ref2.score,
|
|
983
|
+
scale = _ref2.scale;
|
|
984
|
+
return "Given the input, output, verdicts, and precision score, and the highest possible score is " + scale + ", provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (" + scale + ".0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
|
|
977
985
|
}
|
|
978
986
|
|
|
979
987
|
var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
@@ -1017,7 +1025,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1017
1025
|
return evaluate;
|
|
1018
1026
|
}();
|
|
1019
1027
|
_proto.getReason = /*#__PURE__*/function () {
|
|
1020
|
-
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
|
|
1028
|
+
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, verdicts) {
|
|
1021
1029
|
var prompt, result;
|
|
1022
1030
|
return _regeneratorRuntime().wrap(function _callee2$(_context2) {
|
|
1023
1031
|
while (1) switch (_context2.prev = _context2.next) {
|
|
@@ -1026,7 +1034,8 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1026
1034
|
input: input,
|
|
1027
1035
|
output: actualOutput,
|
|
1028
1036
|
verdicts: verdicts,
|
|
1029
|
-
score: score
|
|
1037
|
+
score: score,
|
|
1038
|
+
scale: scale
|
|
1030
1039
|
});
|
|
1031
1040
|
_context2.next = 3;
|
|
1032
1041
|
return this.agent.generate(prompt, {
|
|
@@ -1043,7 +1052,7 @@ var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1043
1052
|
}
|
|
1044
1053
|
}, _callee2, this);
|
|
1045
1054
|
}));
|
|
1046
|
-
function getReason(_x4, _x5, _x6, _x7) {
|
|
1055
|
+
function getReason(_x4, _x5, _x6, _x7, _x8) {
|
|
1047
1056
|
return _getReason.apply(this, arguments);
|
|
1048
1057
|
}
|
|
1049
1058
|
return getReason;
|
|
@@ -1079,7 +1088,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
|
|
|
1079
1088
|
verdicts = _context.sent;
|
|
1080
1089
|
score = this.calculateScore(verdicts);
|
|
1081
1090
|
_context.next = 7;
|
|
1082
|
-
return this.judge.getReason(input, output, score, verdicts);
|
|
1091
|
+
return this.judge.getReason(input, output, score, this.scale, verdicts);
|
|
1083
1092
|
case 7:
|
|
1084
1093
|
reason = _context.sent;
|
|
1085
1094
|
return _context.abrupt("return", {
|
|
@@ -1120,7 +1129,7 @@ var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
|
|
|
1120
1129
|
return 0;
|
|
1121
1130
|
}
|
|
1122
1131
|
var finalScore = weightedPrecisionSum / relevantCount;
|
|
1123
|
-
return finalScore * this.scale;
|
|
1132
|
+
return roundToTwoDecimals(finalScore * this.scale);
|
|
1124
1133
|
};
|
|
1125
1134
|
return ContextPrecisionMetric;
|
|
1126
1135
|
}(core.Metric);
|
|
@@ -1237,19 +1246,20 @@ var KeywordCoverageMetric = /*#__PURE__*/function (_Metric) {
|
|
|
1237
1246
|
return KeywordCoverageMetric;
|
|
1238
1247
|
}(core.Metric);
|
|
1239
1248
|
|
|
1240
|
-
var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
|
|
1249
|
+
var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n6. Judge each instruction independently. Only check if the current instruction is followed. Do not let instructions be influenced by other instructions.\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
|
|
1241
1250
|
function generateEvaluatePrompt(_ref) {
|
|
1242
1251
|
var instructions = _ref.instructions,
|
|
1243
1252
|
input = _ref.input,
|
|
1244
1253
|
output = _ref.output;
|
|
1245
|
-
return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"no\",\n \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n }\n ]\n}\n\
|
|
1254
|
+
return "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"no\",\n \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n }\n ]\n}\n\nExample 2:\nInput: \"describe the sky\"\nOutput: \"The sky is blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Talk about the color black\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The output starts with a capital letter\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The output does not talk about the color black\"\n }\n ]\n}\n\nNumber of instructions: " + instructions.length + "\n\nPrompt Instructions:\n" + instructions + "\n\nInput:\n" + input + "\n\nLLM Actual Output:\n" + output + "\n\nJSON:";
|
|
1246
1255
|
}
|
|
1247
1256
|
function generateReasonPrompt(_ref2) {
|
|
1248
1257
|
var input = _ref2.input,
|
|
1249
1258
|
output = _ref2.output,
|
|
1250
1259
|
score = _ref2.score,
|
|
1251
|
-
reasons = _ref2.reasons
|
|
1252
|
-
|
|
1260
|
+
reasons = _ref2.reasons,
|
|
1261
|
+
scale = _ref2.scale;
|
|
1262
|
+
return "Explain the instruction following score where 0 is the lowest and " + scale + " is the highest for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Failure Reasons: " + reasons.join('\n') + "\n\n Rules (follow these rules exactly. do not deviate):\n - Keep your response concise and to the point.\n - Do not change score from what is given.\n - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n - If there are failure reasons given, explain why the score is not higher.\n \n\n Output format:\n {\n \"reason\": \"The score is {score} because {explanation of instruction following}\"\n }\n \n Example Responses:\n {\n \"reason\": \"The score is " + scale + " because the output follows the instructions exactly\"\n }\n {\n \"reason\": \"The score is 0 because the output does not follow the instructions\"\n }\n ";
|
|
1253
1263
|
}
|
|
1254
1264
|
|
|
1255
1265
|
var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
@@ -1293,7 +1303,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1293
1303
|
return evaluate;
|
|
1294
1304
|
}();
|
|
1295
1305
|
_proto.getReason = /*#__PURE__*/function () {
|
|
1296
|
-
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
|
|
1306
|
+
var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, scale, reasons) {
|
|
1297
1307
|
var prompt, result;
|
|
1298
1308
|
return _regeneratorRuntime().wrap(function _callee2$(_context2) {
|
|
1299
1309
|
while (1) switch (_context2.prev = _context2.next) {
|
|
@@ -1302,7 +1312,8 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1302
1312
|
input: input,
|
|
1303
1313
|
output: actualOutput,
|
|
1304
1314
|
reasons: reasons,
|
|
1305
|
-
score: score
|
|
1315
|
+
score: score,
|
|
1316
|
+
scale: scale
|
|
1306
1317
|
});
|
|
1307
1318
|
_context2.next = 3;
|
|
1308
1319
|
return this.agent.generate(prompt, {
|
|
@@ -1319,7 +1330,7 @@ var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
|
|
|
1319
1330
|
}
|
|
1320
1331
|
}, _callee2, this);
|
|
1321
1332
|
}));
|
|
1322
|
-
function getReason(_x4, _x5, _x6, _x7) {
|
|
1333
|
+
function getReason(_x4, _x5, _x6, _x7, _x8) {
|
|
1323
1334
|
return _getReason.apply(this, arguments);
|
|
1324
1335
|
}
|
|
1325
1336
|
return getReason;
|
|
@@ -1389,7 +1400,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
|
|
|
1389
1400
|
}
|
|
1390
1401
|
}
|
|
1391
1402
|
_context2.next = 4;
|
|
1392
|
-
return this.judge.getReason(input, output, score, reasonsForVerdicts);
|
|
1403
|
+
return this.judge.getReason(input, output, score, this.scale, reasonsForVerdicts);
|
|
1393
1404
|
case 4:
|
|
1394
1405
|
reason = _context2.sent;
|
|
1395
1406
|
return _context2.abrupt("return", reason);
|
|
@@ -1417,7 +1428,7 @@ var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
|
|
|
1417
1428
|
}
|
|
1418
1429
|
}
|
|
1419
1430
|
var score = alignmentCount / numberOfVerdicts;
|
|
1420
|
-
return score * this.scale;
|
|
1431
|
+
return roundToTwoDecimals(score * this.scale);
|
|
1421
1432
|
};
|
|
1422
1433
|
return PromptAlignmentMetric;
|
|
1423
1434
|
}(core.Metric);
|