@mastra/evals 0.12.1 → 0.13.0-alpha.1

Files changed (35)
  1. package/dist/chunk-5CVZXIFW.js +36 -0
  2. package/dist/chunk-5CVZXIFW.js.map +1 -0
  3. package/dist/chunk-QVZBKGOE.cjs +41 -0
  4. package/dist/chunk-QVZBKGOE.cjs.map +1 -0
  5. package/dist/{dist-BODKWAXM.cjs → dist-JQCAD3AD.cjs} +9 -9
  6. package/dist/{dist-BODKWAXM.cjs.map → dist-JQCAD3AD.cjs.map} +1 -1
  7. package/dist/{dist-JRG62SVA.js → dist-JVIEAZJ6.js} +9 -9
  8. package/dist/{dist-JRG62SVA.js.map → dist-JVIEAZJ6.js.map} +1 -1
  9. package/dist/evaluation.d.ts +1 -1
  10. package/dist/evaluation.d.ts.map +1 -1
  11. package/dist/index.cjs +3 -3
  12. package/dist/index.cjs.map +1 -1
  13. package/dist/index.js +2 -2
  14. package/dist/index.js.map +1 -1
  15. package/dist/scorers/code/index.cjs +85 -0
  16. package/dist/scorers/code/index.cjs.map +1 -1
  17. package/dist/scorers/code/index.d.ts +1 -0
  18. package/dist/scorers/code/index.d.ts.map +1 -1
  19. package/dist/scorers/code/index.js +85 -1
  20. package/dist/scorers/code/index.js.map +1 -1
  21. package/dist/scorers/code/tool-call-accuracy/index.d.ts +18 -0
  22. package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -0
  23. package/dist/scorers/llm/index.cjs +184 -28
  24. package/dist/scorers/llm/index.cjs.map +1 -1
  25. package/dist/scorers/llm/index.d.ts +1 -0
  26. package/dist/scorers/llm/index.d.ts.map +1 -1
  27. package/dist/scorers/llm/index.js +170 -15
  28. package/dist/scorers/llm/index.js.map +1 -1
  29. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +22 -0
  30. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -0
  31. package/dist/scorers/llm/tool-call-accuracy/prompts.d.ts +19 -0
  32. package/dist/scorers/llm/tool-call-accuracy/prompts.d.ts.map +1 -0
  33. package/dist/scorers/utils.d.ts +12 -0
  34. package/dist/scorers/utils.d.ts.map +1 -1
  35. package/package.json +11 -10
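
Summary of the changes below: the helper functions previously bundled into each scorer file (roundToTwoDecimals2, getUserMessageFromRunInput, getAssistantMessageFromRunOutput) were hoisted into a new shared chunk (chunk-QVZBKGOE.cjs for CJS, chunk-5CVZXIFW.js for ESM), and a tool-call accuracy scorer was added in two variants: a code-based one under scorers/code/tool-call-accuracy and an LLM-judged one under scorers/llm/tool-call-accuracy. Reconstructed from the removed inline helpers and the call sites in the diff, the shared chunk's surface looks roughly like this sketch (extractToolCalls is inferred from its single call site; its implementation and exact return shape are not visible in this diff):

// Sketch of the new shared helper chunk, reconstructed from this diff.
// The three helpers are copied from the inline versions removed below;
// extractToolCalls is an inference from how it is called in the new scorer.
var roundToTwoDecimals = (num) => {
  return Math.round((num + Number.EPSILON) * 100) / 100;
};
var getUserMessageFromRunInput = (input) => {
  return input?.inputMessages.find(({ role }) => role === "user")?.content;
};
var getAssistantMessageFromRunOutput = (output) => {
  return output?.find(({ role }) => role === "assistant")?.content;
};
// Used as: const { tools, toolCallInfos } = extractToolCalls(run.output);
// so it presumably collects tool names and call details from the
// assistant messages' toolInvocations.

exports.roundToTwoDecimals = roundToTwoDecimals;
exports.getUserMessageFromRunInput = getUserMessageFromRunInput;
exports.getAssistantMessageFromRunOutput = getAssistantMessageFromRunOutput;

Centralizing these helpers lets the existing scorers and the new tool-call scorers share one implementation instead of each bundle carrying its own copy, which is why the bundler-renamed local roundToTwoDecimals2 disappears.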
@@ -1,19 +1,10 @@
  'use strict';

  var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
+ var chunkQVZBKGOE_cjs = require('../../chunk-QVZBKGOE.cjs');
  var scores = require('@mastra/core/scores');
  var zod = require('zod');

- var roundToTwoDecimals2 = (num) => {
- return Math.round((num + Number.EPSILON) * 100) / 100;
- };
- var getUserMessageFromRunInput = (input) => {
- return input?.inputMessages.find(({ role }) => role === "user")?.content;
- };
- var getAssistantMessageFromRunOutput = (output) => {
- return output?.find(({ role }) => role === "assistant")?.content;
- };
-
  // src/scorers/llm/answer-relevancy/prompts.ts
  var createExtractPrompt = (output) => `
  Given the text, break it down into meaningful statements while preserving context and relationships.
@@ -236,14 +227,14 @@ function createAnswerRelevancyScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: extractOutputSchema,
  createPrompt: ({ run }) => {
- const assistantMessage = getAssistantMessageFromRunOutput(run.output) ?? "";
+ const assistantMessage = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  return createExtractPrompt(assistantMessage);
  }
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
- const input = getUserMessageFromRunInput(run.input) ?? "";
+ const input = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
  }
  }).generateScore(({ results }) => {
@@ -265,8 +256,8 @@ function createAnswerRelevancyScorer({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  return createReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  score,
  results: results.analyzeStepResult.results,
  scale: options.scale
@@ -444,7 +435,7 @@ function createFaithfulnessScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: zod.z.array(zod.z.string()),
  createPrompt: ({ run }) => {
- const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -465,13 +456,13 @@ function createFaithfulnessScorer({
  return 0;
  }
  const score = supportedClaims / totalClaims * (options?.scale || 1);
- return roundToTwoDecimals2(score);
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createFaithfulnessReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
  score,
  scale: options?.scale || 1,
@@ -602,13 +593,13 @@ function createBiasScorer({ model, options }) {
  outputSchema: zod.z.object({
  opinions: zod.z.array(zod.z.string())
  }),
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
  const prompt = createBiasAnalyzePrompt({
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  opinions: results.preprocessStepResult?.opinions || []
  });
  return prompt;
@@ -619,7 +610,7 @@ function createBiasScorer({ model, options }) {
  }
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
- return roundToTwoDecimals2(score * (options?.scale || 1));
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ score, results }) => {
@@ -836,7 +827,7 @@ function createHallucinationScorer({
  claims: zod.z.array(zod.z.string())
  }),
  createPrompt: ({ run }) => {
- const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createHallucinationExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -858,13 +849,13 @@ function createHallucinationScorer({
  return 0;
  }
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
- return roundToTwoDecimals2(score);
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createHallucinationReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: options?.context || [],
  score,
  scale: options?.scale || 1,
@@ -973,8 +964,8 @@ function createToxicityScorer({ model, options }) {
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run }) => {
  const prompt = createToxicityAnalyzePrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? ""
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
  });
  return prompt;
  }
@@ -990,7 +981,7 @@ function createToxicityScorer({ model, options }) {
  }
  }
  const score = toxicityCount / numberOfVerdicts;
- return roundToTwoDecimals2(score * (options?.scale || 1));
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ results, score }) => {
@@ -1003,12 +994,177 @@ function createToxicityScorer({ model, options }) {
  });
  }

+ // src/scorers/llm/tool-call-accuracy/prompts.ts
+ var TOOL_SELECTION_ACCURACY_INSTRUCTIONS = `
+ You are an expert evaluator specializing in AI agent tool selection analysis. Your role is to assess whether an agent chose appropriate tools based on explicit user requests.
+
+ CORE RESPONSIBILITIES:
+ - Analyze user requests to understand what was explicitly asked for
+ - Evaluate each tool call against the specific user need
+ - Identify missing tools that should have been used
+ - Apply strict evaluation criteria focused on direct relevance
+
+ EVALUATION PHILOSOPHY:
+ - Be precise and literal in your assessments
+ - Only approve tools that directly address the user's explicit request
+ - Distinguish between "helpful" and "appropriate" - reject tools that are merely helpful but not requested
+ - Consider context but prioritize what was actually asked for
+
+ OUTPUT REQUIREMENTS:
+ - Provide clear, specific reasoning for each evaluation
+ - Use provided JSON schema exactly as specified
+ - Be consistent in your evaluation standards
+ - Focus on actionable insights
+
+ You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
+ `;
+ var createAnalyzePrompt = ({
+ userInput,
+ agentResponse,
+ toolsCalled,
+ availableTools
+ }) => {
+ return `
+ You are evaluating whether an AI agent made appropriate tool choices for a user request.
+
+ USER REQUEST: "${userInput}"
+ AGENT RESPONSE: "${agentResponse}"
+ TOOLS THE AGENT ACTUALLY CALLED: ${toolsCalled.length > 0 ? toolsCalled.join(", ") : "None"}
+
+ TOOL REFERENCE:
+ ${availableTools}
+
+ EVALUATION RULES:
+ 1. If NO tools were called: evaluate BOTH the user request AND agent response:
+ - Did the user make a specific, actionable request?
+ - Did the agent appropriately ask for clarification when details were insufficient?
+ - Would calling a tool without the requested clarification provide poor results?
+ 2. If tools WERE called: evaluate if each tool was appropriate for the EXPLICIT user request
+
+ AGENT RESPONSE EVALUATION:
+ When no tools are called, consider if the agent's response demonstrates good judgment:
+ - Asking follow-up questions for vague requests = APPROPRIATE (missingTools should be empty)
+ - Providing generic answers without using available tools = INAPPROPRIATE
+ - Ignoring clear, specific requests = INAPPROPRIATE
+
+ CLARIFICATION EXAMPLES:
+ User: "I'm looking for a firm" + Agent asks about practice area/location = APPROPRIATE clarification
+ User: "help with legal stuff" + Agent asks for specifics = APPROPRIATE clarification
+ User: "Create RFP for corporate litigation in NY" + Agent asks for more details = INAPPROPRIATE delay
+ User: "I need pricing for litigation" + Agent gives generic answer = MISSED tool opportunity
+
+ EVALUATION QUESTION:
+ Did the agent make the right choice between:
+ 1. Acting immediately with available tools, OR
+ 2. Gathering more information for better results?
+
+ Consider: Would you rather get generic firm recommendations or have the agent ask clarifying questions first?
+
+ STRICT EVALUATION CRITERIA:
+ - Only mark tools as appropriate if they DIRECTLY address what the user explicitly asked for
+ - Do NOT mark tools as appropriate just because they might be "helpful" or "related" to the domain
+ - If the user asked for "A", only tools that provide "A" should be marked appropriate
+ - Additional tools the agent decided to call without being asked should be marked inappropriate
+
+ Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
+ `;
+ };
+ var createReasonPrompt2 = ({
+ userInput,
+ score,
+ evaluations,
+ missingTools
+ }) => {
+ return `
+ Explain this tool selection evaluation in ONE SENTENCE.
+
+ User Request: "${userInput}"
+ Score: ${score}/1
+ Tools Evaluated: ${JSON.stringify(evaluations)}
+ Missing Tools: ${JSON.stringify(missingTools)}
+
+ Provide a single, concise sentence explaining why this score was given.
+ `;
+ };
+
+ // src/scorers/llm/tool-call-accuracy/index.ts
+ var analyzeOutputSchema = zod.z.object({
+ evaluations: zod.z.array(
+ zod.z.object({
+ toolCalled: zod.z.string(),
+ wasAppropriate: zod.z.boolean(),
+ reasoning: zod.z.string()
+ })
+ ),
+ missingTools: zod.z.array(zod.z.string()).optional()
+ });
+ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
+ const toolDefinitions = availableTools.map((tool) => `${tool.name}: ${tool.description}`).join("\n");
+ return scores.createScorer({
+ name: "Tool Call Accuracy (LLM)",
+ description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
+ judge: {
+ model,
+ instructions: TOOL_SELECTION_ACCURACY_INSTRUCTIONS
+ }
+ }).preprocess(async ({ run }) => {
+ const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;
+ const isOutputInvalid = !run.output || run.output.length === 0;
+ if (isInputInvalid || isOutputInvalid) {
+ throw new Error("Input and output messages cannot be null or empty");
+ }
+ const { tools: actualTools, toolCallInfos } = chunkQVZBKGOE_cjs.extractToolCalls(run.output);
+ return {
+ actualTools,
+ hasToolCalls: actualTools.length > 0,
+ toolCallInfos
+ };
+ }).analyze({
+ description: "Analyze the appropriateness of tool selections",
+ outputSchema: analyzeOutputSchema,
+ createPrompt: ({ run, results }) => {
+ const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const toolsCalled = results.preprocessStepResult?.actualTools || [];
+ return createAnalyzePrompt({
+ userInput,
+ agentResponse,
+ toolsCalled,
+ availableTools: toolDefinitions
+ });
+ }
+ }).generateScore(({ results }) => {
+ const evaluations = results.analyzeStepResult?.evaluations || [];
+ if (evaluations.length === 0) {
+ const missingTools = results.analyzeStepResult?.missingTools || [];
+ return missingTools.length > 0 ? 0 : 1;
+ }
+ const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
+ const totalToolCalls = evaluations.length;
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+ }).generateReason({
+ description: "Generate human-readable explanation of tool selection evaluation",
+ createPrompt: ({ run, results, score }) => {
+ const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const evaluations = results.analyzeStepResult?.evaluations || [];
+ const missingTools = results.analyzeStepResult?.missingTools || [];
+ return createReasonPrompt2({
+ userInput,
+ score,
+ evaluations,
+ missingTools
+ });
+ }
+ });
+ }
+
  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
  exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
  exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
  exports.createBiasScorer = createBiasScorer;
  exports.createFaithfulnessScorer = createFaithfulnessScorer;
  exports.createHallucinationScorer = createHallucinationScorer;
+ exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
  exports.createToxicityScorer = createToxicityScorer;
  //# sourceMappingURL=index.cjs.map
  //# sourceMappingURL=index.cjs.map
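
For orientation, a minimal usage sketch of the new LLM-judged scorer follows. Only the createToolCallAccuracyScorerLLM signature and the scoring rule come from the code above; the import subpath, the @ai-sdk/openai judge model, and the tool metadata are illustrative assumptions:

// Sketch: constructing the tool-call accuracy scorer added in this release.
// The import subpath and judge model are assumptions, not taken from this diff.
const { openai } = require('@ai-sdk/openai');
const { createToolCallAccuracyScorerLLM } = require('@mastra/evals/scorers/llm');

const scorer = createToolCallAccuracyScorerLLM({
  model: openai('gpt-4o-mini'),
  availableTools: [
    { name: 'weatherTool', description: 'Fetches current weather for a city' },
    { name: 'newsTool', description: 'Fetches recent news headlines' }
  ]
});

Per the generateScore step above, the score is appropriateToolCalls / totalToolCalls rounded to two decimals (for example, 2 appropriate calls out of 3 yields 0.67); when no tools were called at all, the run scores 1 if the judge reports no missing tools and 0 otherwise.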