@mastra/evals 0.12.1 → 0.13.0-alpha.1

Files changed (35)
  1. package/dist/chunk-5CVZXIFW.js +36 -0
  2. package/dist/chunk-5CVZXIFW.js.map +1 -0
  3. package/dist/chunk-QVZBKGOE.cjs +41 -0
  4. package/dist/chunk-QVZBKGOE.cjs.map +1 -0
  5. package/dist/{dist-BODKWAXM.cjs → dist-JQCAD3AD.cjs} +9 -9
  6. package/dist/{dist-BODKWAXM.cjs.map → dist-JQCAD3AD.cjs.map} +1 -1
  7. package/dist/{dist-JRG62SVA.js → dist-JVIEAZJ6.js} +9 -9
  8. package/dist/{dist-JRG62SVA.js.map → dist-JVIEAZJ6.js.map} +1 -1
  9. package/dist/evaluation.d.ts +1 -1
  10. package/dist/evaluation.d.ts.map +1 -1
  11. package/dist/index.cjs +3 -3
  12. package/dist/index.cjs.map +1 -1
  13. package/dist/index.js +2 -2
  14. package/dist/index.js.map +1 -1
  15. package/dist/scorers/code/index.cjs +85 -0
  16. package/dist/scorers/code/index.cjs.map +1 -1
  17. package/dist/scorers/code/index.d.ts +1 -0
  18. package/dist/scorers/code/index.d.ts.map +1 -1
  19. package/dist/scorers/code/index.js +85 -1
  20. package/dist/scorers/code/index.js.map +1 -1
  21. package/dist/scorers/code/tool-call-accuracy/index.d.ts +18 -0
  22. package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -0
  23. package/dist/scorers/llm/index.cjs +184 -28
  24. package/dist/scorers/llm/index.cjs.map +1 -1
  25. package/dist/scorers/llm/index.d.ts +1 -0
  26. package/dist/scorers/llm/index.d.ts.map +1 -1
  27. package/dist/scorers/llm/index.js +170 -15
  28. package/dist/scorers/llm/index.js.map +1 -1
  29. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +22 -0
  30. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -0
  31. package/dist/scorers/llm/tool-call-accuracy/prompts.d.ts +19 -0
  32. package/dist/scorers/llm/tool-call-accuracy/prompts.d.ts.map +1 -0
  33. package/dist/scorers/utils.d.ts +12 -0
  34. package/dist/scorers/utils.d.ts.map +1 -1
  35. package/package.json +11 -10
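
Summary of the changes below: the helper functions previously bundled into each scorer file (roundToTwoDecimals2, getUserMessageFromRunInput, getAssistantMessageFromRunOutput) were hoisted into a new shared chunk (chunk-QVZBKGOE.cjs for CJS, chunk-5CVZXIFW.js for ESM), and a tool-call accuracy scorer was added in two variants: a code-based one under scorers/code/tool-call-accuracy and an LLM-judged one under scorers/llm/tool-call-accuracy. Reconstructed from the removed inline helpers and the call sites in the diff, the shared chunk's surface looks roughly like this sketch (extractToolCalls is inferred from its single call site; its implementation and exact return shape are not visible in this diff):

// Sketch of the new shared helper chunk, reconstructed from this diff.
// The three helpers are copied from the inline versions removed below;
// extractToolCalls is an inference from how it is called in the new scorer.
var roundToTwoDecimals = (num) => {
  return Math.round((num + Number.EPSILON) * 100) / 100;
};
var getUserMessageFromRunInput = (input) => {
  return input?.inputMessages.find(({ role }) => role === "user")?.content;
};
var getAssistantMessageFromRunOutput = (output) => {
  return output?.find(({ role }) => role === "assistant")?.content;
};
// Used as: const { tools, toolCallInfos } = extractToolCalls(run.output);
// so it presumably collects tool names and call details from the
// assistant messages' toolInvocations.

exports.roundToTwoDecimals = roundToTwoDecimals;
exports.getUserMessageFromRunInput = getUserMessageFromRunInput;
exports.getAssistantMessageFromRunOutput = getAssistantMessageFromRunOutput;

Centralizing these helpers lets the existing scorers and the new tool-call scorers share one implementation instead of each bundle carrying its own copy, which is why the bundler-renamed local roundToTwoDecimals2 disappears.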
@@ -1,19 +1,10 @@
  'use strict';

  var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
+ var chunkQVZBKGOE_cjs = require('../../chunk-QVZBKGOE.cjs');
  var scores = require('@mastra/core/scores');
  var zod = require('zod');

- var roundToTwoDecimals2 = (num) => {
- return Math.round((num + Number.EPSILON) * 100) / 100;
- };
- var getUserMessageFromRunInput = (input) => {
- return input?.inputMessages.find(({ role }) => role === "user")?.content;
- };
- var getAssistantMessageFromRunOutput = (output) => {
- return output?.find(({ role }) => role === "assistant")?.content;
- };
-
  // src/scorers/llm/answer-relevancy/prompts.ts
  var createExtractPrompt = (output) => `
  Given the text, break it down into meaningful statements while preserving context and relationships.
@@ -236,14 +227,14 @@ function createAnswerRelevancyScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: extractOutputSchema,
  createPrompt: ({ run }) => {
- const assistantMessage = getAssistantMessageFromRunOutput(run.output) ?? "";
+ const assistantMessage = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  return createExtractPrompt(assistantMessage);
  }
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
- const input = getUserMessageFromRunInput(run.input) ?? "";
+ const input = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
  }
  }).generateScore(({ results }) => {
@@ -265,8 +256,8 @@ function createAnswerRelevancyScorer({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  return createReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  score,
  results: results.analyzeStepResult.results,
  scale: options.scale
@@ -444,7 +435,7 @@ function createFaithfulnessScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: zod.z.array(zod.z.string()),
  createPrompt: ({ run }) => {
- const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -465,13 +456,13 @@ function createFaithfulnessScorer({
  return 0;
  }
  const score = supportedClaims / totalClaims * (options?.scale || 1);
- return roundToTwoDecimals2(score);
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createFaithfulnessReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
  score,
  scale: options?.scale || 1,
@@ -602,13 +593,13 @@ function createBiasScorer({ model, options }) {
  outputSchema: zod.z.object({
  opinions: zod.z.array(zod.z.string())
  }),
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
  const prompt = createBiasAnalyzePrompt({
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  opinions: results.preprocessStepResult?.opinions || []
  });
  return prompt;
@@ -619,7 +610,7 @@ function createBiasScorer({ model, options }) {
  }
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
- return roundToTwoDecimals2(score * (options?.scale || 1));
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ score, results }) => {
@@ -836,7 +827,7 @@ function createHallucinationScorer({
  claims: zod.z.array(zod.z.string())
  }),
  createPrompt: ({ run }) => {
- const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createHallucinationExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -858,13 +849,13 @@ function createHallucinationScorer({
  return 0;
  }
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
- return roundToTwoDecimals2(score);
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createHallucinationReasonPrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: options?.context || [],
  score,
  scale: options?.scale || 1,
@@ -973,8 +964,8 @@ function createToxicityScorer({ model, options }) {
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run }) => {
  const prompt = createToxicityAnalyzePrompt({
- input: getUserMessageFromRunInput(run.input) ?? "",
- output: getAssistantMessageFromRunOutput(run.output) ?? ""
+ input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
  });
  return prompt;
  }
@@ -990,7 +981,7 @@ function createToxicityScorer({ model, options }) {
  }
  }
  const score = toxicityCount / numberOfVerdicts;
- return roundToTwoDecimals2(score * (options?.scale || 1));
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ results, score }) => {
@@ -1003,12 +994,177 @@ function createToxicityScorer({ model, options }) {
  });
  }

+ // src/scorers/llm/tool-call-accuracy/prompts.ts
+ var TOOL_SELECTION_ACCURACY_INSTRUCTIONS = `
+ You are an expert evaluator specializing in AI agent tool selection analysis. Your role is to assess whether an agent chose appropriate tools based on explicit user requests.
+
+ CORE RESPONSIBILITIES:
+ - Analyze user requests to understand what was explicitly asked for
+ - Evaluate each tool call against the specific user need
+ - Identify missing tools that should have been used
+ - Apply strict evaluation criteria focused on direct relevance
+
+ EVALUATION PHILOSOPHY:
+ - Be precise and literal in your assessments
+ - Only approve tools that directly address the user's explicit request
+ - Distinguish between "helpful" and "appropriate" - reject tools that are merely helpful but not requested
+ - Consider context but prioritize what was actually asked for
+
+ OUTPUT REQUIREMENTS:
+ - Provide clear, specific reasoning for each evaluation
+ - Use provided JSON schema exactly as specified
+ - Be consistent in your evaluation standards
+ - Focus on actionable insights
+
+ You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
+ `;
+ var createAnalyzePrompt = ({
+ userInput,
+ agentResponse,
+ toolsCalled,
+ availableTools
+ }) => {
+ return `
+ You are evaluating whether an AI agent made appropriate tool choices for a user request.
+
+ USER REQUEST: "${userInput}"
+ AGENT RESPONSE: "${agentResponse}"
+ TOOLS THE AGENT ACTUALLY CALLED: ${toolsCalled.length > 0 ? toolsCalled.join(", ") : "None"}
+
+ TOOL REFERENCE:
+ ${availableTools}
+
+ EVALUATION RULES:
+ 1. If NO tools were called: evaluate BOTH the user request AND agent response:
+ - Did the user make a specific, actionable request?
+ - Did the agent appropriately ask for clarification when details were insufficient?
+ - Would calling a tool without the requested clarification provide poor results?
+ 2. If tools WERE called: evaluate if each tool was appropriate for the EXPLICIT user request
+
+ AGENT RESPONSE EVALUATION:
+ When no tools are called, consider if the agent's response demonstrates good judgment:
+ - Asking follow-up questions for vague requests = APPROPRIATE (missingTools should be empty)
+ - Providing generic answers without using available tools = INAPPROPRIATE
+ - Ignoring clear, specific requests = INAPPROPRIATE
+
+ CLARIFICATION EXAMPLES:
+ User: "I'm looking for a firm" + Agent asks about practice area/location = APPROPRIATE clarification
+ User: "help with legal stuff" + Agent asks for specifics = APPROPRIATE clarification
+ User: "Create RFP for corporate litigation in NY" + Agent asks for more details = INAPPROPRIATE delay
+ User: "I need pricing for litigation" + Agent gives generic answer = MISSED tool opportunity
+
+ EVALUATION QUESTION:
+ Did the agent make the right choice between:
+ 1. Acting immediately with available tools, OR
+ 2. Gathering more information for better results?
+
+ Consider: Would you rather get generic firm recommendations or have the agent ask clarifying questions first?
+
+ STRICT EVALUATION CRITERIA:
+ - Only mark tools as appropriate if they DIRECTLY address what the user explicitly asked for
+ - Do NOT mark tools as appropriate just because they might be "helpful" or "related" to the domain
+ - If the user asked for "A", only tools that provide "A" should be marked appropriate
+ - Additional tools the agent decided to call without being asked should be marked inappropriate
+
+ Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
+ `;
+ };
+ var createReasonPrompt2 = ({
+ userInput,
+ score,
+ evaluations,
+ missingTools
+ }) => {
+ return `
+ Explain this tool selection evaluation in ONE SENTENCE.
+
+ User Request: "${userInput}"
+ Score: ${score}/1
+ Tools Evaluated: ${JSON.stringify(evaluations)}
+ Missing Tools: ${JSON.stringify(missingTools)}
+
+ Provide a single, concise sentence explaining why this score was given.
+ `;
+ };
+
+ // src/scorers/llm/tool-call-accuracy/index.ts
+ var analyzeOutputSchema = zod.z.object({
+ evaluations: zod.z.array(
+ zod.z.object({
+ toolCalled: zod.z.string(),
+ wasAppropriate: zod.z.boolean(),
+ reasoning: zod.z.string()
+ })
+ ),
+ missingTools: zod.z.array(zod.z.string()).optional()
+ });
+ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
+ const toolDefinitions = availableTools.map((tool) => `${tool.name}: ${tool.description}`).join("\n");
+ return scores.createScorer({
+ name: "Tool Call Accuracy (LLM)",
+ description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
+ judge: {
+ model,
+ instructions: TOOL_SELECTION_ACCURACY_INSTRUCTIONS
+ }
+ }).preprocess(async ({ run }) => {
+ const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;
+ const isOutputInvalid = !run.output || run.output.length === 0;
+ if (isInputInvalid || isOutputInvalid) {
+ throw new Error("Input and output messages cannot be null or empty");
+ }
+ const { tools: actualTools, toolCallInfos } = chunkQVZBKGOE_cjs.extractToolCalls(run.output);
+ return {
+ actualTools,
+ hasToolCalls: actualTools.length > 0,
+ toolCallInfos
+ };
+ }).analyze({
+ description: "Analyze the appropriateness of tool selections",
+ outputSchema: analyzeOutputSchema,
+ createPrompt: ({ run, results }) => {
+ const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const toolsCalled = results.preprocessStepResult?.actualTools || [];
+ return createAnalyzePrompt({
+ userInput,
+ agentResponse,
+ toolsCalled,
+ availableTools: toolDefinitions
+ });
+ }
+ }).generateScore(({ results }) => {
+ const evaluations = results.analyzeStepResult?.evaluations || [];
+ if (evaluations.length === 0) {
+ const missingTools = results.analyzeStepResult?.missingTools || [];
+ return missingTools.length > 0 ? 0 : 1;
+ }
+ const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
+ const totalToolCalls = evaluations.length;
+ return chunkQVZBKGOE_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+ }).generateReason({
+ description: "Generate human-readable explanation of tool selection evaluation",
+ createPrompt: ({ run, results, score }) => {
+ const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const evaluations = results.analyzeStepResult?.evaluations || [];
+ const missingTools = results.analyzeStepResult?.missingTools || [];
+ return createReasonPrompt2({
+ userInput,
+ score,
+ evaluations,
+ missingTools
+ });
+ }
+ });
+ }
+
  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
  exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
  exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
  exports.createBiasScorer = createBiasScorer;
  exports.createFaithfulnessScorer = createFaithfulnessScorer;
  exports.createHallucinationScorer = createHallucinationScorer;
+ exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
  exports.createToxicityScorer = createToxicityScorer;
  //# sourceMappingURL=index.cjs.map
  //# sourceMappingURL=index.cjs.map
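
For orientation, a minimal usage sketch of the new LLM-judged scorer follows. Only the createToolCallAccuracyScorerLLM signature and the scoring rule come from the code above; the import subpath, the @ai-sdk/openai judge model, and the tool metadata are illustrative assumptions:

// Sketch: constructing the tool-call accuracy scorer added in this release.
// The import subpath and judge model are assumptions, not taken from this diff.
const { openai } = require('@ai-sdk/openai');
const { createToolCallAccuracyScorerLLM } = require('@mastra/evals/scorers/llm');

const scorer = createToolCallAccuracyScorerLLM({
  model: openai('gpt-4o-mini'),
  availableTools: [
    { name: 'weatherTool', description: 'Fetches current weather for a city' },
    { name: 'newsTool', description: 'Fetches recent news headlines' }
  ]
});

Per the generateScore step above, the score is appropriateToolCalls / totalToolCalls rounded to two decimals (for example, 2 appropriate calls out of 3 yields 0.67); when no tools were called at all, the run scores 1 if the judge reports no missing tools and 0 otherwise.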