@mastra/evals 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/CHANGELOG.md +38 -0
  2. package/dist/{chunk-AY4K3J4R.cjs → chunk-33T2SZZ2.cjs} +74 -14
  3. package/dist/chunk-33T2SZZ2.cjs.map +1 -0
  4. package/dist/{chunk-X4MKZ735.js → chunk-ZRHCSFKL.js} +73 -15
  5. package/dist/chunk-ZRHCSFKL.js.map +1 -0
  6. package/dist/docs/SKILL.md +1 -1
  7. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  8. package/dist/docs/references/reference-evals-context-precision.md +3 -3
  9. package/dist/docs/references/reference-evals-context-relevance.md +3 -3
  10. package/dist/docs/references/reference-evals-noise-sensitivity.md +6 -6
  11. package/dist/docs/references/reference-evals-prompt-alignment.md +12 -12
  12. package/dist/docs/references/reference-evals-scorer-utils.md +3 -3
  13. package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
  14. package/dist/scorers/llm/answer-relevancy/index.d.ts +2 -1
  15. package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
  16. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -1
  17. package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
  18. package/dist/scorers/llm/bias/index.d.ts +2 -2
  19. package/dist/scorers/llm/bias/index.d.ts.map +1 -1
  20. package/dist/scorers/llm/context-precision/index.d.ts +2 -1
  21. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
  22. package/dist/scorers/llm/context-relevance/index.d.ts +2 -1
  23. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
  24. package/dist/scorers/llm/faithfulness/index.d.ts +2 -1
  25. package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
  26. package/dist/scorers/llm/hallucination/index.d.ts +4 -4
  27. package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
  28. package/dist/scorers/llm/noise-sensitivity/index.d.ts +2 -1
  29. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
  30. package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
  31. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
  32. package/dist/scorers/llm/toxicity/index.d.ts +2 -1
  33. package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
  34. package/dist/scorers/prebuilt/index.cjs +105 -85
  35. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  36. package/dist/scorers/prebuilt/index.js +34 -14
  37. package/dist/scorers/prebuilt/index.js.map +1 -1
  38. package/dist/scorers/utils.cjs +31 -23
  39. package/dist/scorers/utils.d.ts +33 -16
  40. package/dist/scorers/utils.d.ts.map +1 -1
  41. package/dist/scorers/utils.js +1 -1
  42. package/package.json +12 -12
  43. package/dist/chunk-AY4K3J4R.cjs.map +0 -1
  44. package/dist/chunk-X4MKZ735.js.map +0 -1
@@ -1,4 +1,4 @@
1
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
1
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-ZRHCSFKL.js';
2
2
  import { createScorer } from '@mastra/core/evals';
3
3
  import { z } from 'zod';
4
4
  import nlp from 'compromise';
@@ -689,6 +689,10 @@ Example Responses:
689
689
  }
690
690
 
691
691
  // src/scorers/llm/faithfulness/index.ts
692
+ var getToolInvocationContext = (output) => {
693
+ if (!Array.isArray(output)) return [];
694
+ return output.filter((message) => message?.role === "assistant").flatMap((message) => message?.content?.toolInvocations ?? []).filter((toolCall) => toolCall.state === "result").map((toolCall) => JSON.stringify(toolCall.result));
695
+ };
692
696
  function createFaithfulnessScorer({
693
697
  model,
694
698
  options
@@ -715,10 +719,7 @@ function createFaithfulnessScorer({
715
719
  description: "Score the relevance of the statements to the input",
716
720
  outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
717
721
  createPrompt: ({ results, run }) => {
718
- const assistantMessage = run.output.find(({ role }) => role === "assistant");
719
- const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
720
- (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
721
- ) ?? [];
722
+ const context = options?.context ?? getToolInvocationContext(run.output);
722
723
  const prompt = createFaithfulnessAnalyzePrompt({
723
724
  claims: results.preprocessStepResult?.claims || [],
724
725
  context
@@ -736,11 +737,10 @@ function createFaithfulnessScorer({
736
737
  }).generateReason({
737
738
  description: "Reason about the results",
738
739
  createPrompt: ({ run, results, score }) => {
739
- const assistantMessage = run.output.find(({ role }) => role === "assistant");
740
740
  const prompt = createFaithfulnessReasonPrompt({
741
741
  input: getUserMessageFromRunInput(run.input) ?? "",
742
742
  output: getAssistantMessageFromRunOutput(run.output) ?? "",
743
- context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
743
+ context: options?.context ?? getToolInvocationContext(run.output),
744
744
  score,
745
745
  scale: options?.scale || 1,
746
746
  verdicts: results.analyzeStepResult?.verdicts || []
@@ -1627,6 +1627,16 @@ var DEFAULT_PENALTIES = {
1627
1627
  MAX_MISSING_CONTEXT_PENALTY: 0.5
1628
1628
  // Maximum 50% penalty for missing context
1629
1629
  };
1630
+ var getContext = ({
1631
+ input,
1632
+ output,
1633
+ options
1634
+ }) => {
1635
+ if (options.contextExtractor && isScorerRunInputForAgent(input) && isScorerRunOutputForAgent(output)) {
1636
+ return options.contextExtractor(input, output);
1637
+ }
1638
+ return options.context ?? [];
1639
+ };
1630
1640
  function createContextRelevanceScorerLLM({
1631
1641
  model,
1632
1642
  options
@@ -1652,7 +1662,7 @@ function createContextRelevanceScorerLLM({
1652
1662
  createPrompt: ({ run }) => {
1653
1663
  const userQuery = getUserMessageFromRunInput(run.input) ?? "";
1654
1664
  const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
1655
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1665
+ const context = getContext({ input: run.input, output: run.output, options });
1656
1666
  if (context.length === 0) {
1657
1667
  return createAnalyzePrompt3({
1658
1668
  userQuery,
@@ -1668,7 +1678,7 @@ function createContextRelevanceScorerLLM({
1668
1678
  }
1669
1679
  }).generateScore(({ results, run }) => {
1670
1680
  const evaluations = results.analyzeStepResult?.evaluations || [];
1671
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1681
+ const context = getContext({ input: run.input, output: run.output, options });
1672
1682
  if (context.length === 0) {
1673
1683
  return 1 * (options.scale || 1);
1674
1684
  }
@@ -1704,7 +1714,7 @@ function createContextRelevanceScorerLLM({
1704
1714
  description: "Generate human-readable explanation of context relevance evaluation",
1705
1715
  createPrompt: ({ run, results, score }) => {
1706
1716
  const userQuery = getUserMessageFromRunInput(run.input) ?? "";
1707
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1717
+ const context = getContext({ input: run.input, output: run.output, options });
1708
1718
  if (context.length === 0) {
1709
1719
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
1710
1720
  }
@@ -1851,6 +1861,16 @@ var contextRelevanceOutputSchema = z.object({
1851
1861
  })
1852
1862
  )
1853
1863
  });
1864
+ var getContext2 = ({
1865
+ input,
1866
+ output,
1867
+ options
1868
+ }) => {
1869
+ if (options.contextExtractor && isScorerRunInputForAgent(input) && isScorerRunOutputForAgent(output)) {
1870
+ return options.contextExtractor(input, output);
1871
+ }
1872
+ return options.context ?? [];
1873
+ };
1854
1874
  function createContextPrecisionScorer({
1855
1875
  model,
1856
1876
  options
@@ -1876,7 +1896,7 @@ function createContextPrecisionScorer({
1876
1896
  createPrompt: ({ run }) => {
1877
1897
  const input = getUserMessageFromRunInput(run.input) ?? "";
1878
1898
  const output = getAssistantMessageFromRunOutput(run.output) ?? "";
1879
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1899
+ const context = getContext2({ input: run.input, output: run.output, options });
1880
1900
  if (context.length === 0) {
1881
1901
  throw new Error("No context available for evaluation");
1882
1902
  }
@@ -1914,7 +1934,7 @@ function createContextPrecisionScorer({
1914
1934
  createPrompt: ({ run, results, score }) => {
1915
1935
  const input = getUserMessageFromRunInput(run.input) ?? "";
1916
1936
  const output = getAssistantMessageFromRunOutput(run.output) ?? "";
1917
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1937
+ const context = getContext2({ input: run.input, output: run.output, options });
1918
1938
  return createContextPrecisionReasonPrompt({
1919
1939
  input,
1920
1940
  output,
@@ -2550,8 +2570,8 @@ function createPromptAlignmentScorerLLM({
2550
2570
  if (evaluationMode === "system" && !systemPrompt) {
2551
2571
  throw new Error("System prompt is required for system prompt alignment scoring");
2552
2572
  }
2553
- if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
2554
- throw new Error("Both user and system prompts are required for combined alignment scoring");
2573
+ if (evaluationMode === "both" && !userPrompt && !systemPrompt) {
2574
+ throw new Error("A user or system prompt is required for combined alignment scoring");
2555
2575
  }
2556
2576
  if (!agentResponse) {
2557
2577
  throw new Error("Agent response is required for prompt alignment scoring");