@mastra/evals 1.2.1 → 1.2.2-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/CHANGELOG.md +19 -0
  2. package/dist/{chunk-AY4K3J4R.cjs → chunk-33T2SZZ2.cjs} +74 -14
  3. package/dist/chunk-33T2SZZ2.cjs.map +1 -0
  4. package/dist/{chunk-X4MKZ735.js → chunk-ZRHCSFKL.js} +73 -15
  5. package/dist/chunk-ZRHCSFKL.js.map +1 -0
  6. package/dist/docs/SKILL.md +1 -1
  7. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  8. package/dist/docs/references/reference-evals-context-precision.md +3 -3
  9. package/dist/docs/references/reference-evals-context-relevance.md +3 -3
  10. package/dist/docs/references/reference-evals-noise-sensitivity.md +6 -6
  11. package/dist/docs/references/reference-evals-prompt-alignment.md +12 -12
  12. package/dist/docs/references/reference-evals-scorer-utils.md +3 -3
  13. package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
  14. package/dist/scorers/llm/answer-relevancy/index.d.ts +2 -1
  15. package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
  16. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -1
  17. package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
  18. package/dist/scorers/llm/bias/index.d.ts +2 -2
  19. package/dist/scorers/llm/bias/index.d.ts.map +1 -1
  20. package/dist/scorers/llm/context-precision/index.d.ts +2 -1
  21. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
  22. package/dist/scorers/llm/context-relevance/index.d.ts +2 -1
  23. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
  24. package/dist/scorers/llm/faithfulness/index.d.ts +2 -1
  25. package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
  26. package/dist/scorers/llm/hallucination/index.d.ts +4 -4
  27. package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
  28. package/dist/scorers/llm/noise-sensitivity/index.d.ts +2 -1
  29. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
  30. package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
  31. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
  32. package/dist/scorers/llm/toxicity/index.d.ts +2 -1
  33. package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
  34. package/dist/scorers/prebuilt/index.cjs +105 -85
  35. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  36. package/dist/scorers/prebuilt/index.js +34 -14
  37. package/dist/scorers/prebuilt/index.js.map +1 -1
  38. package/dist/scorers/utils.cjs +31 -23
  39. package/dist/scorers/utils.d.ts +33 -16
  40. package/dist/scorers/utils.d.ts.map +1 -1
  41. package/dist/scorers/utils.js +1 -1
  42. package/package.json +12 -12
  43. package/dist/chunk-AY4K3J4R.cjs.map +0 -1
  44. package/dist/chunk-X4MKZ735.js.map +0 -1
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- var chunkAY4K3J4R_cjs = require('../../chunk-AY4K3J4R.cjs');
3
+ var chunk33T2SZZ2_cjs = require('../../chunk-33T2SZZ2.cjs');
4
4
  var evals = require('@mastra/core/evals');
5
5
  var zod = require('zod');
6
6
  var nlp = require('compromise');
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
239
239
  description: "Extract relevant statements from the LLM output",
240
240
  outputSchema: extractOutputSchema,
241
241
  createPrompt: ({ run }) => {
242
- const assistantMessage = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
242
+ const assistantMessage = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
243
243
  return createExtractPrompt(assistantMessage);
244
244
  }
245
245
  }).analyze({
246
246
  description: "Score the relevance of the statements to the input",
247
247
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
248
248
  createPrompt: ({ run, results }) => {
249
- const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
249
+ const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
250
250
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
251
251
  }
252
252
  }).generateScore(({ results }) => {
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
263
263
  }
264
264
  }
265
265
  const score = relevancyCount / numberOfResults;
266
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * options.scale);
266
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * options.scale);
267
267
  }).generateReason({
268
268
  description: "Reason about the results",
269
269
  createPrompt: ({ run, results, score }) => {
270
270
  return createReasonPrompt({
271
- input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
- output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
271
+ input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
+ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
273
273
  score,
274
274
  results: results.analyzeStepResult.results,
275
275
  scale: options.scale
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
466
466
  groundTruth: ""
467
467
  });
468
468
  }
469
- const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
469
+ const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
470
470
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
471
471
  return createExtractPrompt2({
472
472
  output,
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
524
524
  );
525
525
  score -= extraInfoPenalty;
526
526
  score = Math.max(0, Math.min(1, score));
527
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * mergedOptions.scale);
527
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * mergedOptions.scale);
528
528
  }).generateReason({
529
529
  description: "Generate explanation of similarity score",
530
530
  createPrompt: ({ run, results, score }) => {
531
531
  if (!run.groundTruth) {
532
532
  return "No ground truth was provided for comparison. Score is 0 by default.";
533
533
  }
534
- const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
534
+ const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
535
535
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
536
536
  return createReasonPrompt2({
537
537
  output,
@@ -698,6 +698,10 @@ Example Responses:
698
698
  }
699
699
 
700
700
  // src/scorers/llm/faithfulness/index.ts
701
+ var getToolInvocationContext = (output) => {
702
+ if (!Array.isArray(output)) return [];
703
+ return output.filter((message) => message?.role === "assistant").flatMap((message) => message?.content?.toolInvocations ?? []).filter((toolCall) => toolCall.state === "result").map((toolCall) => JSON.stringify(toolCall.result));
704
+ };
701
705
  function createFaithfulnessScorer({
702
706
  model,
703
707
  options
@@ -717,17 +721,14 @@ function createFaithfulnessScorer({
717
721
  claims: zod.z.array(zod.z.string())
718
722
  }),
719
723
  createPrompt: ({ run }) => {
720
- const prompt = createFaithfulnessExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
724
+ const prompt = createFaithfulnessExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
721
725
  return prompt;
722
726
  }
723
727
  }).analyze({
724
728
  description: "Score the relevance of the statements to the input",
725
729
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
726
730
  createPrompt: ({ results, run }) => {
727
- const assistantMessage = run.output.find(({ role }) => role === "assistant");
728
- const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
729
- (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
730
- ) ?? [];
731
+ const context = options?.context ?? getToolInvocationContext(run.output);
731
732
  const prompt = createFaithfulnessAnalyzePrompt({
732
733
  claims: results.preprocessStepResult?.claims || [],
733
734
  context
@@ -741,15 +742,14 @@ function createFaithfulnessScorer({
741
742
  return 0;
742
743
  }
743
744
  const score = supportedClaims / totalClaims * (options?.scale || 1);
744
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
745
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
745
746
  }).generateReason({
746
747
  description: "Reason about the results",
747
748
  createPrompt: ({ run, results, score }) => {
748
- const assistantMessage = run.output.find(({ role }) => role === "assistant");
749
749
  const prompt = createFaithfulnessReasonPrompt({
750
- input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
- output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
752
- context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
750
+ input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
+ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
752
+ context: options?.context ?? getToolInvocationContext(run.output),
753
753
  score,
754
754
  scale: options?.scale || 1,
755
755
  verdicts: results.analyzeStepResult?.verdicts || []
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
881
881
  outputSchema: zod.z.object({
882
882
  opinions: zod.z.array(zod.z.string())
883
883
  }),
884
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
884
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
885
885
  }).analyze({
886
886
  description: "Score the relevance of the statements to the input",
887
887
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
888
888
  createPrompt: ({ run, results }) => {
889
889
  const prompt = createBiasAnalyzePrompt({
890
- output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
890
+ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
891
891
  opinions: results.preprocessStepResult?.opinions || []
892
892
  });
893
893
  return prompt;
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
898
898
  }
899
899
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
900
900
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
901
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
901
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * (options?.scale || 1));
902
902
  }).generateReason({
903
903
  description: "Reason about the results",
904
904
  createPrompt: ({ score, results }) => {
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
1117
1117
  claims: zod.z.array(zod.z.string())
1118
1118
  }),
1119
1119
  createPrompt: ({ run }) => {
1120
- const prompt = createHallucinationExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1120
+ const prompt = createHallucinationExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1121
1121
  return prompt;
1122
1122
  }
1123
1123
  }).analyze({
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
1145
1145
  return 0;
1146
1146
  }
1147
1147
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
1148
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
1148
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
1149
1149
  }).generateReason({
1150
1150
  description: "Reason about the results",
1151
1151
  createPrompt: async ({ run, results, score }) => {
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
1156
1156
  context = options?.context ?? [];
1157
1157
  }
1158
1158
  const prompt = createHallucinationReasonPrompt({
1159
- input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
- output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1159
+ input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
+ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1161
1161
  context,
1162
1162
  score,
1163
1163
  scale: options?.scale || 1,
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
1271
1271
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
1272
1272
  createPrompt: ({ run }) => {
1273
1273
  const prompt = createToxicityAnalyzePrompt({
1274
- input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
- output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1274
+ input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
+ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1276
1276
  });
1277
1277
  return prompt;
1278
1278
  }
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
1288
1288
  }
1289
1289
  }
1290
1290
  const score = toxicityCount / numberOfVerdicts;
1291
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1291
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1292
1292
  }).generateReason({
1293
1293
  description: "Reason about the results",
1294
1294
  createPrompt: ({ results, score }) => {
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1422
1422
  if (isInputInvalid || isOutputInvalid) {
1423
1423
  throw new Error("Input and output messages cannot be null or empty");
1424
1424
  }
1425
- const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
1425
+ const { tools: actualTools, toolCallInfos } = chunk33T2SZZ2_cjs.extractToolCalls(run.output);
1426
1426
  return {
1427
1427
  actualTools,
1428
1428
  hasToolCalls: actualTools.length > 0,
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1432
1432
  description: "Analyze the appropriateness of tool selections",
1433
1433
  outputSchema: analyzeOutputSchema2,
1434
1434
  createPrompt: ({ run, results }) => {
1435
- const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
- const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1435
+ const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
+ const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1437
1437
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
1438
1438
  return createAnalyzePrompt2({
1439
1439
  userInput,
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1450
1450
  }
1451
1451
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
1452
1452
  const totalToolCalls = evaluations.length;
1453
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1453
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1454
1454
  }).generateReason({
1455
1455
  description: "Generate human-readable explanation of tool selection evaluation",
1456
1456
  createPrompt: ({ run, results, score }) => {
1457
- const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1457
+ const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1458
1458
  const evaluations = results.analyzeStepResult?.evaluations || [];
1459
1459
  const missingTools = results.analyzeStepResult?.missingTools || [];
1460
1460
  return createReasonPrompt3({
@@ -1636,6 +1636,16 @@ var DEFAULT_PENALTIES = {
1636
1636
  MAX_MISSING_CONTEXT_PENALTY: 0.5
1637
1637
  // Maximum 50% penalty for missing context
1638
1638
  };
1639
+ var getContext = ({
1640
+ input,
1641
+ output,
1642
+ options
1643
+ }) => {
1644
+ if (options.contextExtractor && chunk33T2SZZ2_cjs.isScorerRunInputForAgent(input) && chunk33T2SZZ2_cjs.isScorerRunOutputForAgent(output)) {
1645
+ return options.contextExtractor(input, output);
1646
+ }
1647
+ return options.context ?? [];
1648
+ };
1639
1649
  function createContextRelevanceScorerLLM({
1640
1650
  model,
1641
1651
  options
@@ -1659,9 +1669,9 @@ function createContextRelevanceScorerLLM({
1659
1669
  description: "Analyze the relevance and utility of provided context",
1660
1670
  outputSchema: analyzeOutputSchema3,
1661
1671
  createPrompt: ({ run }) => {
1662
- const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1663
- const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1664
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1672
+ const userQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1673
+ const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1674
+ const context = getContext({ input: run.input, output: run.output, options });
1665
1675
  if (context.length === 0) {
1666
1676
  return createAnalyzePrompt3({
1667
1677
  userQuery,
@@ -1677,7 +1687,7 @@ function createContextRelevanceScorerLLM({
1677
1687
  }
1678
1688
  }).generateScore(({ results, run }) => {
1679
1689
  const evaluations = results.analyzeStepResult?.evaluations || [];
1680
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1690
+ const context = getContext({ input: run.input, output: run.output, options });
1681
1691
  if (context.length === 0) {
1682
1692
  return 1 * (options.scale || 1);
1683
1693
  }
@@ -1708,12 +1718,12 @@ function createContextRelevanceScorerLLM({
1708
1718
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
1709
1719
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
1710
1720
  const scaledScore = finalScore * (options.scale || 1);
1711
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(scaledScore);
1721
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(scaledScore);
1712
1722
  }).generateReason({
1713
1723
  description: "Generate human-readable explanation of context relevance evaluation",
1714
1724
  createPrompt: ({ run, results, score }) => {
1715
- const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1716
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1725
+ const userQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1726
+ const context = getContext({ input: run.input, output: run.output, options });
1717
1727
  if (context.length === 0) {
1718
1728
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
1719
1729
  }
@@ -1860,6 +1870,16 @@ var contextRelevanceOutputSchema = zod.z.object({
1860
1870
  })
1861
1871
  )
1862
1872
  });
1873
+ var getContext2 = ({
1874
+ input,
1875
+ output,
1876
+ options
1877
+ }) => {
1878
+ if (options.contextExtractor && chunk33T2SZZ2_cjs.isScorerRunInputForAgent(input) && chunk33T2SZZ2_cjs.isScorerRunOutputForAgent(output)) {
1879
+ return options.contextExtractor(input, output);
1880
+ }
1881
+ return options.context ?? [];
1882
+ };
1863
1883
  function createContextPrecisionScorer({
1864
1884
  model,
1865
1885
  options
@@ -1883,9 +1903,9 @@ function createContextPrecisionScorer({
1883
1903
  description: "Evaluate the relevance of each context piece for generating the expected output",
1884
1904
  outputSchema: contextRelevanceOutputSchema,
1885
1905
  createPrompt: ({ run }) => {
1886
- const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1887
- const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1888
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1906
+ const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1907
+ const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1908
+ const context = getContext2({ input: run.input, output: run.output, options });
1889
1909
  if (context.length === 0) {
1890
1910
  throw new Error("No context available for evaluation");
1891
1911
  }
@@ -1917,13 +1937,13 @@ function createContextPrecisionScorer({
1917
1937
  }
1918
1938
  const map = sumPrecision / relevantCount;
1919
1939
  const score = map * (options.scale || 1);
1920
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
1940
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
1921
1941
  }).generateReason({
1922
1942
  description: "Reason about the context precision results",
1923
1943
  createPrompt: ({ run, results, score }) => {
1924
- const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
1925
- const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1926
- const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1944
+ const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
1945
+ const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1946
+ const context = getContext2({ input: run.input, output: run.output, options });
1927
1947
  return createContextPrecisionReasonPrompt({
1928
1948
  input,
1929
1949
  output,
@@ -2177,8 +2197,8 @@ function createNoiseSensitivityScorerLLM({
2177
2197
  description: "Analyze the impact of noise on agent response quality",
2178
2198
  outputSchema: analyzeOutputSchema4,
2179
2199
  createPrompt: ({ run }) => {
2180
- const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2181
- const noisyResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2200
+ const originalQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2201
+ const noisyResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2182
2202
  if (!originalQuery || !noisyResponse) {
2183
2203
  throw new Error("Both original query and noisy response are required for evaluation");
2184
2204
  }
@@ -2221,11 +2241,11 @@ function createNoiseSensitivityScorerLLM({
2221
2241
  const majorIssues = analysisResult.majorIssues || [];
2222
2242
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
2223
2243
  finalScore = Math.max(0, finalScore - issuesPenalty);
2224
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
2244
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(finalScore);
2225
2245
  }).generateReason({
2226
2246
  description: "Generate human-readable explanation of noise sensitivity evaluation",
2227
2247
  createPrompt: ({ run, results, score }) => {
2228
- const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2248
+ const originalQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2229
2249
  const analysisResult = results.analyzeStepResult;
2230
2250
  if (!analysisResult) {
2231
2251
  throw new Error("Analysis step failed to produce results for reason generation");
@@ -2550,17 +2570,17 @@ function createPromptAlignmentScorerLLM({
2550
2570
  description: "Analyze prompt-response alignment across multiple dimensions",
2551
2571
  outputSchema: analyzeOutputSchema5,
2552
2572
  createPrompt: ({ run }) => {
2553
- const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2554
- const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
2555
- const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2573
+ const userPrompt = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2574
+ const systemPrompt = chunk33T2SZZ2_cjs.getCombinedSystemPrompt(run.input) ?? "";
2575
+ const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2556
2576
  if (evaluationMode === "user" && !userPrompt) {
2557
2577
  throw new Error("User prompt is required for user prompt alignment scoring");
2558
2578
  }
2559
2579
  if (evaluationMode === "system" && !systemPrompt) {
2560
2580
  throw new Error("System prompt is required for system prompt alignment scoring");
2561
2581
  }
2562
- if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
2563
- throw new Error("Both user and system prompts are required for combined alignment scoring");
2582
+ if (evaluationMode === "both" && !userPrompt && !systemPrompt) {
2583
+ throw new Error("A user or system prompt is required for combined alignment scoring");
2564
2584
  }
2565
2585
  if (!agentResponse) {
2566
2586
  throw new Error("Agent response is required for prompt alignment scoring");
@@ -2588,12 +2608,12 @@ function createPromptAlignmentScorerLLM({
2588
2608
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
2589
2609
  }
2590
2610
  const finalScore = weightedScore * scale;
2591
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
2611
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(finalScore);
2592
2612
  }).generateReason({
2593
2613
  description: "Generate human-readable explanation of prompt alignment evaluation",
2594
2614
  createPrompt: ({ run, results, score }) => {
2595
- const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2596
- const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
2615
+ const userPrompt = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2616
+ const systemPrompt = chunk33T2SZZ2_cjs.getCombinedSystemPrompt(run.input) ?? "";
2597
2617
  const analysis = results.analyzeStepResult;
2598
2618
  if (!analysis) {
2599
2619
  return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2798,8 +2818,8 @@ function createTrajectoryAccuracyScorerLLM({
2798
2818
  description: "Analyze the quality and appropriateness of the agent trajectory",
2799
2819
  outputSchema: analyzeOutputSchema6,
2800
2820
  createPrompt: ({ run, results }) => {
2801
- const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2802
- const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
2821
+ const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2822
+ const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
2803
2823
  return createAnalyzePrompt6({
2804
2824
  userInput,
2805
2825
  agentResponse,
@@ -2824,11 +2844,11 @@ function createTrajectoryAccuracyScorerLLM({
2824
2844
  const necessityScore = necessarySteps / totalSteps;
2825
2845
  const orderScore = orderedSteps / totalSteps;
2826
2846
  const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
2827
- return chunkAY4K3J4R_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
2847
+ return chunk33T2SZZ2_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
2828
2848
  }).generateReason({
2829
2849
  description: "Generate human-readable explanation of trajectory evaluation",
2830
2850
  createPrompt: ({ run, results, score }) => {
2831
- const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
2851
+ const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
2832
2852
  const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
2833
2853
  const missingSteps = results.analyzeStepResult?.missingSteps || [];
2834
2854
  const extraSteps = results.analyzeStepResult?.extraSteps || [];
@@ -2891,18 +2911,18 @@ function createCompletenessScorer() {
2891
2911
  type: "agent"
2892
2912
  }).preprocess(async ({ run }) => {
2893
2913
  const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
2894
- const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
2914
+ const content = chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i);
2895
2915
  return content === null || content === void 0;
2896
2916
  });
2897
2917
  const isOutputInvalid = !run.output || run.output.some((i) => {
2898
- const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
2918
+ const content = chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i);
2899
2919
  return content === null || content === void 0;
2900
2920
  });
2901
2921
  if (isInputInvalid || isOutputInvalid) {
2902
2922
  throw new Error("Inputs cannot be null or undefined");
2903
2923
  }
2904
- const input = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2905
- const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2924
+ const input = run.input?.inputMessages.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2925
+ const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2906
2926
  const inputToProcess = input;
2907
2927
  const outputToProcess = output;
2908
2928
  const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -3007,8 +3027,8 @@ function createTextualDifferenceScorer() {
3007
3027
  description: "Calculate textual difference between input and output using sequence matching algorithms.",
3008
3028
  type: "agent"
3009
3029
  }).preprocess(async ({ run }) => {
3010
- const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3011
- const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3030
+ const input = run.input?.inputMessages?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3031
+ const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3012
3032
  const ratio = calculateRatio(input, output);
3013
3033
  const changes = countChanges(input, output);
3014
3034
  const maxLength = Math.max(input.length, output.length);
@@ -3031,8 +3051,8 @@ function createKeywordCoverageScorer() {
3031
3051
  description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
3032
3052
  type: "agent"
3033
3053
  }).preprocess(async ({ run }) => {
3034
- const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3035
- const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3054
+ const input = run.input?.inputMessages?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3055
+ const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3036
3056
  if (!input && !output) {
3037
3057
  return {
3038
3058
  result: {
@@ -3085,8 +3105,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
3085
3105
  description: "Calculates content similarity between input and output messages using string comparison algorithms.",
3086
3106
  type: "agent"
3087
3107
  }).preprocess(async ({ run }) => {
3088
- let processedInput = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3089
- let processedOutput = run.output.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3108
+ let processedInput = run.input?.inputMessages.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3109
+ let processedOutput = run.output.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3090
3110
  if (ignoreCase) {
3091
3111
  processedInput = processedInput.toLowerCase();
3092
3112
  processedOutput = processedOutput.toLowerCase();
@@ -3116,7 +3136,7 @@ function createToneScorer(config = {}) {
3116
3136
  type: "agent"
3117
3137
  }).preprocess(async ({ run }) => {
3118
3138
  const sentiment = new Sentiment__default.default();
3119
- const agentMessage = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3139
+ const agentMessage = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3120
3140
  const responseSentiment = sentiment.analyze(agentMessage);
3121
3141
  if (referenceTone) {
3122
3142
  const referenceSentiment = sentiment.analyze(referenceTone);
@@ -3203,7 +3223,7 @@ function createToolCallAccuracyScorerCode(options) {
3203
3223
  if (isInputInvalid || isOutputInvalid) {
3204
3224
  throw new Error("Input and output messages cannot be null or empty");
3205
3225
  }
3206
- const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
3226
+ const { tools: actualTools, toolCallInfos } = chunk33T2SZZ2_cjs.extractToolCalls(run.output);
3207
3227
  const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
3208
3228
  return {
3209
3229
  expectedTool,
@@ -3278,7 +3298,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
3278
3298
  const itemExpectation = run.expectedTrajectory;
3279
3299
  const effectiveOrdering = itemExpectation?.ordering ?? ordering;
3280
3300
  const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
3281
- const comparison = chunkAY4K3J4R_cjs.compareTrajectories(
3301
+ const comparison = chunk33T2SZZ2_cjs.compareTrajectories(
3282
3302
  actualTrajectory,
3283
3303
  { steps: resolvedExpectedSteps },
3284
3304
  {
@@ -3336,7 +3356,7 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
3336
3356
  const childConfig = expectedStep.children;
3337
3357
  let accuracy;
3338
3358
  if (childConfig.steps && childConfig.steps.length > 0) {
3339
- accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
3359
+ accuracy = chunk33T2SZZ2_cjs.compareTrajectories(
3340
3360
  childTrajectory,
3341
3361
  { steps: childConfig.steps },
3342
3362
  {
@@ -3346,18 +3366,18 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
3346
3366
  );
3347
3367
  }
3348
3368
  const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
3349
- const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(childTrajectory, {
3369
+ const efficiency = hasEfficiencyConfig ? chunk33T2SZZ2_cjs.checkTrajectoryEfficiency(childTrajectory, {
3350
3370
  maxSteps: childConfig.maxSteps,
3351
3371
  maxTotalTokens: childConfig.maxTotalTokens,
3352
3372
  maxTotalDurationMs: childConfig.maxTotalDurationMs,
3353
3373
  noRedundantCalls: childConfig.noRedundantCalls ?? true
3354
3374
  }) : void 0;
3355
3375
  const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
3356
- const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(childTrajectory, {
3376
+ const blacklist = hasBlacklistConfig ? chunk33T2SZZ2_cjs.checkTrajectoryBlacklist(childTrajectory, {
3357
3377
  blacklistedTools: childConfig.blacklistedTools,
3358
3378
  blacklistedSequences: childConfig.blacklistedSequences
3359
3379
  }) : void 0;
3360
- const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(childTrajectory, {
3380
+ const toolFailures = chunk33T2SZZ2_cjs.analyzeToolFailures(childTrajectory, {
3361
3381
  maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
3362
3382
  });
3363
3383
  const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
@@ -3422,7 +3442,7 @@ function createTrajectoryScorerCode(options = {}) {
3422
3442
  }
3423
3443
  let accuracy;
3424
3444
  if (config.steps && config.steps.length > 0) {
3425
- accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
3445
+ accuracy = chunk33T2SZZ2_cjs.compareTrajectories(
3426
3446
  actualTrajectory,
3427
3447
  { steps: config.steps },
3428
3448
  {
@@ -3432,18 +3452,18 @@ function createTrajectoryScorerCode(options = {}) {
3432
3452
  );
3433
3453
  }
3434
3454
  const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
3435
- const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(actualTrajectory, {
3455
+ const efficiency = hasEfficiencyConfig ? chunk33T2SZZ2_cjs.checkTrajectoryEfficiency(actualTrajectory, {
3436
3456
  maxSteps: config.maxSteps,
3437
3457
  maxTotalTokens: config.maxTotalTokens,
3438
3458
  maxTotalDurationMs: config.maxTotalDurationMs,
3439
3459
  noRedundantCalls: config.noRedundantCalls ?? true
3440
3460
  }) : void 0;
3441
3461
  const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
3442
- const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(actualTrajectory, {
3462
+ const blacklist = hasBlacklistConfig ? chunk33T2SZZ2_cjs.checkTrajectoryBlacklist(actualTrajectory, {
3443
3463
  blacklistedTools: config.blacklistedTools,
3444
3464
  blacklistedSequences: config.blacklistedSequences
3445
3465
  }) : void 0;
3446
- const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(actualTrajectory, {
3466
+ const toolFailures = chunk33T2SZZ2_cjs.analyzeToolFailures(actualTrajectory, {
3447
3467
  maxRetriesPerTool: config.maxRetriesPerTool ?? 2
3448
3468
  });
3449
3469
  const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;