@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- var chunkW3U7MMDX_cjs = require('../../chunk-W3U7MMDX.cjs');
3
+ var chunkXRUR5PBK_cjs = require('../../chunk-XRUR5PBK.cjs');
4
4
  var evals = require('@mastra/core/evals');
5
5
  var zod = require('zod');
6
6
  var nlp = require('compromise');
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
239
239
  description: "Extract relevant statements from the LLM output",
240
240
  outputSchema: extractOutputSchema,
241
241
  createPrompt: ({ run }) => {
242
- const assistantMessage = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
242
+ const assistantMessage = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
243
243
  return createExtractPrompt(assistantMessage);
244
244
  }
245
245
  }).analyze({
246
246
  description: "Score the relevance of the statements to the input",
247
247
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
248
248
  createPrompt: ({ run, results }) => {
249
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
249
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
250
250
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
251
251
  }
252
252
  }).generateScore(({ results }) => {
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
263
263
  }
264
264
  }
265
265
  const score = relevancyCount / numberOfResults;
266
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * options.scale);
266
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * options.scale);
267
267
  }).generateReason({
268
268
  description: "Reason about the results",
269
269
  createPrompt: ({ run, results, score }) => {
270
270
  return createReasonPrompt({
271
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
271
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
273
273
  score,
274
274
  results: results.analyzeStepResult.results,
275
275
  scale: options.scale
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
466
466
  groundTruth: ""
467
467
  });
468
468
  }
469
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
469
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
470
470
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
471
471
  return createExtractPrompt2({
472
472
  output,
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
524
524
  );
525
525
  score -= extraInfoPenalty;
526
526
  score = Math.max(0, Math.min(1, score));
527
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * mergedOptions.scale);
527
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * mergedOptions.scale);
528
528
  }).generateReason({
529
529
  description: "Generate explanation of similarity score",
530
530
  createPrompt: ({ run, results, score }) => {
531
531
  if (!run.groundTruth) {
532
532
  return "No ground truth was provided for comparison. Score is 0 by default.";
533
533
  }
534
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
534
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
535
535
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
536
536
  return createReasonPrompt2({
537
537
  output,
@@ -717,7 +717,7 @@ function createFaithfulnessScorer({
717
717
  claims: zod.z.array(zod.z.string())
718
718
  }),
719
719
  createPrompt: ({ run }) => {
720
- const prompt = createFaithfulnessExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
720
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
721
721
  return prompt;
722
722
  }
723
723
  }).analyze({
@@ -741,14 +741,14 @@ function createFaithfulnessScorer({
741
741
  return 0;
742
742
  }
743
743
  const score = supportedClaims / totalClaims * (options?.scale || 1);
744
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
744
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
745
745
  }).generateReason({
746
746
  description: "Reason about the results",
747
747
  createPrompt: ({ run, results, score }) => {
748
748
  const assistantMessage = run.output.find(({ role }) => role === "assistant");
749
749
  const prompt = createFaithfulnessReasonPrompt({
750
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
750
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
752
752
  context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
753
753
  score,
754
754
  scale: options?.scale || 1,
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
881
881
  outputSchema: zod.z.object({
882
882
  opinions: zod.z.array(zod.z.string())
883
883
  }),
884
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
884
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
885
885
  }).analyze({
886
886
  description: "Score the relevance of the statements to the input",
887
887
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
888
888
  createPrompt: ({ run, results }) => {
889
889
  const prompt = createBiasAnalyzePrompt({
890
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
890
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
891
891
  opinions: results.preprocessStepResult?.opinions || []
892
892
  });
893
893
  return prompt;
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
898
898
  }
899
899
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
900
900
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
901
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
901
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
902
902
  }).generateReason({
903
903
  description: "Reason about the results",
904
904
  createPrompt: ({ score, results }) => {
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
1117
1117
  claims: zod.z.array(zod.z.string())
1118
1118
  }),
1119
1119
  createPrompt: ({ run }) => {
1120
- const prompt = createHallucinationExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1120
+ const prompt = createHallucinationExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1121
1121
  return prompt;
1122
1122
  }
1123
1123
  }).analyze({
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
1145
1145
  return 0;
1146
1146
  }
1147
1147
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
1148
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
1148
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
1149
1149
  }).generateReason({
1150
1150
  description: "Reason about the results",
1151
1151
  createPrompt: async ({ run, results, score }) => {
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
1156
1156
  context = options?.context ?? [];
1157
1157
  }
1158
1158
  const prompt = createHallucinationReasonPrompt({
1159
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1159
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1161
1161
  context,
1162
1162
  score,
1163
1163
  scale: options?.scale || 1,
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
1271
1271
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
1272
1272
  createPrompt: ({ run }) => {
1273
1273
  const prompt = createToxicityAnalyzePrompt({
1274
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1274
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1276
1276
  });
1277
1277
  return prompt;
1278
1278
  }
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
1288
1288
  }
1289
1289
  }
1290
1290
  const score = toxicityCount / numberOfVerdicts;
1291
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1291
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1292
1292
  }).generateReason({
1293
1293
  description: "Reason about the results",
1294
1294
  createPrompt: ({ results, score }) => {
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1422
1422
  if (isInputInvalid || isOutputInvalid) {
1423
1423
  throw new Error("Input and output messages cannot be null or empty");
1424
1424
  }
1425
- const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
1425
+ const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
1426
1426
  return {
1427
1427
  actualTools,
1428
1428
  hasToolCalls: actualTools.length > 0,
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1432
1432
  description: "Analyze the appropriateness of tool selections",
1433
1433
  outputSchema: analyzeOutputSchema2,
1434
1434
  createPrompt: ({ run, results }) => {
1435
- const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1435
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1437
1437
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
1438
1438
  return createAnalyzePrompt2({
1439
1439
  userInput,
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1450
1450
  }
1451
1451
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
1452
1452
  const totalToolCalls = evaluations.length;
1453
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1453
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1454
1454
  }).generateReason({
1455
1455
  description: "Generate human-readable explanation of tool selection evaluation",
1456
1456
  createPrompt: ({ run, results, score }) => {
1457
- const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1457
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1458
1458
  const evaluations = results.analyzeStepResult?.evaluations || [];
1459
1459
  const missingTools = results.analyzeStepResult?.missingTools || [];
1460
1460
  return createReasonPrompt3({
@@ -1659,8 +1659,8 @@ function createContextRelevanceScorerLLM({
1659
1659
  description: "Analyze the relevance and utility of provided context",
1660
1660
  outputSchema: analyzeOutputSchema3,
1661
1661
  createPrompt: ({ run }) => {
1662
- const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1663
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1662
+ const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1663
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1664
1664
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1665
1665
  if (context.length === 0) {
1666
1666
  return createAnalyzePrompt3({
@@ -1708,11 +1708,11 @@ function createContextRelevanceScorerLLM({
1708
1708
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
1709
1709
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
1710
1710
  const scaledScore = finalScore * (options.scale || 1);
1711
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(scaledScore);
1711
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(scaledScore);
1712
1712
  }).generateReason({
1713
1713
  description: "Generate human-readable explanation of context relevance evaluation",
1714
1714
  createPrompt: ({ run, results, score }) => {
1715
- const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1715
+ const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1716
1716
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1717
1717
  if (context.length === 0) {
1718
1718
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -1883,8 +1883,8 @@ function createContextPrecisionScorer({
1883
1883
  description: "Evaluate the relevance of each context piece for generating the expected output",
1884
1884
  outputSchema: contextRelevanceOutputSchema,
1885
1885
  createPrompt: ({ run }) => {
1886
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1887
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1886
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1887
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1888
1888
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1889
1889
  if (context.length === 0) {
1890
1890
  throw new Error("No context available for evaluation");
@@ -1917,12 +1917,12 @@ function createContextPrecisionScorer({
1917
1917
  }
1918
1918
  const map = sumPrecision / relevantCount;
1919
1919
  const score = map * (options.scale || 1);
1920
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
1920
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
1921
1921
  }).generateReason({
1922
1922
  description: "Reason about the context precision results",
1923
1923
  createPrompt: ({ run, results, score }) => {
1924
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1925
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1924
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1925
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1926
1926
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1927
1927
  return createContextPrecisionReasonPrompt({
1928
1928
  input,
@@ -2177,8 +2177,8 @@ function createNoiseSensitivityScorerLLM({
2177
2177
  description: "Analyze the impact of noise on agent response quality",
2178
2178
  outputSchema: analyzeOutputSchema4,
2179
2179
  createPrompt: ({ run }) => {
2180
- const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2181
- const noisyResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2180
+ const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2181
+ const noisyResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2182
2182
  if (!originalQuery || !noisyResponse) {
2183
2183
  throw new Error("Both original query and noisy response are required for evaluation");
2184
2184
  }
@@ -2221,11 +2221,11 @@ function createNoiseSensitivityScorerLLM({
2221
2221
  const majorIssues = analysisResult.majorIssues || [];
2222
2222
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
2223
2223
  finalScore = Math.max(0, finalScore - issuesPenalty);
2224
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
2224
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
2225
2225
  }).generateReason({
2226
2226
  description: "Generate human-readable explanation of noise sensitivity evaluation",
2227
2227
  createPrompt: ({ run, results, score }) => {
2228
- const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2228
+ const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2229
2229
  const analysisResult = results.analyzeStepResult;
2230
2230
  if (!analysisResult) {
2231
2231
  throw new Error("Analysis step failed to produce results for reason generation");
@@ -2550,9 +2550,9 @@ function createPromptAlignmentScorerLLM({
2550
2550
  description: "Analyze prompt-response alignment across multiple dimensions",
2551
2551
  outputSchema: analyzeOutputSchema5,
2552
2552
  createPrompt: ({ run }) => {
2553
- const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2554
- const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
2555
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2553
+ const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2554
+ const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
2555
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2556
2556
  if (evaluationMode === "user" && !userPrompt) {
2557
2557
  throw new Error("User prompt is required for user prompt alignment scoring");
2558
2558
  }
@@ -2588,12 +2588,12 @@ function createPromptAlignmentScorerLLM({
2588
2588
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
2589
2589
  }
2590
2590
  const finalScore = weightedScore * scale;
2591
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
2591
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
2592
2592
  }).generateReason({
2593
2593
  description: "Generate human-readable explanation of prompt alignment evaluation",
2594
2594
  createPrompt: ({ run, results, score }) => {
2595
- const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2596
- const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
2595
+ const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2596
+ const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
2597
2597
  const analysis = results.analyzeStepResult;
2598
2598
  if (!analysis) {
2599
2599
  return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2609,6 +2609,245 @@ function createPromptAlignmentScorerLLM({
2609
2609
  }
2610
2610
  });
2611
2611
  }
2612
+
2613
+ // src/scorers/llm/trajectory/prompts.ts
2614
+ var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
2615
+ You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
2616
+
2617
+ CORE RESPONSIBILITIES:
2618
+ - Analyze the full sequence of actions the agent took
2619
+ - Evaluate whether each step was necessary and well-ordered
2620
+ - Identify unnecessary, redundant, or missing steps
2621
+ - Assess the overall quality of the agent's action path
2622
+
2623
+ EVALUATION PHILOSOPHY:
2624
+ - Consider both the individual steps AND the overall flow
2625
+ - A good trajectory is efficient, logical, and complete
2626
+ - Redundant steps reduce quality even if the final result is correct
2627
+ - Missing critical steps are a significant issue
2628
+ - Order matters: logical dependencies should be respected
2629
+
2630
+ OUTPUT REQUIREMENTS:
2631
+ - Provide clear reasoning for your trajectory assessment
2632
+ - Use provided JSON schema exactly as specified
2633
+ - Be consistent in your evaluation standards
2634
+ `;
2635
+ var createAnalyzePrompt6 = ({
2636
+ userInput,
2637
+ agentResponse,
2638
+ actualTrajectory,
2639
+ expectedTrajectory
2640
+ }) => {
2641
+ let prompt = `
2642
+ You are evaluating whether an AI agent took an appropriate sequence of actions to fulfill a user request.
2643
+
2644
+ USER REQUEST: "${userInput}"
2645
+ AGENT FINAL RESPONSE: "${agentResponse}"
2646
+
2647
+ ACTUAL TRAJECTORY (sequence of actions the agent took):
2648
+ ${actualTrajectory}
2649
+ `;
2650
+ if (expectedTrajectory) {
2651
+ prompt += `
2652
+ EXPECTED TRAJECTORY (the ideal sequence):
2653
+ ${expectedTrajectory}
2654
+
2655
+ EVALUATION CRITERIA:
2656
+ 1. STEP PRESENCE: Did the agent perform all expected steps?
2657
+ 2. STEP ORDER: Were the steps in a logical order? (Expected order is a guideline, not absolute)
2658
+ 3. EXTRA STEPS: Did the agent take unnecessary steps not in the expected trajectory?
2659
+ 4. MISSING STEPS: Are any expected steps missing from the actual trajectory?
2660
+ 5. STEP QUALITY: For each step that matches, was it executed appropriately?
2661
+
2662
+ For each actual step, evaluate:
2663
+ - Does it correspond to an expected step?
2664
+ - Was it necessary for the task?
2665
+ - Was it in the right position in the sequence?
2666
+ `;
2667
+ } else {
2668
+ prompt += `
2669
+ EVALUATION CRITERIA (no expected trajectory provided - evaluate based on the task):
2670
+ 1. COMPLETENESS: Did the agent take all necessary steps to fulfill the request?
2671
+ 2. EFFICIENCY: Were there any redundant or unnecessary steps?
2672
+ 3. ORDERING: Were the steps in a logical order given their dependencies?
2673
+ 4. APPROPRIATENESS: Was each step appropriate for the task?
2674
+ `;
2675
+ }
2676
+ prompt += `
2677
+ Evaluate each step and the overall trajectory quality.
2678
+ `;
2679
+ return prompt;
2680
+ };
2681
+ var createReasonPrompt7 = ({
2682
+ userInput,
2683
+ score,
2684
+ stepEvaluations,
2685
+ missingSteps,
2686
+ extraSteps
2687
+ }) => {
2688
+ return `
2689
+ Explain this trajectory evaluation in ONE SENTENCE.
2690
+
2691
+ User Request: "${userInput}"
2692
+ Score: ${score}/1
2693
+ Steps Evaluated: ${JSON.stringify(stepEvaluations)}
2694
+ Missing Steps: ${JSON.stringify(missingSteps)}
2695
+ Extra/Unnecessary Steps: ${JSON.stringify(extraSteps)}
2696
+
2697
+ Provide a single, concise sentence explaining why this score was given.
2698
+ `;
2699
+ };
2700
+
2701
+ // src/scorers/llm/trajectory/index.ts
2702
+ var analyzeOutputSchema6 = zod.z.object({
2703
+ stepEvaluations: zod.z.array(
2704
+ zod.z.object({
2705
+ stepName: zod.z.string().describe("Name of the step (tool name or action)"),
2706
+ wasNecessary: zod.z.boolean().describe("Whether this step was necessary for the task"),
2707
+ wasInOrder: zod.z.boolean().describe("Whether this step was in a logical position in the sequence"),
2708
+ reasoning: zod.z.string().describe("Brief explanation of the evaluation")
2709
+ })
2710
+ ),
2711
+ missingSteps: zod.z.array(zod.z.string()).optional().describe("Steps that should have been taken but were not"),
2712
+ extraSteps: zod.z.array(zod.z.string()).optional().describe("Steps that were unnecessary or redundant"),
2713
+ overallAssessment: zod.z.string().describe("Brief overall assessment of the trajectory quality")
2714
+ });
2715
+ function formatStepDetails(step) {
2716
+ switch (step.stepType) {
2717
+ case "tool_call":
2718
+ case "mcp_tool_call": {
2719
+ const parts = [];
2720
+ if (step.toolArgs !== void 0) parts.push(`args: ${JSON.stringify(step.toolArgs)}`);
2721
+ if (step.toolResult !== void 0) parts.push(`result: ${JSON.stringify(step.toolResult)}`);
2722
+ return parts.length > 0 ? ` (${parts.join(", ")})` : "";
2723
+ }
2724
+ case "model_generation":
2725
+ return step.modelId ? ` (model: ${step.modelId})` : "";
2726
+ case "workflow_step":
2727
+ return step.output !== void 0 ? ` (output: ${JSON.stringify(step.output)})` : "";
2728
+ default:
2729
+ return "";
2730
+ }
2731
+ }
2732
+ function formatTrajectory(trajectory, indent = 0) {
2733
+ const prefix = " ".repeat(indent);
2734
+ return trajectory.steps.map((step, i) => {
2735
+ let line = `${prefix}${i + 1}. [${step.stepType}] ${step.name}${formatStepDetails(step)}`;
2736
+ if (step.children && step.children.length > 0) {
2737
+ line += `
2738
+ ${formatTrajectory({ steps: step.children }, indent + 1)}`;
2739
+ }
2740
+ return line;
2741
+ }).join("\n");
2742
+ }
2743
+ function formatExpectedSteps(steps, indent = 0) {
2744
+ const prefix = " ".repeat(indent);
2745
+ return steps.map((step, i) => {
2746
+ const typeStr = step.stepType ? `[${step.stepType}] ` : "";
2747
+ const dataStr = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
2748
+ let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
2749
+ if (step.children?.steps && step.children.steps.length > 0) {
2750
+ line += `
2751
+ ${formatExpectedSteps(step.children.steps, indent + 1)}`;
2752
+ }
2753
+ return line;
2754
+ }).join("\n");
2755
+ }
2756
+ function createTrajectoryAccuracyScorerLLM({
2757
+ model,
2758
+ expectedTrajectory: staticExpectedTrajectory
2759
+ }) {
2760
+ return evals.createScorer({
2761
+ id: "llm-trajectory-accuracy-scorer",
2762
+ name: "Trajectory Accuracy (LLM)",
2763
+ description: staticExpectedTrajectory ? "Evaluates the trajectory against an expected trajectory using LLM analysis" : "Evaluates the quality and appropriateness of the trajectory using LLM analysis",
2764
+ judge: {
2765
+ model,
2766
+ instructions: TRAJECTORY_EVALUATION_INSTRUCTIONS
2767
+ },
2768
+ type: "trajectory"
2769
+ }).preprocess(async ({ run }) => {
2770
+ const actualTrajectory = run.output;
2771
+ let expectedSteps;
2772
+ if (staticExpectedTrajectory) {
2773
+ if (Array.isArray(staticExpectedTrajectory)) {
2774
+ expectedSteps = staticExpectedTrajectory;
2775
+ } else {
2776
+ expectedSteps = staticExpectedTrajectory.steps.map((s) => {
2777
+ const result = { name: s.name, stepType: s.stepType };
2778
+ const data = {};
2779
+ if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolArgs !== void 0)
2780
+ data.input = s.toolArgs;
2781
+ if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
2782
+ data.output = s.toolResult;
2783
+ if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
2784
+ if (Object.keys(data).length > 0) result.data = data;
2785
+ if (s.children && s.children.length > 0) {
2786
+ result.children = {
2787
+ steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
2788
+ };
2789
+ }
2790
+ return result;
2791
+ });
2792
+ }
2793
+ } else if (run.expectedTrajectory) {
2794
+ const expectation = run.expectedTrajectory;
2795
+ expectedSteps = expectation.steps && expectation.steps.length > 0 ? expectation.steps : void 0;
2796
+ }
2797
+ return {
2798
+ actualTrajectory,
2799
+ actualTrajectoryFormatted: formatTrajectory(actualTrajectory),
2800
+ expectedTrajectoryFormatted: expectedSteps ? formatExpectedSteps(expectedSteps) : void 0,
2801
+ hasSteps: actualTrajectory.steps.length > 0
2802
+ };
2803
+ }).analyze({
2804
+ description: "Analyze the quality and appropriateness of the agent trajectory",
2805
+ outputSchema: analyzeOutputSchema6,
2806
+ createPrompt: ({ run, results }) => {
2807
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2808
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
2809
+ return createAnalyzePrompt6({
2810
+ userInput,
2811
+ agentResponse,
2812
+ actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
2813
+ expectedTrajectory: results.preprocessStepResult?.expectedTrajectoryFormatted
2814
+ });
2815
+ }
2816
+ }).generateScore(({ results }) => {
2817
+ const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
2818
+ if (stepEvaluations.length === 0) {
2819
+ const missingSteps2 = results.analyzeStepResult?.missingSteps || [];
2820
+ const extraSteps = results.analyzeStepResult?.extraSteps || [];
2821
+ if (missingSteps2.length > 0) return 0;
2822
+ if (extraSteps.length > 0) return 0.5;
2823
+ return 1;
2824
+ }
2825
+ const necessarySteps = stepEvaluations.filter((e) => e.wasNecessary).length;
2826
+ const orderedSteps = stepEvaluations.filter((e) => e.wasInOrder).length;
2827
+ const totalSteps = stepEvaluations.length;
2828
+ const missingSteps = results.analyzeStepResult?.missingSteps || [];
2829
+ const missingPenalty = missingSteps.length > 0 ? missingSteps.length / (totalSteps + missingSteps.length) : 0;
2830
+ const necessityScore = necessarySteps / totalSteps;
2831
+ const orderScore = orderedSteps / totalSteps;
2832
+ const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
2833
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
2834
+ }).generateReason({
2835
+ description: "Generate human-readable explanation of trajectory evaluation",
2836
+ createPrompt: ({ run, results, score }) => {
2837
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2838
+ const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
2839
+ const missingSteps = results.analyzeStepResult?.missingSteps || [];
2840
+ const extraSteps = results.analyzeStepResult?.extraSteps || [];
2841
+ return createReasonPrompt7({
2842
+ userInput,
2843
+ score,
2844
+ stepEvaluations,
2845
+ missingSteps,
2846
+ extraSteps
2847
+ });
2848
+ }
2849
+ });
2850
+ }
2612
2851
  function normalizeString(str) {
2613
2852
  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
2614
2853
  }
@@ -2658,18 +2897,18 @@ function createCompletenessScorer() {
2658
2897
  type: "agent"
2659
2898
  }).preprocess(async ({ run }) => {
2660
2899
  const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
2661
- const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
2900
+ const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
2662
2901
  return content === null || content === void 0;
2663
2902
  });
2664
2903
  const isOutputInvalid = !run.output || run.output.some((i) => {
2665
- const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
2904
+ const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
2666
2905
  return content === null || content === void 0;
2667
2906
  });
2668
2907
  if (isInputInvalid || isOutputInvalid) {
2669
2908
  throw new Error("Inputs cannot be null or undefined");
2670
2909
  }
2671
- const input = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2672
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2910
+ const input = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2911
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2673
2912
  const inputToProcess = input;
2674
2913
  const outputToProcess = output;
2675
2914
  const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -2774,8 +3013,8 @@ function createTextualDifferenceScorer() {
2774
3013
  description: "Calculate textual difference between input and output using sequence matching algorithms.",
2775
3014
  type: "agent"
2776
3015
  }).preprocess(async ({ run }) => {
2777
- const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2778
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3016
+ const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3017
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2779
3018
  const ratio = calculateRatio(input, output);
2780
3019
  const changes = countChanges(input, output);
2781
3020
  const maxLength = Math.max(input.length, output.length);
@@ -2798,8 +3037,8 @@ function createKeywordCoverageScorer() {
2798
3037
  description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
2799
3038
  type: "agent"
2800
3039
  }).preprocess(async ({ run }) => {
2801
- const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2802
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3040
+ const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3041
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2803
3042
  if (!input && !output) {
2804
3043
  return {
2805
3044
  result: {
@@ -2852,8 +3091,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
2852
3091
  description: "Calculates content similarity between input and output messages using string comparison algorithms.",
2853
3092
  type: "agent"
2854
3093
  }).preprocess(async ({ run }) => {
2855
- let processedInput = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2856
- let processedOutput = run.output.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3094
+ let processedInput = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3095
+ let processedOutput = run.output.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2857
3096
  if (ignoreCase) {
2858
3097
  processedInput = processedInput.toLowerCase();
2859
3098
  processedOutput = processedOutput.toLowerCase();
@@ -2883,7 +3122,7 @@ function createToneScorer(config = {}) {
2883
3122
  type: "agent"
2884
3123
  }).preprocess(async ({ run }) => {
2885
3124
  const sentiment = new Sentiment__default.default();
2886
- const agentMessage = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3125
+ const agentMessage = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2887
3126
  const responseSentiment = sentiment.analyze(agentMessage);
2888
3127
  if (referenceTone) {
2889
3128
  const referenceSentiment = sentiment.analyze(referenceTone);
@@ -2970,7 +3209,7 @@ function createToolCallAccuracyScorerCode(options) {
2970
3209
  if (isInputInvalid || isOutputInvalid) {
2971
3210
  throw new Error("Input and output messages cannot be null or empty");
2972
3211
  }
2973
- const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
3212
+ const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
2974
3213
  const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
2975
3214
  return {
2976
3215
  expectedTool,
@@ -2995,6 +3234,343 @@ function createToolCallAccuracyScorerCode(options) {
2995
3234
  });
2996
3235
  });
2997
3236
  }
3237
/**
 * Converts a recorded trajectory step into an "expected step" descriptor.
 *
 * Keeps the step's name and stepType, and — depending on the step kind —
 * carries observed data forward as the expected data: tool calls contribute
 * `toolArgs`/`toolResult` as `data.input`/`data.output`, workflow steps
 * contribute `output` as `data.output`. Children are converted recursively.
 */
function trajectoryStepToExpectedStep(step) {
  const expected = { name: step.name, stepType: step.stepType };

  const payload = {};
  switch (step.stepType) {
    case "tool_call":
    case "mcp_tool_call":
      if (step.toolArgs !== undefined) payload.input = step.toolArgs;
      if (step.toolResult !== undefined) payload.output = step.toolResult;
      break;
    case "workflow_step":
      if (step.output !== undefined) payload.output = step.output;
      break;
    default:
      // Other step kinds carry no comparable data.
      break;
  }
  if (Object.keys(payload).length > 0) {
    expected.data = payload;
  }

  // Recurse into nested steps so sub-trajectories are comparable too.
  if (step.children?.length) {
    expected.children = {
      steps: step.children.map(trajectoryStepToExpectedStep)
    };
  }

  return expected;
}
3254
/**
 * Extracts the expected step list from a dataset-item expectation.
 * Returns the same `steps` array reference, or `undefined` when the
 * expectation has no steps or an empty list.
 */
function expectationToExpectedSteps(expectation) {
  const { steps } = expectation;
  return steps?.length ? steps : undefined;
}
3258
/**
 * Creates a code-based (non-LLM) trajectory accuracy scorer.
 *
 * The expected trajectory can come from two places:
 *  - statically via `options.expectedTrajectory` (either a bare list of
 *    expected steps or a recorded trajectory object with a `steps` array), or
 *  - per dataset item via `run.expectedTrajectory`.
 * Item-level `ordering` / `compareStepData` / `allowRepeatedSteps` settings
 * override the scorer-level comparison options. The score is the comparison
 * score from `compareTrajectories`, or 0 when no comparison could be made.
 */
function createTrajectoryAccuracyScorerCode(options = {}) {
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
  // An explicit `ordering` wins; otherwise the boolean `strictOrder` selects strict/relaxed.
  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");

  // Normalize the statically-configured expectation into a flat step list.
  const resolveStaticSteps = (trajectory) => {
    if (!trajectory) return void 0;
    // A non-empty array whose first entry lacks a `steps` key is already a bare expected-step list.
    if (Array.isArray(trajectory) && trajectory.length > 0 && !("steps" in trajectory[0])) {
      return trajectory;
    }
    // A recorded trajectory object: convert each recorded step into an expected step.
    if ("steps" in trajectory) {
      return trajectory.steps.map(trajectoryStepToExpectedStep);
    }
    return void 0;
  };
  const staticExpectedSteps = resolveStaticSteps(staticExpectedTrajectory);

  const buildDescription = () => {
    if (!staticExpectedSteps) {
      return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
    }
    const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
    return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
  };

  return evals.createScorer({
    id: "code-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy Scorer",
    description: buildDescription(),
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    // Static configuration takes precedence; fall back to the dataset item's expectation.
    const expectedSteps = staticExpectedSteps ?? (run.expectedTrajectory ? expectationToExpectedSteps(run.expectedTrajectory) : void 0);
    if (!expectedSteps || expectedSteps.length === 0) {
      // Nothing to compare against — record an error; generateScore will yield 0.
      return {
        actualTrajectory,
        expectedTrajectory: void 0,
        comparison: void 0,
        actualStepNames: actualTrajectory.steps.map((s) => s.name),
        expectedStepNames: [],
        error: "No expected trajectory provided (pass via options or dataset item expectedTrajectory)"
      };
    }
    // Item-level comparison settings override the scorer-level ones.
    const itemExpectation = run.expectedTrajectory;
    const comparison = chunkXRUR5PBK_cjs.compareTrajectories(
      actualTrajectory,
      { steps: expectedSteps },
      {
        ordering: itemExpectation?.ordering ?? resolvedOrdering,
        compareStepData: itemExpectation?.compareStepData ?? compareStepData,
        allowRepeatedSteps: itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps
      }
    );
    return {
      actualTrajectory,
      expectedTrajectory: { steps: expectedSteps },
      comparison,
      actualStepNames: actualTrajectory.steps.map((s) => s.name),
      expectedStepNames: expectedSteps.map((s) => s.name)
    };
  }).generateScore(({ results }) => {
    const pre = results.preprocessStepResult;
    // No comparison (missing expectation) scores 0.
    if (!pre?.comparison) return 0;
    return pre.comparison.score;
  });
}
3320
/**
 * Recursively evaluates child-level expectations for each expected step that
 * declares `children`.
 *
 * For every such expected step it finds the first unmatched actual step with
 * the same name (and, when specified, the same stepType), then scores that
 * step's child trajectory on up to four dimensions: accuracy (weight 0.4),
 * efficiency (0.3), tool failures (0.2, only when failure patterns exist),
 * and blacklist (0.1). A blacklist violation at this level — or in any nested
 * result — forces the step's score to 0. When deeper nesting exists, the
 * final step score blends 70% of this level's weighted score with 30% of the
 * average nested score. Returns one result entry per expected step that had
 * children; expected steps without `children` are skipped entirely.
 */
function evaluateNestedExpectations(expectedSteps, actualSteps) {
  const results = [];
  // Indices of actual steps already consumed, so two expected steps with the
  // same name match distinct actual steps.
  const matchedIndices = /* @__PURE__ */ new Set();
  for (const expectedStep of expectedSteps) {
    // Only expected steps with child expectations participate.
    if (!expectedStep.children) continue;
    // First unmatched actual step with matching name (and stepType, if given).
    const matchIndex = actualSteps.findIndex(
      (s, i) => !matchedIndices.has(i) && s.name === expectedStep.name && (!expectedStep.stepType || s.stepType === expectedStep.stepType)
    );
    const actualStep = matchIndex >= 0 ? actualSteps[matchIndex] : void 0;
    if (matchIndex >= 0) matchedIndices.add(matchIndex);
    if (!actualStep?.children || actualStep.children.length === 0) {
      // The matched step has no children (or no step matched at all): score 0,
      // and report every expected child step as missing when any were expected.
      const expectedStepCount = expectedStep.children.steps?.length ?? 0;
      results.push({
        stepName: expectedStep.name,
        score: 0,
        accuracy: expectedStepCount > 0 ? {
          score: 0,
          matchedSteps: 0,
          totalExpectedSteps: expectedStepCount,
          totalActualSteps: 0,
          missingSteps: expectedStep.children.steps.map((s) => s.name),
          extraSteps: [],
          outOfOrderSteps: [],
          repeatedSteps: []
        } : void 0
      });
      continue;
    }
    // Wrap the actual step's children as a standalone trajectory for the helpers.
    const childTrajectory = {
      steps: actualStep.children,
      totalDurationMs: actualStep.durationMs
    };
    const childConfig = expectedStep.children;
    // Accuracy: only when explicit child steps were configured.
    let accuracy;
    if (childConfig.steps && childConfig.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        childTrajectory,
        { steps: childConfig.steps },
        {
          ordering: childConfig.ordering ?? "relaxed",
          compareStepData: childConfig.compareStepData ?? false,
          allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
        }
      );
    }
    // Efficiency: only when at least one budget/redundancy setting is present.
    const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(childTrajectory, {
      maxSteps: childConfig.maxSteps,
      maxTotalTokens: childConfig.maxTotalTokens,
      maxTotalDurationMs: childConfig.maxTotalDurationMs,
      noRedundantCalls: childConfig.noRedundantCalls ?? true
    }) : void 0;
    // Blacklist: only when forbidden tools or sequences are configured.
    const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(childTrajectory, {
      blacklistedTools: childConfig.blacklistedTools,
      blacklistedSequences: childConfig.blacklistedSequences
    }) : void 0;
    // Tool-failure analysis always runs (default retry budget: 2 per tool).
    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(childTrajectory, {
      maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
    });
    // Recurse one level deeper when the child config itself declares steps.
    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
    // Collect the applicable weighted components for this level.
    const scores = [];
    if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
    if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
    if (blacklist) {
      if (blacklist.score === 0) {
        // Hard blacklist violation: step scores 0 regardless of other components.
        results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
        continue;
      }
      scores.push({ weight: 0.1, value: blacklist.score });
    }
    // Weighted average over whichever components apply; weights are
    // re-normalized so they always sum to 1. No components => perfect 1.
    let levelScore = 1;
    if (scores.length > 0) {
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }
    let finalScore = levelScore;
    if (nested.length > 0) {
      // A blacklist violation anywhere in the nested results zeroes this step too.
      const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
      if (hasNestedBlacklistViolation) {
        results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
        continue;
      }
      // Blend this level (70%) with the average of the nested scores (30%).
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      finalScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    results.push({
      stepName: expectedStep.name,
      // Rounded to two decimal places.
      score: Math.round(finalScore * 100) / 100,
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested.length > 0 ? nested : void 0
    });
  }
  return results;
}
3419
/**
 * Creates the code-based multi-dimensional trajectory scorer.
 *
 * Evaluates an agent trajectory (`run.output`) on up to four dimensions —
 * accuracy against expected steps (weight 0.4), efficiency budgets (0.3),
 * tool-failure patterns (0.2), and blacklisted tools/sequences (0.1) — plus
 * recursively evaluated nested (child-step) expectations. Configuration is
 * merged from `options.defaults` and the dataset item's
 * `run.expectedTrajectory`, with item values winning. A blacklist violation
 * at the top level or in any nested result forces the score to 0; with
 * nothing configured at all the score is 1. `generateReason` renders a
 * line-per-dimension human-readable breakdown.
 */
function createTrajectoryScorerCode(options = {}) {
  const { defaults = {} } = options;
  return evals.createScorer({
    id: "code-trajectory-scorer",
    name: "Trajectory Scorer",
    description: "Multi-dimensional trajectory evaluation: accuracy, efficiency, blacklist, and tool failures",
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    // Item-level expectation (if any) overrides the scorer-level defaults.
    const itemExpectation = run.expectedTrajectory ?? {};
    const config = { ...defaults, ...itemExpectation };
    if (itemExpectation.steps !== void 0) {
      // NOTE(review): the spread above already copies `steps` when present,
      // so this reassignment looks redundant — kept as-is to preserve behavior.
      config.steps = itemExpectation.steps;
    }
    // Accuracy: only computed when expected steps are configured.
    let accuracy;
    if (config.steps && config.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        actualTrajectory,
        { steps: config.steps },
        {
          ordering: config.ordering ?? "relaxed",
          compareStepData: config.compareStepData ?? false,
          allowRepeatedSteps: config.allowRepeatedSteps ?? true
        }
      );
    }
    // Efficiency: only when at least one budget/redundancy setting is present.
    const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(actualTrajectory, {
      maxSteps: config.maxSteps,
      maxTotalTokens: config.maxTotalTokens,
      maxTotalDurationMs: config.maxTotalDurationMs,
      noRedundantCalls: config.noRedundantCalls ?? true
    }) : void 0;
    // Blacklist: only when forbidden tools or sequences are configured.
    const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(actualTrajectory, {
      blacklistedTools: config.blacklistedTools,
      blacklistedSequences: config.blacklistedSequences
    }) : void 0;
    // Tool-failure analysis always runs (default retry budget: 2 per tool).
    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(actualTrajectory, {
      maxRetriesPerTool: config.maxRetriesPerTool ?? 2
    });
    // Child-step expectations are evaluated recursively off the same config.steps.
    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
    return {
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested && nested.length > 0 ? nested : void 0,
      config
    };
  }).generateScore(({ results }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    // Hard top-level blacklist violation zeroes the whole score.
    if (blacklist && blacklist.score === 0) {
      return 0;
    }
    // Collect only the dimensions that were actually evaluated.
    const scores = [];
    if (accuracy) {
      scores.push({ weight: 0.4, value: accuracy.score });
    }
    if (efficiency) {
      scores.push({ weight: 0.3, value: efficiency.score });
    }
    if (toolFailures && toolFailures.patterns.length > 0) {
      scores.push({ weight: 0.2, value: toolFailures.score });
    }
    if (blacklist) {
      scores.push({ weight: 0.1, value: blacklist.score });
    }
    // Nothing configured at all: trivially perfect.
    if (scores.length === 0 && !nested) {
      return 1;
    }
    // Weighted average over the applicable dimensions; weights are
    // re-normalized so they always sum to 1.
    let levelScore = 1;
    if (scores.length > 0) {
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }
    if (nested && nested.length > 0) {
      // Any nested blacklist violation also zeroes the overall score.
      const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
      if (hasNestedBlacklistViolation) {
        return 0;
      }
      // Blend top level (70%) with the average nested score (30%).
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      levelScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    // Rounded to two decimal places.
    return Math.round(levelScore * 100) / 100;
  }).generateReason(({ results, score }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    const parts = [];
    parts.push(`Score: ${score}`);
    // Blacklist violations short-circuit the explanation: report and return.
    if (blacklist && blacklist.score === 0) {
      const violations = [];
      if (blacklist.violatedTools.length > 0) {
        violations.push(`forbidden tools used: ${blacklist.violatedTools.join(", ")}`);
      }
      if (blacklist.violatedSequences.length > 0) {
        violations.push(`forbidden sequences: ${blacklist.violatedSequences.map((s) => s.join(" \u2192 ")).join("; ")}`);
      }
      parts.push(`Blacklist violation: ${violations.join(". ")}.`);
      return parts.join("\n");
    }
    if (nested && nested.some((r) => r.blacklist && r.blacklist.score === 0)) {
      const violating = nested.filter((r) => r.blacklist && r.blacklist.score === 0).map((r) => r.stepName);
      parts.push(`Nested blacklist violation in: ${violating.join(", ")}.`);
      return parts.join("\n");
    }
    // One summary line per evaluated dimension.
    if (accuracy) {
      const details = [`${accuracy.matchedSteps}/${accuracy.totalExpectedSteps} expected steps matched`];
      if (accuracy.missingSteps.length > 0) {
        details.push(`missing: ${accuracy.missingSteps.join(", ")}`);
      }
      if (accuracy.extraSteps.length > 0) {
        details.push(`extra: ${accuracy.extraSteps.join(", ")}`);
      }
      if (accuracy.outOfOrderSteps.length > 0) {
        details.push(`out of order: ${accuracy.outOfOrderSteps.join(", ")}`);
      }
      parts.push(`Accuracy (${accuracy.score}): ${details.join(". ")}.`);
    }
    if (efficiency) {
      const details = [];
      if (efficiency.overStepBudget) {
        details.push(`over step budget (${efficiency.totalSteps} steps)`);
      }
      if (efficiency.overTokenBudget) {
        details.push(`over token budget (${efficiency.totalTokens} tokens)`);
      }
      if (efficiency.overDurationBudget) {
        details.push(`over duration budget (${efficiency.totalDurationMs}ms)`);
      }
      if (efficiency.redundantCalls.length > 0) {
        details.push(`redundant calls: ${efficiency.redundantCalls.map((c) => c.name).join(", ")}`);
      }
      if (details.length > 0) {
        parts.push(`Efficiency (${efficiency.score}): ${details.join(". ")}.`);
      } else {
        parts.push(`Efficiency (${efficiency.score}): all budgets met, no redundant calls.`);
      }
    }
    if (toolFailures && toolFailures.patterns.length > 0) {
      const details = [];
      if (toolFailures.totalRetries > 0) {
        details.push(`${toolFailures.totalRetries} total retries`);
      }
      if (toolFailures.excessiveRetryTools.length > 0) {
        details.push(`excessive retries: ${toolFailures.excessiveRetryTools.join(", ")}`);
      }
      parts.push(`Tool failures (${toolFailures.score}): ${details.join(". ")}.`);
    }
    if (nested && nested.length > 0) {
      const nestedSummary = nested.map((r) => `${r.stepName}: ${r.score}`).join(", ");
      parts.push(`Nested scores: ${nestedSummary}.`);
    }
    return parts.join("\n");
  });
}
2998
3574
 
2999
3575
  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
3000
3576
  exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
@@ -3017,5 +3593,8 @@ exports.createToneScorer = createToneScorer;
3017
3593
  exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
3018
3594
  exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
3019
3595
  exports.createToxicityScorer = createToxicityScorer;
3596
+ exports.createTrajectoryAccuracyScorerCode = createTrajectoryAccuracyScorerCode;
3597
+ exports.createTrajectoryAccuracyScorerLLM = createTrajectoryAccuracyScorerLLM;
3598
+ exports.createTrajectoryScorerCode = createTrajectoryScorerCode;
3020
3599
  //# sourceMappingURL=index.cjs.map
3021
3600
  //# sourceMappingURL=index.cjs.map