@mastra/evals 1.2.1 → 1.2.2-alpha.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/dist/{chunk-AY4K3J4R.cjs → chunk-33T2SZZ2.cjs} +74 -14
- package/dist/chunk-33T2SZZ2.cjs.map +1 -0
- package/dist/{chunk-X4MKZ735.js → chunk-ZRHCSFKL.js} +73 -15
- package/dist/chunk-ZRHCSFKL.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/reference-evals-context-precision.md +3 -3
- package/dist/docs/references/reference-evals-context-relevance.md +3 -3
- package/dist/docs/references/reference-evals-noise-sensitivity.md +6 -6
- package/dist/docs/references/reference-evals-prompt-alignment.md +12 -12
- package/dist/docs/references/reference-evals-scorer-utils.md +3 -3
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
- package/dist/scorers/llm/answer-relevancy/index.d.ts +2 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts +2 -2
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +2 -1
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts +2 -1
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +2 -1
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +4 -4
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +2 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +2 -1
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +105 -85
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +34 -14
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +31 -23
- package/dist/scorers/utils.d.ts +33 -16
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +12 -12
- package/dist/chunk-AY4K3J4R.cjs.map +0 -1
- package/dist/chunk-X4MKZ735.js.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunk33T2SZZ2_cjs = require('../../chunk-33T2SZZ2.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var zod = require('zod');
|
|
6
6
|
var nlp = require('compromise');
|
|
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
|
|
|
239
239
|
description: "Extract relevant statements from the LLM output",
|
|
240
240
|
outputSchema: extractOutputSchema,
|
|
241
241
|
createPrompt: ({ run }) => {
|
|
242
|
-
const assistantMessage =
|
|
242
|
+
const assistantMessage = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
243
243
|
return createExtractPrompt(assistantMessage);
|
|
244
244
|
}
|
|
245
245
|
}).analyze({
|
|
246
246
|
description: "Score the relevance of the statements to the input",
|
|
247
247
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
248
248
|
createPrompt: ({ run, results }) => {
|
|
249
|
-
const input =
|
|
249
|
+
const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
250
250
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
251
251
|
}
|
|
252
252
|
}).generateScore(({ results }) => {
|
|
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
|
|
|
263
263
|
}
|
|
264
264
|
}
|
|
265
265
|
const score = relevancyCount / numberOfResults;
|
|
266
|
-
return
|
|
266
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * options.scale);
|
|
267
267
|
}).generateReason({
|
|
268
268
|
description: "Reason about the results",
|
|
269
269
|
createPrompt: ({ run, results, score }) => {
|
|
270
270
|
return createReasonPrompt({
|
|
271
|
-
input:
|
|
272
|
-
output:
|
|
271
|
+
input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
272
|
+
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
273
273
|
score,
|
|
274
274
|
results: results.analyzeStepResult.results,
|
|
275
275
|
scale: options.scale
|
|
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
|
|
|
466
466
|
groundTruth: ""
|
|
467
467
|
});
|
|
468
468
|
}
|
|
469
|
-
const output =
|
|
469
|
+
const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
470
470
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
471
471
|
return createExtractPrompt2({
|
|
472
472
|
output,
|
|
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
|
|
|
524
524
|
);
|
|
525
525
|
score -= extraInfoPenalty;
|
|
526
526
|
score = Math.max(0, Math.min(1, score));
|
|
527
|
-
return
|
|
527
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
528
528
|
}).generateReason({
|
|
529
529
|
description: "Generate explanation of similarity score",
|
|
530
530
|
createPrompt: ({ run, results, score }) => {
|
|
531
531
|
if (!run.groundTruth) {
|
|
532
532
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
533
533
|
}
|
|
534
|
-
const output =
|
|
534
|
+
const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
535
535
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
536
536
|
return createReasonPrompt2({
|
|
537
537
|
output,
|
|
@@ -698,6 +698,10 @@ Example Responses:
|
|
|
698
698
|
}
|
|
699
699
|
|
|
700
700
|
// src/scorers/llm/faithfulness/index.ts
|
|
701
|
+
var getToolInvocationContext = (output) => {
|
|
702
|
+
if (!Array.isArray(output)) return [];
|
|
703
|
+
return output.filter((message) => message?.role === "assistant").flatMap((message) => message?.content?.toolInvocations ?? []).filter((toolCall) => toolCall.state === "result").map((toolCall) => JSON.stringify(toolCall.result));
|
|
704
|
+
};
|
|
701
705
|
function createFaithfulnessScorer({
|
|
702
706
|
model,
|
|
703
707
|
options
|
|
@@ -717,17 +721,14 @@ function createFaithfulnessScorer({
|
|
|
717
721
|
claims: zod.z.array(zod.z.string())
|
|
718
722
|
}),
|
|
719
723
|
createPrompt: ({ run }) => {
|
|
720
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
724
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
721
725
|
return prompt;
|
|
722
726
|
}
|
|
723
727
|
}).analyze({
|
|
724
728
|
description: "Score the relevance of the statements to the input",
|
|
725
729
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
726
730
|
createPrompt: ({ results, run }) => {
|
|
727
|
-
const
|
|
728
|
-
const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
|
|
729
|
-
(toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
|
|
730
|
-
) ?? [];
|
|
731
|
+
const context = options?.context ?? getToolInvocationContext(run.output);
|
|
731
732
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
732
733
|
claims: results.preprocessStepResult?.claims || [],
|
|
733
734
|
context
|
|
@@ -741,15 +742,14 @@ function createFaithfulnessScorer({
|
|
|
741
742
|
return 0;
|
|
742
743
|
}
|
|
743
744
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
744
|
-
return
|
|
745
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
|
|
745
746
|
}).generateReason({
|
|
746
747
|
description: "Reason about the results",
|
|
747
748
|
createPrompt: ({ run, results, score }) => {
|
|
748
|
-
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
749
749
|
const prompt = createFaithfulnessReasonPrompt({
|
|
750
|
-
input:
|
|
751
|
-
output:
|
|
752
|
-
context:
|
|
750
|
+
input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
751
|
+
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
752
|
+
context: options?.context ?? getToolInvocationContext(run.output),
|
|
753
753
|
score,
|
|
754
754
|
scale: options?.scale || 1,
|
|
755
755
|
verdicts: results.analyzeStepResult?.verdicts || []
|
|
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
|
|
|
881
881
|
outputSchema: zod.z.object({
|
|
882
882
|
opinions: zod.z.array(zod.z.string())
|
|
883
883
|
}),
|
|
884
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
884
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
885
885
|
}).analyze({
|
|
886
886
|
description: "Score the relevance of the statements to the input",
|
|
887
887
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
888
888
|
createPrompt: ({ run, results }) => {
|
|
889
889
|
const prompt = createBiasAnalyzePrompt({
|
|
890
|
-
output:
|
|
890
|
+
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
891
891
|
opinions: results.preprocessStepResult?.opinions || []
|
|
892
892
|
});
|
|
893
893
|
return prompt;
|
|
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
|
|
|
898
898
|
}
|
|
899
899
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
900
900
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
901
|
-
return
|
|
901
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
902
902
|
}).generateReason({
|
|
903
903
|
description: "Reason about the results",
|
|
904
904
|
createPrompt: ({ score, results }) => {
|
|
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
|
|
|
1117
1117
|
claims: zod.z.array(zod.z.string())
|
|
1118
1118
|
}),
|
|
1119
1119
|
createPrompt: ({ run }) => {
|
|
1120
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1120
|
+
const prompt = createHallucinationExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1121
1121
|
return prompt;
|
|
1122
1122
|
}
|
|
1123
1123
|
}).analyze({
|
|
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
|
|
|
1145
1145
|
return 0;
|
|
1146
1146
|
}
|
|
1147
1147
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1148
|
-
return
|
|
1148
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
|
|
1149
1149
|
}).generateReason({
|
|
1150
1150
|
description: "Reason about the results",
|
|
1151
1151
|
createPrompt: async ({ run, results, score }) => {
|
|
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
|
|
|
1156
1156
|
context = options?.context ?? [];
|
|
1157
1157
|
}
|
|
1158
1158
|
const prompt = createHallucinationReasonPrompt({
|
|
1159
|
-
input:
|
|
1160
|
-
output:
|
|
1159
|
+
input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1160
|
+
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1161
1161
|
context,
|
|
1162
1162
|
score,
|
|
1163
1163
|
scale: options?.scale || 1,
|
|
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
|
|
|
1271
1271
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
1272
1272
|
createPrompt: ({ run }) => {
|
|
1273
1273
|
const prompt = createToxicityAnalyzePrompt({
|
|
1274
|
-
input:
|
|
1275
|
-
output:
|
|
1274
|
+
input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1275
|
+
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1276
1276
|
});
|
|
1277
1277
|
return prompt;
|
|
1278
1278
|
}
|
|
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
}
|
|
1290
1290
|
const score = toxicityCount / numberOfVerdicts;
|
|
1291
|
-
return
|
|
1291
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1292
1292
|
}).generateReason({
|
|
1293
1293
|
description: "Reason about the results",
|
|
1294
1294
|
createPrompt: ({ results, score }) => {
|
|
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1422
1422
|
if (isInputInvalid || isOutputInvalid) {
|
|
1423
1423
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1424
1424
|
}
|
|
1425
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1425
|
+
const { tools: actualTools, toolCallInfos } = chunk33T2SZZ2_cjs.extractToolCalls(run.output);
|
|
1426
1426
|
return {
|
|
1427
1427
|
actualTools,
|
|
1428
1428
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1432
1432
|
description: "Analyze the appropriateness of tool selections",
|
|
1433
1433
|
outputSchema: analyzeOutputSchema2,
|
|
1434
1434
|
createPrompt: ({ run, results }) => {
|
|
1435
|
-
const userInput =
|
|
1436
|
-
const agentResponse =
|
|
1435
|
+
const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1436
|
+
const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1437
1437
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1438
1438
|
return createAnalyzePrompt2({
|
|
1439
1439
|
userInput,
|
|
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1450
1450
|
}
|
|
1451
1451
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1452
1452
|
const totalToolCalls = evaluations.length;
|
|
1453
|
-
return
|
|
1453
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1454
1454
|
}).generateReason({
|
|
1455
1455
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1456
1456
|
createPrompt: ({ run, results, score }) => {
|
|
1457
|
-
const userInput =
|
|
1457
|
+
const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1458
1458
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1459
1459
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1460
1460
|
return createReasonPrompt3({
|
|
@@ -1636,6 +1636,16 @@ var DEFAULT_PENALTIES = {
|
|
|
1636
1636
|
MAX_MISSING_CONTEXT_PENALTY: 0.5
|
|
1637
1637
|
// Maximum 50% penalty for missing context
|
|
1638
1638
|
};
|
|
1639
|
+
var getContext = ({
|
|
1640
|
+
input,
|
|
1641
|
+
output,
|
|
1642
|
+
options
|
|
1643
|
+
}) => {
|
|
1644
|
+
if (options.contextExtractor && chunk33T2SZZ2_cjs.isScorerRunInputForAgent(input) && chunk33T2SZZ2_cjs.isScorerRunOutputForAgent(output)) {
|
|
1645
|
+
return options.contextExtractor(input, output);
|
|
1646
|
+
}
|
|
1647
|
+
return options.context ?? [];
|
|
1648
|
+
};
|
|
1639
1649
|
function createContextRelevanceScorerLLM({
|
|
1640
1650
|
model,
|
|
1641
1651
|
options
|
|
@@ -1659,9 +1669,9 @@ function createContextRelevanceScorerLLM({
|
|
|
1659
1669
|
description: "Analyze the relevance and utility of provided context",
|
|
1660
1670
|
outputSchema: analyzeOutputSchema3,
|
|
1661
1671
|
createPrompt: ({ run }) => {
|
|
1662
|
-
const userQuery =
|
|
1663
|
-
const agentResponse =
|
|
1664
|
-
const context =
|
|
1672
|
+
const userQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1673
|
+
const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1674
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1665
1675
|
if (context.length === 0) {
|
|
1666
1676
|
return createAnalyzePrompt3({
|
|
1667
1677
|
userQuery,
|
|
@@ -1677,7 +1687,7 @@ function createContextRelevanceScorerLLM({
|
|
|
1677
1687
|
}
|
|
1678
1688
|
}).generateScore(({ results, run }) => {
|
|
1679
1689
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1680
|
-
const context =
|
|
1690
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1681
1691
|
if (context.length === 0) {
|
|
1682
1692
|
return 1 * (options.scale || 1);
|
|
1683
1693
|
}
|
|
@@ -1708,12 +1718,12 @@ function createContextRelevanceScorerLLM({
|
|
|
1708
1718
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1709
1719
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1710
1720
|
const scaledScore = finalScore * (options.scale || 1);
|
|
1711
|
-
return
|
|
1721
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(scaledScore);
|
|
1712
1722
|
}).generateReason({
|
|
1713
1723
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1714
1724
|
createPrompt: ({ run, results, score }) => {
|
|
1715
|
-
const userQuery =
|
|
1716
|
-
const context =
|
|
1725
|
+
const userQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1726
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1717
1727
|
if (context.length === 0) {
|
|
1718
1728
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
1719
1729
|
}
|
|
@@ -1860,6 +1870,16 @@ var contextRelevanceOutputSchema = zod.z.object({
|
|
|
1860
1870
|
})
|
|
1861
1871
|
)
|
|
1862
1872
|
});
|
|
1873
|
+
var getContext2 = ({
|
|
1874
|
+
input,
|
|
1875
|
+
output,
|
|
1876
|
+
options
|
|
1877
|
+
}) => {
|
|
1878
|
+
if (options.contextExtractor && chunk33T2SZZ2_cjs.isScorerRunInputForAgent(input) && chunk33T2SZZ2_cjs.isScorerRunOutputForAgent(output)) {
|
|
1879
|
+
return options.contextExtractor(input, output);
|
|
1880
|
+
}
|
|
1881
|
+
return options.context ?? [];
|
|
1882
|
+
};
|
|
1863
1883
|
function createContextPrecisionScorer({
|
|
1864
1884
|
model,
|
|
1865
1885
|
options
|
|
@@ -1883,9 +1903,9 @@ function createContextPrecisionScorer({
|
|
|
1883
1903
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1884
1904
|
outputSchema: contextRelevanceOutputSchema,
|
|
1885
1905
|
createPrompt: ({ run }) => {
|
|
1886
|
-
const input =
|
|
1887
|
-
const output =
|
|
1888
|
-
const context =
|
|
1906
|
+
const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1907
|
+
const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1908
|
+
const context = getContext2({ input: run.input, output: run.output, options });
|
|
1889
1909
|
if (context.length === 0) {
|
|
1890
1910
|
throw new Error("No context available for evaluation");
|
|
1891
1911
|
}
|
|
@@ -1917,13 +1937,13 @@ function createContextPrecisionScorer({
|
|
|
1917
1937
|
}
|
|
1918
1938
|
const map = sumPrecision / relevantCount;
|
|
1919
1939
|
const score = map * (options.scale || 1);
|
|
1920
|
-
return
|
|
1940
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(score);
|
|
1921
1941
|
}).generateReason({
|
|
1922
1942
|
description: "Reason about the context precision results",
|
|
1923
1943
|
createPrompt: ({ run, results, score }) => {
|
|
1924
|
-
const input =
|
|
1925
|
-
const output =
|
|
1926
|
-
const context =
|
|
1944
|
+
const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1945
|
+
const output = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1946
|
+
const context = getContext2({ input: run.input, output: run.output, options });
|
|
1927
1947
|
return createContextPrecisionReasonPrompt({
|
|
1928
1948
|
input,
|
|
1929
1949
|
output,
|
|
@@ -2177,8 +2197,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2177
2197
|
description: "Analyze the impact of noise on agent response quality",
|
|
2178
2198
|
outputSchema: analyzeOutputSchema4,
|
|
2179
2199
|
createPrompt: ({ run }) => {
|
|
2180
|
-
const originalQuery =
|
|
2181
|
-
const noisyResponse =
|
|
2200
|
+
const originalQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2201
|
+
const noisyResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2182
2202
|
if (!originalQuery || !noisyResponse) {
|
|
2183
2203
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2184
2204
|
}
|
|
@@ -2221,11 +2241,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2221
2241
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2222
2242
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2223
2243
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2224
|
-
return
|
|
2244
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(finalScore);
|
|
2225
2245
|
}).generateReason({
|
|
2226
2246
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2227
2247
|
createPrompt: ({ run, results, score }) => {
|
|
2228
|
-
const originalQuery =
|
|
2248
|
+
const originalQuery = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2229
2249
|
const analysisResult = results.analyzeStepResult;
|
|
2230
2250
|
if (!analysisResult) {
|
|
2231
2251
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -2550,17 +2570,17 @@ function createPromptAlignmentScorerLLM({
|
|
|
2550
2570
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
2551
2571
|
outputSchema: analyzeOutputSchema5,
|
|
2552
2572
|
createPrompt: ({ run }) => {
|
|
2553
|
-
const userPrompt =
|
|
2554
|
-
const systemPrompt =
|
|
2555
|
-
const agentResponse =
|
|
2573
|
+
const userPrompt = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2574
|
+
const systemPrompt = chunk33T2SZZ2_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2575
|
+
const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2556
2576
|
if (evaluationMode === "user" && !userPrompt) {
|
|
2557
2577
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
2558
2578
|
}
|
|
2559
2579
|
if (evaluationMode === "system" && !systemPrompt) {
|
|
2560
2580
|
throw new Error("System prompt is required for system prompt alignment scoring");
|
|
2561
2581
|
}
|
|
2562
|
-
if (evaluationMode === "both" &&
|
|
2563
|
-
throw new Error("
|
|
2582
|
+
if (evaluationMode === "both" && !userPrompt && !systemPrompt) {
|
|
2583
|
+
throw new Error("A user or system prompt is required for combined alignment scoring");
|
|
2564
2584
|
}
|
|
2565
2585
|
if (!agentResponse) {
|
|
2566
2586
|
throw new Error("Agent response is required for prompt alignment scoring");
|
|
@@ -2588,12 +2608,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
2588
2608
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
2589
2609
|
}
|
|
2590
2610
|
const finalScore = weightedScore * scale;
|
|
2591
|
-
return
|
|
2611
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(finalScore);
|
|
2592
2612
|
}).generateReason({
|
|
2593
2613
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
2594
2614
|
createPrompt: ({ run, results, score }) => {
|
|
2595
|
-
const userPrompt =
|
|
2596
|
-
const systemPrompt =
|
|
2615
|
+
const userPrompt = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2616
|
+
const systemPrompt = chunk33T2SZZ2_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2597
2617
|
const analysis = results.analyzeStepResult;
|
|
2598
2618
|
if (!analysis) {
|
|
2599
2619
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -2798,8 +2818,8 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2798
2818
|
description: "Analyze the quality and appropriateness of the agent trajectory",
|
|
2799
2819
|
outputSchema: analyzeOutputSchema6,
|
|
2800
2820
|
createPrompt: ({ run, results }) => {
|
|
2801
|
-
const userInput =
|
|
2802
|
-
const agentResponse =
|
|
2821
|
+
const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2822
|
+
const agentResponse = chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
|
|
2803
2823
|
return createAnalyzePrompt6({
|
|
2804
2824
|
userInput,
|
|
2805
2825
|
agentResponse,
|
|
@@ -2824,11 +2844,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2824
2844
|
const necessityScore = necessarySteps / totalSteps;
|
|
2825
2845
|
const orderScore = orderedSteps / totalSteps;
|
|
2826
2846
|
const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
|
|
2827
|
-
return
|
|
2847
|
+
return chunk33T2SZZ2_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
|
|
2828
2848
|
}).generateReason({
|
|
2829
2849
|
description: "Generate human-readable explanation of trajectory evaluation",
|
|
2830
2850
|
createPrompt: ({ run, results, score }) => {
|
|
2831
|
-
const userInput =
|
|
2851
|
+
const userInput = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2832
2852
|
const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
|
|
2833
2853
|
const missingSteps = results.analyzeStepResult?.missingSteps || [];
|
|
2834
2854
|
const extraSteps = results.analyzeStepResult?.extraSteps || [];
|
|
@@ -2891,18 +2911,18 @@ function createCompletenessScorer() {
|
|
|
2891
2911
|
type: "agent"
|
|
2892
2912
|
}).preprocess(async ({ run }) => {
|
|
2893
2913
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
2894
|
-
const content =
|
|
2914
|
+
const content = chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i);
|
|
2895
2915
|
return content === null || content === void 0;
|
|
2896
2916
|
});
|
|
2897
2917
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
2898
|
-
const content =
|
|
2918
|
+
const content = chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i);
|
|
2899
2919
|
return content === null || content === void 0;
|
|
2900
2920
|
});
|
|
2901
2921
|
if (isInputInvalid || isOutputInvalid) {
|
|
2902
2922
|
throw new Error("Inputs cannot be null or undefined");
|
|
2903
2923
|
}
|
|
2904
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
2905
|
-
const output = run.output?.map((i) =>
|
|
2924
|
+
const input = run.input?.inputMessages.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2925
|
+
const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2906
2926
|
const inputToProcess = input;
|
|
2907
2927
|
const outputToProcess = output;
|
|
2908
2928
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -3007,8 +3027,8 @@ function createTextualDifferenceScorer() {
|
|
|
3007
3027
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
3008
3028
|
type: "agent"
|
|
3009
3029
|
}).preprocess(async ({ run }) => {
|
|
3010
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3011
|
-
const output = run.output?.map((i) =>
|
|
3030
|
+
const input = run.input?.inputMessages?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3031
|
+
const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3012
3032
|
const ratio = calculateRatio(input, output);
|
|
3013
3033
|
const changes = countChanges(input, output);
|
|
3014
3034
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -3031,8 +3051,8 @@ function createKeywordCoverageScorer() {
|
|
|
3031
3051
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
3032
3052
|
type: "agent"
|
|
3033
3053
|
}).preprocess(async ({ run }) => {
|
|
3034
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3035
|
-
const output = run.output?.map((i) =>
|
|
3054
|
+
const input = run.input?.inputMessages?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3055
|
+
const output = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3036
3056
|
if (!input && !output) {
|
|
3037
3057
|
return {
|
|
3038
3058
|
result: {
|
|
@@ -3085,8 +3105,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
3085
3105
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
3086
3106
|
type: "agent"
|
|
3087
3107
|
}).preprocess(async ({ run }) => {
|
|
3088
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
3089
|
-
let processedOutput = run.output.map((i) =>
|
|
3108
|
+
let processedInput = run.input?.inputMessages.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3109
|
+
let processedOutput = run.output.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3090
3110
|
if (ignoreCase) {
|
|
3091
3111
|
processedInput = processedInput.toLowerCase();
|
|
3092
3112
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -3116,7 +3136,7 @@ function createToneScorer(config = {}) {
|
|
|
3116
3136
|
type: "agent"
|
|
3117
3137
|
}).preprocess(async ({ run }) => {
|
|
3118
3138
|
const sentiment = new Sentiment__default.default();
|
|
3119
|
-
const agentMessage = run.output?.map((i) =>
|
|
3139
|
+
const agentMessage = run.output?.map((i) => chunk33T2SZZ2_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3120
3140
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
3121
3141
|
if (referenceTone) {
|
|
3122
3142
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -3203,7 +3223,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3203
3223
|
if (isInputInvalid || isOutputInvalid) {
|
|
3204
3224
|
throw new Error("Input and output messages cannot be null or empty");
|
|
3205
3225
|
}
|
|
3206
|
-
const { tools: actualTools, toolCallInfos } =
|
|
3226
|
+
const { tools: actualTools, toolCallInfos } = chunk33T2SZZ2_cjs.extractToolCalls(run.output);
|
|
3207
3227
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
3208
3228
|
return {
|
|
3209
3229
|
expectedTool,
|
|
@@ -3278,7 +3298,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3278
3298
|
const itemExpectation = run.expectedTrajectory;
|
|
3279
3299
|
const effectiveOrdering = itemExpectation?.ordering ?? ordering;
|
|
3280
3300
|
const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
|
|
3281
|
-
const comparison =
|
|
3301
|
+
const comparison = chunk33T2SZZ2_cjs.compareTrajectories(
|
|
3282
3302
|
actualTrajectory,
|
|
3283
3303
|
{ steps: resolvedExpectedSteps },
|
|
3284
3304
|
{
|
|
@@ -3336,7 +3356,7 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3336
3356
|
const childConfig = expectedStep.children;
|
|
3337
3357
|
let accuracy;
|
|
3338
3358
|
if (childConfig.steps && childConfig.steps.length > 0) {
|
|
3339
|
-
accuracy =
|
|
3359
|
+
accuracy = chunk33T2SZZ2_cjs.compareTrajectories(
|
|
3340
3360
|
childTrajectory,
|
|
3341
3361
|
{ steps: childConfig.steps },
|
|
3342
3362
|
{
|
|
@@ -3346,18 +3366,18 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3346
3366
|
);
|
|
3347
3367
|
}
|
|
3348
3368
|
const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
|
|
3349
|
-
const efficiency = hasEfficiencyConfig ?
|
|
3369
|
+
const efficiency = hasEfficiencyConfig ? chunk33T2SZZ2_cjs.checkTrajectoryEfficiency(childTrajectory, {
|
|
3350
3370
|
maxSteps: childConfig.maxSteps,
|
|
3351
3371
|
maxTotalTokens: childConfig.maxTotalTokens,
|
|
3352
3372
|
maxTotalDurationMs: childConfig.maxTotalDurationMs,
|
|
3353
3373
|
noRedundantCalls: childConfig.noRedundantCalls ?? true
|
|
3354
3374
|
}) : void 0;
|
|
3355
3375
|
const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
|
|
3356
|
-
const blacklist = hasBlacklistConfig ?
|
|
3376
|
+
const blacklist = hasBlacklistConfig ? chunk33T2SZZ2_cjs.checkTrajectoryBlacklist(childTrajectory, {
|
|
3357
3377
|
blacklistedTools: childConfig.blacklistedTools,
|
|
3358
3378
|
blacklistedSequences: childConfig.blacklistedSequences
|
|
3359
3379
|
}) : void 0;
|
|
3360
|
-
const toolFailures =
|
|
3380
|
+
const toolFailures = chunk33T2SZZ2_cjs.analyzeToolFailures(childTrajectory, {
|
|
3361
3381
|
maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
|
|
3362
3382
|
});
|
|
3363
3383
|
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
|
|
@@ -3422,7 +3442,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3422
3442
|
}
|
|
3423
3443
|
let accuracy;
|
|
3424
3444
|
if (config.steps && config.steps.length > 0) {
|
|
3425
|
-
accuracy =
|
|
3445
|
+
accuracy = chunk33T2SZZ2_cjs.compareTrajectories(
|
|
3426
3446
|
actualTrajectory,
|
|
3427
3447
|
{ steps: config.steps },
|
|
3428
3448
|
{
|
|
@@ -3432,18 +3452,18 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3432
3452
|
);
|
|
3433
3453
|
}
|
|
3434
3454
|
const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
|
|
3435
|
-
const efficiency = hasEfficiencyConfig ?
|
|
3455
|
+
const efficiency = hasEfficiencyConfig ? chunk33T2SZZ2_cjs.checkTrajectoryEfficiency(actualTrajectory, {
|
|
3436
3456
|
maxSteps: config.maxSteps,
|
|
3437
3457
|
maxTotalTokens: config.maxTotalTokens,
|
|
3438
3458
|
maxTotalDurationMs: config.maxTotalDurationMs,
|
|
3439
3459
|
noRedundantCalls: config.noRedundantCalls ?? true
|
|
3440
3460
|
}) : void 0;
|
|
3441
3461
|
const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
|
|
3442
|
-
const blacklist = hasBlacklistConfig ?
|
|
3462
|
+
const blacklist = hasBlacklistConfig ? chunk33T2SZZ2_cjs.checkTrajectoryBlacklist(actualTrajectory, {
|
|
3443
3463
|
blacklistedTools: config.blacklistedTools,
|
|
3444
3464
|
blacklistedSequences: config.blacklistedSequences
|
|
3445
3465
|
}) : void 0;
|
|
3446
|
-
const toolFailures =
|
|
3466
|
+
const toolFailures = chunk33T2SZZ2_cjs.analyzeToolFailures(actualTrajectory, {
|
|
3447
3467
|
maxRetriesPerTool: config.maxRetriesPerTool ?? 2
|
|
3448
3468
|
});
|
|
3449
3469
|
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
|