@mastra/evals 1.2.3 → 1.2.4-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/dist/{chunk-33T2SZZ2.cjs → chunk-BULMCHKJ.cjs} +20 -16
- package/dist/chunk-BULMCHKJ.cjs.map +1 -0
- package/dist/{chunk-ZRHCSFKL.js → chunk-XOXUFZEG.js} +20 -16
- package/dist/chunk-XOXUFZEG.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/scorers/prebuilt/index.cjs +74 -74
- package/dist/scorers/prebuilt/index.js +1 -1
- package/dist/scorers/utils.cjs +25 -25
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +7 -7
- package/dist/chunk-33T2SZZ2.cjs.map +0 -1
- package/dist/chunk-ZRHCSFKL.js.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunkBULMCHKJ_cjs = require('../../chunk-BULMCHKJ.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var nlp = require('compromise');
|
|
6
6
|
var keyword_extractor = require('keyword-extractor');
|
|
@@ -250,7 +250,7 @@ function createAnswerRelevancyScorer({
|
|
|
250
250
|
description: "Extract relevant statements from the LLM output",
|
|
251
251
|
outputSchema: extractOutputSchema,
|
|
252
252
|
createPrompt: ({ run }) => {
|
|
253
|
-
const assistantMessage =
|
|
253
|
+
const assistantMessage = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
254
254
|
return createExtractPrompt(assistantMessage);
|
|
255
255
|
}
|
|
256
256
|
}).analyze({
|
|
@@ -283,7 +283,7 @@ function createAnswerRelevancyScorer({
|
|
|
283
283
|
]
|
|
284
284
|
},
|
|
285
285
|
createPrompt: ({ run, results }) => {
|
|
286
|
-
const input =
|
|
286
|
+
const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
287
287
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
288
288
|
}
|
|
289
289
|
}).generateScore(({ results }) => {
|
|
@@ -300,13 +300,13 @@ function createAnswerRelevancyScorer({
|
|
|
300
300
|
}
|
|
301
301
|
}
|
|
302
302
|
const score = relevancyCount / numberOfResults;
|
|
303
|
-
return
|
|
303
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * options.scale);
|
|
304
304
|
}).generateReason({
|
|
305
305
|
description: "Reason about the results",
|
|
306
306
|
createPrompt: ({ run, results, score }) => {
|
|
307
307
|
return createReasonPrompt({
|
|
308
|
-
input:
|
|
309
|
-
output:
|
|
308
|
+
input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
309
|
+
output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
310
310
|
score,
|
|
311
311
|
results: results.analyzeStepResult.results,
|
|
312
312
|
scale: options.scale
|
|
@@ -581,7 +581,7 @@ function createAnswerSimilarityScorer({
|
|
|
581
581
|
groundTruth: ""
|
|
582
582
|
});
|
|
583
583
|
}
|
|
584
|
-
const output =
|
|
584
|
+
const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
585
585
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
586
586
|
return createExtractPrompt2({
|
|
587
587
|
output,
|
|
@@ -639,14 +639,14 @@ function createAnswerSimilarityScorer({
|
|
|
639
639
|
);
|
|
640
640
|
score -= extraInfoPenalty;
|
|
641
641
|
score = Math.max(0, Math.min(1, score));
|
|
642
|
-
return
|
|
642
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
643
643
|
}).generateReason({
|
|
644
644
|
description: "Generate explanation of similarity score",
|
|
645
645
|
createPrompt: ({ run, results, score }) => {
|
|
646
646
|
if (!run.groundTruth) {
|
|
647
647
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
648
648
|
}
|
|
649
|
-
const output =
|
|
649
|
+
const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
650
650
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
651
651
|
return createReasonPrompt2({
|
|
652
652
|
output,
|
|
@@ -848,7 +848,7 @@ function createFaithfulnessScorer({
|
|
|
848
848
|
]
|
|
849
849
|
},
|
|
850
850
|
createPrompt: ({ run }) => {
|
|
851
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
851
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
852
852
|
return prompt;
|
|
853
853
|
}
|
|
854
854
|
}).analyze({
|
|
@@ -895,13 +895,13 @@ function createFaithfulnessScorer({
|
|
|
895
895
|
return 0;
|
|
896
896
|
}
|
|
897
897
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
898
|
-
return
|
|
898
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
|
|
899
899
|
}).generateReason({
|
|
900
900
|
description: "Reason about the results",
|
|
901
901
|
createPrompt: ({ run, results, score }) => {
|
|
902
902
|
const prompt = createFaithfulnessReasonPrompt({
|
|
903
|
-
input:
|
|
904
|
-
output:
|
|
903
|
+
input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
904
|
+
output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
905
905
|
context: options?.context ?? getToolInvocationContext(run.output),
|
|
906
906
|
score,
|
|
907
907
|
scale: options?.scale || 1,
|
|
@@ -1046,7 +1046,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1046
1046
|
"opinions"
|
|
1047
1047
|
]
|
|
1048
1048
|
},
|
|
1049
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
1049
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
1050
1050
|
}).analyze({
|
|
1051
1051
|
description: "Score the relevance of the statements to the input",
|
|
1052
1052
|
outputSchema: {
|
|
@@ -1078,7 +1078,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1078
1078
|
},
|
|
1079
1079
|
createPrompt: ({ run, results }) => {
|
|
1080
1080
|
const prompt = createBiasAnalyzePrompt({
|
|
1081
|
-
output:
|
|
1081
|
+
output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1082
1082
|
opinions: results.preprocessStepResult?.opinions || []
|
|
1083
1083
|
});
|
|
1084
1084
|
return prompt;
|
|
@@ -1089,7 +1089,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1089
1089
|
}
|
|
1090
1090
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
1091
1091
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
1092
|
-
return
|
|
1092
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1093
1093
|
}).generateReason({
|
|
1094
1094
|
description: "Reason about the results",
|
|
1095
1095
|
createPrompt: ({ score, results }) => {
|
|
@@ -1320,7 +1320,7 @@ function createHallucinationScorer({
|
|
|
1320
1320
|
]
|
|
1321
1321
|
},
|
|
1322
1322
|
createPrompt: ({ run }) => {
|
|
1323
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1323
|
+
const prompt = createHallucinationExtractPrompt({ output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1324
1324
|
return prompt;
|
|
1325
1325
|
}
|
|
1326
1326
|
}).analyze({
|
|
@@ -1376,7 +1376,7 @@ function createHallucinationScorer({
|
|
|
1376
1376
|
return 0;
|
|
1377
1377
|
}
|
|
1378
1378
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1379
|
-
return
|
|
1379
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
|
|
1380
1380
|
}).generateReason({
|
|
1381
1381
|
description: "Reason about the results",
|
|
1382
1382
|
createPrompt: async ({ run, results, score }) => {
|
|
@@ -1387,8 +1387,8 @@ function createHallucinationScorer({
|
|
|
1387
1387
|
context = options?.context ?? [];
|
|
1388
1388
|
}
|
|
1389
1389
|
const prompt = createHallucinationReasonPrompt({
|
|
1390
|
-
input:
|
|
1391
|
-
output:
|
|
1390
|
+
input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1391
|
+
output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1392
1392
|
context,
|
|
1393
1393
|
score,
|
|
1394
1394
|
scale: options?.scale || 1,
|
|
@@ -1528,8 +1528,8 @@ function createToxicityScorer({
|
|
|
1528
1528
|
},
|
|
1529
1529
|
createPrompt: ({ run }) => {
|
|
1530
1530
|
const prompt = createToxicityAnalyzePrompt({
|
|
1531
|
-
input:
|
|
1532
|
-
output:
|
|
1531
|
+
input: chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1532
|
+
output: chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1533
1533
|
});
|
|
1534
1534
|
return prompt;
|
|
1535
1535
|
}
|
|
@@ -1545,7 +1545,7 @@ function createToxicityScorer({
|
|
|
1545
1545
|
}
|
|
1546
1546
|
}
|
|
1547
1547
|
const score = toxicityCount / numberOfVerdicts;
|
|
1548
|
-
return
|
|
1548
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1549
1549
|
}).generateReason({
|
|
1550
1550
|
description: "Reason about the results",
|
|
1551
1551
|
createPrompt: ({ results, score }) => {
|
|
@@ -1706,7 +1706,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1706
1706
|
if (isInputInvalid || isOutputInvalid) {
|
|
1707
1707
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1708
1708
|
}
|
|
1709
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1709
|
+
const { tools: actualTools, toolCallInfos } = chunkBULMCHKJ_cjs.extractToolCalls(run.output);
|
|
1710
1710
|
return {
|
|
1711
1711
|
actualTools,
|
|
1712
1712
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1716,8 +1716,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1716
1716
|
description: "Analyze the appropriateness of tool selections",
|
|
1717
1717
|
outputSchema: analyzeOutputSchema2,
|
|
1718
1718
|
createPrompt: ({ run, results }) => {
|
|
1719
|
-
const userInput =
|
|
1720
|
-
const agentResponse =
|
|
1719
|
+
const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1720
|
+
const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1721
1721
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1722
1722
|
return createAnalyzePrompt2({
|
|
1723
1723
|
userInput,
|
|
@@ -1734,11 +1734,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1734
1734
|
}
|
|
1735
1735
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1736
1736
|
const totalToolCalls = evaluations.length;
|
|
1737
|
-
return
|
|
1737
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1738
1738
|
}).generateReason({
|
|
1739
1739
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1740
1740
|
createPrompt: ({ run, results, score }) => {
|
|
1741
|
-
const userInput =
|
|
1741
|
+
const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1742
1742
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1743
1743
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1744
1744
|
return createReasonPrompt3({
|
|
@@ -1968,7 +1968,7 @@ var getContext = ({
|
|
|
1968
1968
|
output,
|
|
1969
1969
|
options
|
|
1970
1970
|
}) => {
|
|
1971
|
-
if (options.contextExtractor &&
|
|
1971
|
+
if (options.contextExtractor && chunkBULMCHKJ_cjs.isScorerRunInputForAgent(input) && chunkBULMCHKJ_cjs.isScorerRunOutputForAgent(output)) {
|
|
1972
1972
|
return options.contextExtractor(input, output);
|
|
1973
1973
|
}
|
|
1974
1974
|
return options.context ?? [];
|
|
@@ -1996,8 +1996,8 @@ function createContextRelevanceScorerLLM({
|
|
|
1996
1996
|
description: "Analyze the relevance and utility of provided context",
|
|
1997
1997
|
outputSchema: analyzeOutputSchema3,
|
|
1998
1998
|
createPrompt: ({ run }) => {
|
|
1999
|
-
const userQuery =
|
|
2000
|
-
const agentResponse =
|
|
1999
|
+
const userQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2000
|
+
const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2001
2001
|
const context = getContext({ input: run.input, output: run.output, options });
|
|
2002
2002
|
if (context.length === 0) {
|
|
2003
2003
|
return createAnalyzePrompt3({
|
|
@@ -2045,11 +2045,11 @@ function createContextRelevanceScorerLLM({
|
|
|
2045
2045
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
2046
2046
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
2047
2047
|
const scaledScore = finalScore * (options.scale || 1);
|
|
2048
|
-
return
|
|
2048
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(scaledScore);
|
|
2049
2049
|
}).generateReason({
|
|
2050
2050
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
2051
2051
|
createPrompt: ({ run, results, score }) => {
|
|
2052
|
-
const userQuery =
|
|
2052
|
+
const userQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2053
2053
|
const context = getContext({ input: run.input, output: run.output, options });
|
|
2054
2054
|
if (context.length === 0) {
|
|
2055
2055
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
@@ -2224,7 +2224,7 @@ var getContext2 = ({
|
|
|
2224
2224
|
output,
|
|
2225
2225
|
options
|
|
2226
2226
|
}) => {
|
|
2227
|
-
if (options.contextExtractor &&
|
|
2227
|
+
if (options.contextExtractor && chunkBULMCHKJ_cjs.isScorerRunInputForAgent(input) && chunkBULMCHKJ_cjs.isScorerRunOutputForAgent(output)) {
|
|
2228
2228
|
return options.contextExtractor(input, output);
|
|
2229
2229
|
}
|
|
2230
2230
|
return options.context ?? [];
|
|
@@ -2252,8 +2252,8 @@ function createContextPrecisionScorer({
|
|
|
2252
2252
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
2253
2253
|
outputSchema: contextRelevanceOutputSchema,
|
|
2254
2254
|
createPrompt: ({ run }) => {
|
|
2255
|
-
const input =
|
|
2256
|
-
const output =
|
|
2255
|
+
const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2256
|
+
const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2257
2257
|
const context = getContext2({ input: run.input, output: run.output, options });
|
|
2258
2258
|
if (context.length === 0) {
|
|
2259
2259
|
throw new Error("No context available for evaluation");
|
|
@@ -2286,12 +2286,12 @@ function createContextPrecisionScorer({
|
|
|
2286
2286
|
}
|
|
2287
2287
|
const map = sumPrecision / relevantCount;
|
|
2288
2288
|
const score = map * (options.scale || 1);
|
|
2289
|
-
return
|
|
2289
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(score);
|
|
2290
2290
|
}).generateReason({
|
|
2291
2291
|
description: "Reason about the context precision results",
|
|
2292
2292
|
createPrompt: ({ run, results, score }) => {
|
|
2293
|
-
const input =
|
|
2294
|
-
const output =
|
|
2293
|
+
const input = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2294
|
+
const output = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2295
2295
|
const context = getContext2({ input: run.input, output: run.output, options });
|
|
2296
2296
|
return createContextPrecisionReasonPrompt({
|
|
2297
2297
|
input,
|
|
@@ -2589,8 +2589,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2589
2589
|
description: "Analyze the impact of noise on agent response quality",
|
|
2590
2590
|
outputSchema: analyzeOutputSchema4,
|
|
2591
2591
|
createPrompt: ({ run }) => {
|
|
2592
|
-
const originalQuery =
|
|
2593
|
-
const noisyResponse =
|
|
2592
|
+
const originalQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2593
|
+
const noisyResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2594
2594
|
if (!originalQuery || !noisyResponse) {
|
|
2595
2595
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2596
2596
|
}
|
|
@@ -2633,11 +2633,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2633
2633
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2634
2634
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2635
2635
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2636
|
-
return
|
|
2636
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(finalScore);
|
|
2637
2637
|
}).generateReason({
|
|
2638
2638
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2639
2639
|
createPrompt: ({ run, results, score }) => {
|
|
2640
|
-
const originalQuery =
|
|
2640
|
+
const originalQuery = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2641
2641
|
const analysisResult = results.analyzeStepResult;
|
|
2642
2642
|
if (!analysisResult) {
|
|
2643
2643
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -3049,9 +3049,9 @@ function createPromptAlignmentScorerLLM({
|
|
|
3049
3049
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
3050
3050
|
outputSchema: analyzeOutputSchema5,
|
|
3051
3051
|
createPrompt: ({ run }) => {
|
|
3052
|
-
const userPrompt =
|
|
3053
|
-
const systemPrompt =
|
|
3054
|
-
const agentResponse =
|
|
3052
|
+
const userPrompt = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3053
|
+
const systemPrompt = chunkBULMCHKJ_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
3054
|
+
const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
3055
3055
|
if (evaluationMode === "user" && !userPrompt) {
|
|
3056
3056
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
3057
3057
|
}
|
|
@@ -3087,12 +3087,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
3087
3087
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
3088
3088
|
}
|
|
3089
3089
|
const finalScore = weightedScore * scale;
|
|
3090
|
-
return
|
|
3090
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(finalScore);
|
|
3091
3091
|
}).generateReason({
|
|
3092
3092
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
3093
3093
|
createPrompt: ({ run, results, score }) => {
|
|
3094
|
-
const userPrompt =
|
|
3095
|
-
const systemPrompt =
|
|
3094
|
+
const userPrompt = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3095
|
+
const systemPrompt = chunkBULMCHKJ_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
3096
3096
|
const analysis = results.analyzeStepResult;
|
|
3097
3097
|
if (!analysis) {
|
|
3098
3098
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -3342,8 +3342,8 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
3342
3342
|
description: "Analyze the quality and appropriateness of the agent trajectory",
|
|
3343
3343
|
outputSchema: analyzeOutputSchema6,
|
|
3344
3344
|
createPrompt: ({ run, results }) => {
|
|
3345
|
-
const userInput =
|
|
3346
|
-
const agentResponse =
|
|
3345
|
+
const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3346
|
+
const agentResponse = chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
|
|
3347
3347
|
return createAnalyzePrompt6({
|
|
3348
3348
|
userInput,
|
|
3349
3349
|
agentResponse,
|
|
@@ -3368,11 +3368,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
3368
3368
|
const necessityScore = necessarySteps / totalSteps;
|
|
3369
3369
|
const orderScore = orderedSteps / totalSteps;
|
|
3370
3370
|
const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
|
|
3371
|
-
return
|
|
3371
|
+
return chunkBULMCHKJ_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
|
|
3372
3372
|
}).generateReason({
|
|
3373
3373
|
description: "Generate human-readable explanation of trajectory evaluation",
|
|
3374
3374
|
createPrompt: ({ run, results, score }) => {
|
|
3375
|
-
const userInput =
|
|
3375
|
+
const userInput = chunkBULMCHKJ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3376
3376
|
const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
|
|
3377
3377
|
const missingSteps = results.analyzeStepResult?.missingSteps || [];
|
|
3378
3378
|
const extraSteps = results.analyzeStepResult?.extraSteps || [];
|
|
@@ -3435,18 +3435,18 @@ function createCompletenessScorer() {
|
|
|
3435
3435
|
type: "agent"
|
|
3436
3436
|
}).preprocess(async ({ run }) => {
|
|
3437
3437
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
3438
|
-
const content =
|
|
3438
|
+
const content = chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i);
|
|
3439
3439
|
return content === null || content === void 0;
|
|
3440
3440
|
});
|
|
3441
3441
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
3442
|
-
const content =
|
|
3442
|
+
const content = chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i);
|
|
3443
3443
|
return content === null || content === void 0;
|
|
3444
3444
|
});
|
|
3445
3445
|
if (isInputInvalid || isOutputInvalid) {
|
|
3446
3446
|
throw new Error("Inputs cannot be null or undefined");
|
|
3447
3447
|
}
|
|
3448
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
3449
|
-
const output = run.output?.map((i) =>
|
|
3448
|
+
const input = run.input?.inputMessages.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3449
|
+
const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3450
3450
|
const inputToProcess = input;
|
|
3451
3451
|
const outputToProcess = output;
|
|
3452
3452
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -3551,8 +3551,8 @@ function createTextualDifferenceScorer() {
|
|
|
3551
3551
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
3552
3552
|
type: "agent"
|
|
3553
3553
|
}).preprocess(async ({ run }) => {
|
|
3554
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3555
|
-
const output = run.output?.map((i) =>
|
|
3554
|
+
const input = run.input?.inputMessages?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3555
|
+
const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3556
3556
|
const ratio = calculateRatio(input, output);
|
|
3557
3557
|
const changes = countChanges(input, output);
|
|
3558
3558
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -3575,8 +3575,8 @@ function createKeywordCoverageScorer() {
|
|
|
3575
3575
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
3576
3576
|
type: "agent"
|
|
3577
3577
|
}).preprocess(async ({ run }) => {
|
|
3578
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3579
|
-
const output = run.output?.map((i) =>
|
|
3578
|
+
const input = run.input?.inputMessages?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3579
|
+
const output = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3580
3580
|
if (!input && !output) {
|
|
3581
3581
|
return {
|
|
3582
3582
|
result: {
|
|
@@ -3629,8 +3629,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
3629
3629
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
3630
3630
|
type: "agent"
|
|
3631
3631
|
}).preprocess(async ({ run }) => {
|
|
3632
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
3633
|
-
let processedOutput = run.output.map((i) =>
|
|
3632
|
+
let processedInput = run.input?.inputMessages.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3633
|
+
let processedOutput = run.output.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3634
3634
|
if (ignoreCase) {
|
|
3635
3635
|
processedInput = processedInput.toLowerCase();
|
|
3636
3636
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -3660,7 +3660,7 @@ function createToneScorer(config = {}) {
|
|
|
3660
3660
|
type: "agent"
|
|
3661
3661
|
}).preprocess(async ({ run }) => {
|
|
3662
3662
|
const sentiment = new Sentiment__default.default();
|
|
3663
|
-
const agentMessage = run.output?.map((i) =>
|
|
3663
|
+
const agentMessage = run.output?.map((i) => chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3664
3664
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
3665
3665
|
if (referenceTone) {
|
|
3666
3666
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -3747,7 +3747,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3747
3747
|
if (isInputInvalid || isOutputInvalid) {
|
|
3748
3748
|
throw new Error("Input and output messages cannot be null or empty");
|
|
3749
3749
|
}
|
|
3750
|
-
const { tools: actualTools, toolCallInfos } =
|
|
3750
|
+
const { tools: actualTools, toolCallInfos } = chunkBULMCHKJ_cjs.extractToolCalls(run.output);
|
|
3751
3751
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
3752
3752
|
return {
|
|
3753
3753
|
expectedTool,
|
|
@@ -3822,7 +3822,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3822
3822
|
const itemExpectation = run.expectedTrajectory;
|
|
3823
3823
|
const effectiveOrdering = itemExpectation?.ordering ?? ordering;
|
|
3824
3824
|
const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
|
|
3825
|
-
const comparison =
|
|
3825
|
+
const comparison = chunkBULMCHKJ_cjs.compareTrajectories(
|
|
3826
3826
|
actualTrajectory,
|
|
3827
3827
|
{ steps: resolvedExpectedSteps },
|
|
3828
3828
|
{
|
|
@@ -3880,7 +3880,7 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3880
3880
|
const childConfig = expectedStep.children;
|
|
3881
3881
|
let accuracy;
|
|
3882
3882
|
if (childConfig.steps && childConfig.steps.length > 0) {
|
|
3883
|
-
accuracy =
|
|
3883
|
+
accuracy = chunkBULMCHKJ_cjs.compareTrajectories(
|
|
3884
3884
|
childTrajectory,
|
|
3885
3885
|
{ steps: childConfig.steps },
|
|
3886
3886
|
{
|
|
@@ -3890,18 +3890,18 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3890
3890
|
);
|
|
3891
3891
|
}
|
|
3892
3892
|
const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
|
|
3893
|
-
const efficiency = hasEfficiencyConfig ?
|
|
3893
|
+
const efficiency = hasEfficiencyConfig ? chunkBULMCHKJ_cjs.checkTrajectoryEfficiency(childTrajectory, {
|
|
3894
3894
|
maxSteps: childConfig.maxSteps,
|
|
3895
3895
|
maxTotalTokens: childConfig.maxTotalTokens,
|
|
3896
3896
|
maxTotalDurationMs: childConfig.maxTotalDurationMs,
|
|
3897
3897
|
noRedundantCalls: childConfig.noRedundantCalls ?? true
|
|
3898
3898
|
}) : void 0;
|
|
3899
3899
|
const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
|
|
3900
|
-
const blacklist = hasBlacklistConfig ?
|
|
3900
|
+
const blacklist = hasBlacklistConfig ? chunkBULMCHKJ_cjs.checkTrajectoryBlacklist(childTrajectory, {
|
|
3901
3901
|
blacklistedTools: childConfig.blacklistedTools,
|
|
3902
3902
|
blacklistedSequences: childConfig.blacklistedSequences
|
|
3903
3903
|
}) : void 0;
|
|
3904
|
-
const toolFailures =
|
|
3904
|
+
const toolFailures = chunkBULMCHKJ_cjs.analyzeToolFailures(childTrajectory, {
|
|
3905
3905
|
maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
|
|
3906
3906
|
});
|
|
3907
3907
|
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
|
|
@@ -3966,7 +3966,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3966
3966
|
}
|
|
3967
3967
|
let accuracy;
|
|
3968
3968
|
if (config.steps && config.steps.length > 0) {
|
|
3969
|
-
accuracy =
|
|
3969
|
+
accuracy = chunkBULMCHKJ_cjs.compareTrajectories(
|
|
3970
3970
|
actualTrajectory,
|
|
3971
3971
|
{ steps: config.steps },
|
|
3972
3972
|
{
|
|
@@ -3976,18 +3976,18 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3976
3976
|
);
|
|
3977
3977
|
}
|
|
3978
3978
|
const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
|
|
3979
|
-
const efficiency = hasEfficiencyConfig ?
|
|
3979
|
+
const efficiency = hasEfficiencyConfig ? chunkBULMCHKJ_cjs.checkTrajectoryEfficiency(actualTrajectory, {
|
|
3980
3980
|
maxSteps: config.maxSteps,
|
|
3981
3981
|
maxTotalTokens: config.maxTotalTokens,
|
|
3982
3982
|
maxTotalDurationMs: config.maxTotalDurationMs,
|
|
3983
3983
|
noRedundantCalls: config.noRedundantCalls ?? true
|
|
3984
3984
|
}) : void 0;
|
|
3985
3985
|
const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
|
|
3986
|
-
const blacklist = hasBlacklistConfig ?
|
|
3986
|
+
const blacklist = hasBlacklistConfig ? chunkBULMCHKJ_cjs.checkTrajectoryBlacklist(actualTrajectory, {
|
|
3987
3987
|
blacklistedTools: config.blacklistedTools,
|
|
3988
3988
|
blacklistedSequences: config.blacklistedSequences
|
|
3989
3989
|
}) : void 0;
|
|
3990
|
-
const toolFailures =
|
|
3990
|
+
const toolFailures = chunkBULMCHKJ_cjs.analyzeToolFailures(actualTrajectory, {
|
|
3991
3991
|
maxRetriesPerTool: config.maxRetriesPerTool ?? 2
|
|
3992
3992
|
});
|
|
3993
3993
|
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-
|
|
1
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-XOXUFZEG.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
3
|
import nlp from 'compromise';
|
|
4
4
|
import keyword_extractor from 'keyword-extractor';
|
package/dist/scorers/utils.cjs
CHANGED
|
@@ -1,104 +1,104 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunkBULMCHKJ_cjs = require('../chunk-BULMCHKJ.cjs');
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
Object.defineProperty(exports, "analyzeToolFailures", {
|
|
8
8
|
enumerable: true,
|
|
9
|
-
get: function () { return
|
|
9
|
+
get: function () { return chunkBULMCHKJ_cjs.analyzeToolFailures; }
|
|
10
10
|
});
|
|
11
11
|
Object.defineProperty(exports, "checkTrajectoryBlacklist", {
|
|
12
12
|
enumerable: true,
|
|
13
|
-
get: function () { return
|
|
13
|
+
get: function () { return chunkBULMCHKJ_cjs.checkTrajectoryBlacklist; }
|
|
14
14
|
});
|
|
15
15
|
Object.defineProperty(exports, "checkTrajectoryEfficiency", {
|
|
16
16
|
enumerable: true,
|
|
17
|
-
get: function () { return
|
|
17
|
+
get: function () { return chunkBULMCHKJ_cjs.checkTrajectoryEfficiency; }
|
|
18
18
|
});
|
|
19
19
|
Object.defineProperty(exports, "compareTrajectories", {
|
|
20
20
|
enumerable: true,
|
|
21
|
-
get: function () { return
|
|
21
|
+
get: function () { return chunkBULMCHKJ_cjs.compareTrajectories; }
|
|
22
22
|
});
|
|
23
23
|
Object.defineProperty(exports, "createAgentTestRun", {
|
|
24
24
|
enumerable: true,
|
|
25
|
-
get: function () { return
|
|
25
|
+
get: function () { return chunkBULMCHKJ_cjs.createAgentTestRun; }
|
|
26
26
|
});
|
|
27
27
|
Object.defineProperty(exports, "createTestMessage", {
|
|
28
28
|
enumerable: true,
|
|
29
|
-
get: function () { return
|
|
29
|
+
get: function () { return chunkBULMCHKJ_cjs.createTestMessage; }
|
|
30
30
|
});
|
|
31
31
|
Object.defineProperty(exports, "createTestRun", {
|
|
32
32
|
enumerable: true,
|
|
33
|
-
get: function () { return
|
|
33
|
+
get: function () { return chunkBULMCHKJ_cjs.createTestRun; }
|
|
34
34
|
});
|
|
35
35
|
Object.defineProperty(exports, "createToolInvocation", {
|
|
36
36
|
enumerable: true,
|
|
37
|
-
get: function () { return
|
|
37
|
+
get: function () { return chunkBULMCHKJ_cjs.createToolInvocation; }
|
|
38
38
|
});
|
|
39
39
|
Object.defineProperty(exports, "createTrajectoryTestRun", {
|
|
40
40
|
enumerable: true,
|
|
41
|
-
get: function () { return
|
|
41
|
+
get: function () { return chunkBULMCHKJ_cjs.createTrajectoryTestRun; }
|
|
42
42
|
});
|
|
43
43
|
Object.defineProperty(exports, "extractAgentResponseMessages", {
|
|
44
44
|
enumerable: true,
|
|
45
|
-
get: function () { return
|
|
45
|
+
get: function () { return chunkBULMCHKJ_cjs.extractAgentResponseMessages; }
|
|
46
46
|
});
|
|
47
47
|
Object.defineProperty(exports, "extractInputMessages", {
|
|
48
48
|
enumerable: true,
|
|
49
|
-
get: function () { return
|
|
49
|
+
get: function () { return chunkBULMCHKJ_cjs.extractInputMessages; }
|
|
50
50
|
});
|
|
51
51
|
Object.defineProperty(exports, "extractToolCalls", {
|
|
52
52
|
enumerable: true,
|
|
53
|
-
get: function () { return
|
|
53
|
+
get: function () { return chunkBULMCHKJ_cjs.extractToolCalls; }
|
|
54
54
|
});
|
|
55
55
|
Object.defineProperty(exports, "extractToolResults", {
|
|
56
56
|
enumerable: true,
|
|
57
|
-
get: function () { return
|
|
57
|
+
get: function () { return chunkBULMCHKJ_cjs.extractToolResults; }
|
|
58
58
|
});
|
|
59
59
|
Object.defineProperty(exports, "extractTrajectory", {
|
|
60
60
|
enumerable: true,
|
|
61
|
-
get: function () { return
|
|
61
|
+
get: function () { return chunkBULMCHKJ_cjs.extractTrajectory; }
|
|
62
62
|
});
|
|
63
63
|
Object.defineProperty(exports, "getAssistantMessageFromRunOutput", {
|
|
64
64
|
enumerable: true,
|
|
65
|
-
get: function () { return
|
|
65
|
+
get: function () { return chunkBULMCHKJ_cjs.getAssistantMessageFromRunOutput; }
|
|
66
66
|
});
|
|
67
67
|
Object.defineProperty(exports, "getCombinedSystemPrompt", {
|
|
68
68
|
enumerable: true,
|
|
69
|
-
get: function () { return
|
|
69
|
+
get: function () { return chunkBULMCHKJ_cjs.getCombinedSystemPrompt; }
|
|
70
70
|
});
|
|
71
71
|
Object.defineProperty(exports, "getReasoningFromRunOutput", {
|
|
72
72
|
enumerable: true,
|
|
73
|
-
get: function () { return
|
|
73
|
+
get: function () { return chunkBULMCHKJ_cjs.getReasoningFromRunOutput; }
|
|
74
74
|
});
|
|
75
75
|
Object.defineProperty(exports, "getSystemMessagesFromRunInput", {
|
|
76
76
|
enumerable: true,
|
|
77
|
-
get: function () { return
|
|
77
|
+
get: function () { return chunkBULMCHKJ_cjs.getSystemMessagesFromRunInput; }
|
|
78
78
|
});
|
|
79
79
|
Object.defineProperty(exports, "getTextContentFromMastraDBMessage", {
|
|
80
80
|
enumerable: true,
|
|
81
|
-
get: function () { return
|
|
81
|
+
get: function () { return chunkBULMCHKJ_cjs.getTextContentFromMastraDBMessage; }
|
|
82
82
|
});
|
|
83
83
|
Object.defineProperty(exports, "getUserMessageFromRunInput", {
|
|
84
84
|
enumerable: true,
|
|
85
|
-
get: function () { return
|
|
85
|
+
get: function () { return chunkBULMCHKJ_cjs.getUserMessageFromRunInput; }
|
|
86
86
|
});
|
|
87
87
|
Object.defineProperty(exports, "isCloserTo", {
|
|
88
88
|
enumerable: true,
|
|
89
|
-
get: function () { return
|
|
89
|
+
get: function () { return chunkBULMCHKJ_cjs.isCloserTo; }
|
|
90
90
|
});
|
|
91
91
|
Object.defineProperty(exports, "isScorerRunInputForAgent", {
|
|
92
92
|
enumerable: true,
|
|
93
|
-
get: function () { return
|
|
93
|
+
get: function () { return chunkBULMCHKJ_cjs.isScorerRunInputForAgent; }
|
|
94
94
|
});
|
|
95
95
|
Object.defineProperty(exports, "isScorerRunOutputForAgent", {
|
|
96
96
|
enumerable: true,
|
|
97
|
-
get: function () { return
|
|
97
|
+
get: function () { return chunkBULMCHKJ_cjs.isScorerRunOutputForAgent; }
|
|
98
98
|
});
|
|
99
99
|
Object.defineProperty(exports, "roundToTwoDecimals", {
|
|
100
100
|
enumerable: true,
|
|
101
|
-
get: function () { return
|
|
101
|
+
get: function () { return chunkBULMCHKJ_cjs.roundToTwoDecimals; }
|
|
102
102
|
});
|
|
103
103
|
//# sourceMappingURL=utils.cjs.map
|
|
104
104
|
//# sourceMappingURL=utils.cjs.map
|