@mastra/evals 1.2.4-alpha.0 → 1.3.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
- package/dist/chunk-BE5F2OUQ.js.map +1 -0
- package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
- package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
- package/dist/docs/SKILL.md +2 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-overview.md +2 -2
- package/dist/docs/references/reference-evals-answer-relevancy.md +1 -1
- package/dist/docs/references/reference-evals-answer-similarity.md +1 -1
- package/dist/docs/references/reference-evals-bias.md +1 -1
- package/dist/docs/references/reference-evals-context-precision.md +3 -3
- package/dist/docs/references/reference-evals-context-relevance.md +11 -11
- package/dist/docs/references/reference-evals-faithfulness.md +1 -1
- package/dist/docs/references/reference-evals-hallucination.md +5 -5
- package/dist/docs/references/reference-evals-noise-sensitivity.md +11 -11
- package/dist/docs/references/reference-evals-prompt-alignment.md +15 -15
- package/dist/docs/references/reference-evals-rubric.md +113 -0
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +3 -3
- package/dist/docs/references/reference-evals-toxicity.md +1 -1
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/rubric/index.d.ts +71 -0
- package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
- package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
- package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +276 -78
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +203 -6
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +25 -25
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +9 -8
- package/dist/chunk-BULMCHKJ.cjs.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunkUNQXHPOD_cjs = require('../../chunk-UNQXHPOD.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var nlp = require('compromise');
|
|
6
6
|
var keyword_extractor = require('keyword-extractor');
|
|
@@ -250,7 +250,7 @@ function createAnswerRelevancyScorer({
|
|
|
250
250
|
description: "Extract relevant statements from the LLM output",
|
|
251
251
|
outputSchema: extractOutputSchema,
|
|
252
252
|
createPrompt: ({ run }) => {
|
|
253
|
-
const assistantMessage =
|
|
253
|
+
const assistantMessage = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
254
254
|
return createExtractPrompt(assistantMessage);
|
|
255
255
|
}
|
|
256
256
|
}).analyze({
|
|
@@ -283,7 +283,7 @@ function createAnswerRelevancyScorer({
|
|
|
283
283
|
]
|
|
284
284
|
},
|
|
285
285
|
createPrompt: ({ run, results }) => {
|
|
286
|
-
const input =
|
|
286
|
+
const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
287
287
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
288
288
|
}
|
|
289
289
|
}).generateScore(({ results }) => {
|
|
@@ -300,13 +300,13 @@ function createAnswerRelevancyScorer({
|
|
|
300
300
|
}
|
|
301
301
|
}
|
|
302
302
|
const score = relevancyCount / numberOfResults;
|
|
303
|
-
return
|
|
303
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * options.scale);
|
|
304
304
|
}).generateReason({
|
|
305
305
|
description: "Reason about the results",
|
|
306
306
|
createPrompt: ({ run, results, score }) => {
|
|
307
307
|
return createReasonPrompt({
|
|
308
|
-
input:
|
|
309
|
-
output:
|
|
308
|
+
input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
309
|
+
output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
310
310
|
score,
|
|
311
311
|
results: results.analyzeStepResult.results,
|
|
312
312
|
scale: options.scale
|
|
@@ -581,7 +581,7 @@ function createAnswerSimilarityScorer({
|
|
|
581
581
|
groundTruth: ""
|
|
582
582
|
});
|
|
583
583
|
}
|
|
584
|
-
const output =
|
|
584
|
+
const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
585
585
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
586
586
|
return createExtractPrompt2({
|
|
587
587
|
output,
|
|
@@ -639,14 +639,14 @@ function createAnswerSimilarityScorer({
|
|
|
639
639
|
);
|
|
640
640
|
score -= extraInfoPenalty;
|
|
641
641
|
score = Math.max(0, Math.min(1, score));
|
|
642
|
-
return
|
|
642
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
643
643
|
}).generateReason({
|
|
644
644
|
description: "Generate explanation of similarity score",
|
|
645
645
|
createPrompt: ({ run, results, score }) => {
|
|
646
646
|
if (!run.groundTruth) {
|
|
647
647
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
648
648
|
}
|
|
649
|
-
const output =
|
|
649
|
+
const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
650
650
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
651
651
|
return createReasonPrompt2({
|
|
652
652
|
output,
|
|
@@ -848,7 +848,7 @@ function createFaithfulnessScorer({
|
|
|
848
848
|
]
|
|
849
849
|
},
|
|
850
850
|
createPrompt: ({ run }) => {
|
|
851
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
851
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
852
852
|
return prompt;
|
|
853
853
|
}
|
|
854
854
|
}).analyze({
|
|
@@ -895,13 +895,13 @@ function createFaithfulnessScorer({
|
|
|
895
895
|
return 0;
|
|
896
896
|
}
|
|
897
897
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
898
|
-
return
|
|
898
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
|
|
899
899
|
}).generateReason({
|
|
900
900
|
description: "Reason about the results",
|
|
901
901
|
createPrompt: ({ run, results, score }) => {
|
|
902
902
|
const prompt = createFaithfulnessReasonPrompt({
|
|
903
|
-
input:
|
|
904
|
-
output:
|
|
903
|
+
input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
904
|
+
output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
905
905
|
context: options?.context ?? getToolInvocationContext(run.output),
|
|
906
906
|
score,
|
|
907
907
|
scale: options?.scale || 1,
|
|
@@ -1046,7 +1046,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1046
1046
|
"opinions"
|
|
1047
1047
|
]
|
|
1048
1048
|
},
|
|
1049
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
1049
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
1050
1050
|
}).analyze({
|
|
1051
1051
|
description: "Score the relevance of the statements to the input",
|
|
1052
1052
|
outputSchema: {
|
|
@@ -1078,7 +1078,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1078
1078
|
},
|
|
1079
1079
|
createPrompt: ({ run, results }) => {
|
|
1080
1080
|
const prompt = createBiasAnalyzePrompt({
|
|
1081
|
-
output:
|
|
1081
|
+
output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1082
1082
|
opinions: results.preprocessStepResult?.opinions || []
|
|
1083
1083
|
});
|
|
1084
1084
|
return prompt;
|
|
@@ -1089,7 +1089,7 @@ function createBiasScorer({ model, options }) {
|
|
|
1089
1089
|
}
|
|
1090
1090
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
1091
1091
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
1092
|
-
return
|
|
1092
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1093
1093
|
}).generateReason({
|
|
1094
1094
|
description: "Reason about the results",
|
|
1095
1095
|
createPrompt: ({ score, results }) => {
|
|
@@ -1320,7 +1320,7 @@ function createHallucinationScorer({
|
|
|
1320
1320
|
]
|
|
1321
1321
|
},
|
|
1322
1322
|
createPrompt: ({ run }) => {
|
|
1323
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1323
|
+
const prompt = createHallucinationExtractPrompt({ output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1324
1324
|
return prompt;
|
|
1325
1325
|
}
|
|
1326
1326
|
}).analyze({
|
|
@@ -1376,7 +1376,7 @@ function createHallucinationScorer({
|
|
|
1376
1376
|
return 0;
|
|
1377
1377
|
}
|
|
1378
1378
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1379
|
-
return
|
|
1379
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
|
|
1380
1380
|
}).generateReason({
|
|
1381
1381
|
description: "Reason about the results",
|
|
1382
1382
|
createPrompt: async ({ run, results, score }) => {
|
|
@@ -1387,8 +1387,8 @@ function createHallucinationScorer({
|
|
|
1387
1387
|
context = options?.context ?? [];
|
|
1388
1388
|
}
|
|
1389
1389
|
const prompt = createHallucinationReasonPrompt({
|
|
1390
|
-
input:
|
|
1391
|
-
output:
|
|
1390
|
+
input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1391
|
+
output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1392
1392
|
context,
|
|
1393
1393
|
score,
|
|
1394
1394
|
scale: options?.scale || 1,
|
|
@@ -1528,8 +1528,8 @@ function createToxicityScorer({
|
|
|
1528
1528
|
},
|
|
1529
1529
|
createPrompt: ({ run }) => {
|
|
1530
1530
|
const prompt = createToxicityAnalyzePrompt({
|
|
1531
|
-
input:
|
|
1532
|
-
output:
|
|
1531
|
+
input: chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1532
|
+
output: chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1533
1533
|
});
|
|
1534
1534
|
return prompt;
|
|
1535
1535
|
}
|
|
@@ -1545,7 +1545,7 @@ function createToxicityScorer({
|
|
|
1545
1545
|
}
|
|
1546
1546
|
}
|
|
1547
1547
|
const score = toxicityCount / numberOfVerdicts;
|
|
1548
|
-
return
|
|
1548
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1549
1549
|
}).generateReason({
|
|
1550
1550
|
description: "Reason about the results",
|
|
1551
1551
|
createPrompt: ({ results, score }) => {
|
|
@@ -1706,7 +1706,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1706
1706
|
if (isInputInvalid || isOutputInvalid) {
|
|
1707
1707
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1708
1708
|
}
|
|
1709
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1709
|
+
const { tools: actualTools, toolCallInfos } = chunkUNQXHPOD_cjs.extractToolCalls(run.output);
|
|
1710
1710
|
return {
|
|
1711
1711
|
actualTools,
|
|
1712
1712
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1716,8 +1716,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1716
1716
|
description: "Analyze the appropriateness of tool selections",
|
|
1717
1717
|
outputSchema: analyzeOutputSchema2,
|
|
1718
1718
|
createPrompt: ({ run, results }) => {
|
|
1719
|
-
const userInput =
|
|
1720
|
-
const agentResponse =
|
|
1719
|
+
const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1720
|
+
const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1721
1721
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1722
1722
|
return createAnalyzePrompt2({
|
|
1723
1723
|
userInput,
|
|
@@ -1734,11 +1734,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1734
1734
|
}
|
|
1735
1735
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1736
1736
|
const totalToolCalls = evaluations.length;
|
|
1737
|
-
return
|
|
1737
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1738
1738
|
}).generateReason({
|
|
1739
1739
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1740
1740
|
createPrompt: ({ run, results, score }) => {
|
|
1741
|
-
const userInput =
|
|
1741
|
+
const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1742
1742
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1743
1743
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1744
1744
|
return createReasonPrompt3({
|
|
@@ -1968,7 +1968,7 @@ var getContext = ({
|
|
|
1968
1968
|
output,
|
|
1969
1969
|
options
|
|
1970
1970
|
}) => {
|
|
1971
|
-
if (options.contextExtractor &&
|
|
1971
|
+
if (options.contextExtractor && chunkUNQXHPOD_cjs.isScorerRunInputForAgent(input) && chunkUNQXHPOD_cjs.isScorerRunOutputForAgent(output)) {
|
|
1972
1972
|
return options.contextExtractor(input, output);
|
|
1973
1973
|
}
|
|
1974
1974
|
return options.context ?? [];
|
|
@@ -1996,8 +1996,8 @@ function createContextRelevanceScorerLLM({
|
|
|
1996
1996
|
description: "Analyze the relevance and utility of provided context",
|
|
1997
1997
|
outputSchema: analyzeOutputSchema3,
|
|
1998
1998
|
createPrompt: ({ run }) => {
|
|
1999
|
-
const userQuery =
|
|
2000
|
-
const agentResponse =
|
|
1999
|
+
const userQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2000
|
+
const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2001
2001
|
const context = getContext({ input: run.input, output: run.output, options });
|
|
2002
2002
|
if (context.length === 0) {
|
|
2003
2003
|
return createAnalyzePrompt3({
|
|
@@ -2045,11 +2045,11 @@ function createContextRelevanceScorerLLM({
|
|
|
2045
2045
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
2046
2046
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
2047
2047
|
const scaledScore = finalScore * (options.scale || 1);
|
|
2048
|
-
return
|
|
2048
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(scaledScore);
|
|
2049
2049
|
}).generateReason({
|
|
2050
2050
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
2051
2051
|
createPrompt: ({ run, results, score }) => {
|
|
2052
|
-
const userQuery =
|
|
2052
|
+
const userQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2053
2053
|
const context = getContext({ input: run.input, output: run.output, options });
|
|
2054
2054
|
if (context.length === 0) {
|
|
2055
2055
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
@@ -2224,7 +2224,7 @@ var getContext2 = ({
|
|
|
2224
2224
|
output,
|
|
2225
2225
|
options
|
|
2226
2226
|
}) => {
|
|
2227
|
-
if (options.contextExtractor &&
|
|
2227
|
+
if (options.contextExtractor && chunkUNQXHPOD_cjs.isScorerRunInputForAgent(input) && chunkUNQXHPOD_cjs.isScorerRunOutputForAgent(output)) {
|
|
2228
2228
|
return options.contextExtractor(input, output);
|
|
2229
2229
|
}
|
|
2230
2230
|
return options.context ?? [];
|
|
@@ -2252,8 +2252,8 @@ function createContextPrecisionScorer({
|
|
|
2252
2252
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
2253
2253
|
outputSchema: contextRelevanceOutputSchema,
|
|
2254
2254
|
createPrompt: ({ run }) => {
|
|
2255
|
-
const input =
|
|
2256
|
-
const output =
|
|
2255
|
+
const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2256
|
+
const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2257
2257
|
const context = getContext2({ input: run.input, output: run.output, options });
|
|
2258
2258
|
if (context.length === 0) {
|
|
2259
2259
|
throw new Error("No context available for evaluation");
|
|
@@ -2286,12 +2286,12 @@ function createContextPrecisionScorer({
|
|
|
2286
2286
|
}
|
|
2287
2287
|
const map = sumPrecision / relevantCount;
|
|
2288
2288
|
const score = map * (options.scale || 1);
|
|
2289
|
-
return
|
|
2289
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(score);
|
|
2290
2290
|
}).generateReason({
|
|
2291
2291
|
description: "Reason about the context precision results",
|
|
2292
2292
|
createPrompt: ({ run, results, score }) => {
|
|
2293
|
-
const input =
|
|
2294
|
-
const output =
|
|
2293
|
+
const input = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2294
|
+
const output = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2295
2295
|
const context = getContext2({ input: run.input, output: run.output, options });
|
|
2296
2296
|
return createContextPrecisionReasonPrompt({
|
|
2297
2297
|
input,
|
|
@@ -2589,8 +2589,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2589
2589
|
description: "Analyze the impact of noise on agent response quality",
|
|
2590
2590
|
outputSchema: analyzeOutputSchema4,
|
|
2591
2591
|
createPrompt: ({ run }) => {
|
|
2592
|
-
const originalQuery =
|
|
2593
|
-
const noisyResponse =
|
|
2592
|
+
const originalQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2593
|
+
const noisyResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2594
2594
|
if (!originalQuery || !noisyResponse) {
|
|
2595
2595
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2596
2596
|
}
|
|
@@ -2633,11 +2633,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2633
2633
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2634
2634
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2635
2635
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2636
|
-
return
|
|
2636
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(finalScore);
|
|
2637
2637
|
}).generateReason({
|
|
2638
2638
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2639
2639
|
createPrompt: ({ run, results, score }) => {
|
|
2640
|
-
const originalQuery =
|
|
2640
|
+
const originalQuery = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2641
2641
|
const analysisResult = results.analyzeStepResult;
|
|
2642
2642
|
if (!analysisResult) {
|
|
2643
2643
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -3049,9 +3049,9 @@ function createPromptAlignmentScorerLLM({
|
|
|
3049
3049
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
3050
3050
|
outputSchema: analyzeOutputSchema5,
|
|
3051
3051
|
createPrompt: ({ run }) => {
|
|
3052
|
-
const userPrompt =
|
|
3053
|
-
const systemPrompt =
|
|
3054
|
-
const agentResponse =
|
|
3052
|
+
const userPrompt = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3053
|
+
const systemPrompt = chunkUNQXHPOD_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
3054
|
+
const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
3055
3055
|
if (evaluationMode === "user" && !userPrompt) {
|
|
3056
3056
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
3057
3057
|
}
|
|
@@ -3087,12 +3087,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
3087
3087
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
3088
3088
|
}
|
|
3089
3089
|
const finalScore = weightedScore * scale;
|
|
3090
|
-
return
|
|
3090
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(finalScore);
|
|
3091
3091
|
}).generateReason({
|
|
3092
3092
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
3093
3093
|
createPrompt: ({ run, results, score }) => {
|
|
3094
|
-
const userPrompt =
|
|
3095
|
-
const systemPrompt =
|
|
3094
|
+
const userPrompt = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3095
|
+
const systemPrompt = chunkUNQXHPOD_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
3096
3096
|
const analysis = results.analyzeStepResult;
|
|
3097
3097
|
if (!analysis) {
|
|
3098
3098
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -3109,6 +3109,203 @@ function createPromptAlignmentScorerLLM({
|
|
|
3109
3109
|
});
|
|
3110
3110
|
}
|
|
3111
3111
|
|
|
3112
|
+
// src/scorers/llm/rubric/prompts.ts
|
|
3113
|
+
/**
 * System instructions handed to the judge model of the rubric scorer.
 * One entry per output line; the entries are joined with newlines.
 */
var RUBRIC_INSTRUCTIONS = [
  "You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.",
  "",
  "A rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.",
  "",
  "Grading guidelines:",
  "- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.",
  '- A criterion is "satisfied" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.',
  "- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.",
  "- Be concise but specific in your reasoning: say what is present or missing.",
  "- Do not reward effort, intent, or partial progress. Only the actual output counts."
].join("\n");
|
|
3123
|
+
/**
 * Build the user prompt that asks the judge to grade an output against a rubric.
 *
 * @param {object} params
 * @param {string} params.originalTask - Task text; a placeholder is used when falsy.
 * @param {string} params.output - Agent output to grade; a placeholder is used when falsy.
 * @param {Array<{criterion: string, required: boolean}>} params.criteria - Rubric entries,
 *   rendered as a numbered list tagged "required" or "optional".
 * @returns {string} The complete grading prompt, ending with the expected JSON shape.
 */
function createAnalyzePrompt6({
  originalTask,
  output,
  criteria
}) {
  const renderCriterion = (entry, index) => {
    const tag = entry.required ? "required" : "optional";
    return `${index + 1}. [${tag}] ${entry.criterion}`;
  };
  const checklist = criteria.map(renderCriterion).join("\n");
  const taskText = originalTask || "(no task provided)";
  const outputText = output || "(empty output)";
  // NOTE(review): the JSON example lines carry no leading indentation here — confirm against the published bundle.
  const promptLines = [
    "Grade the agent's output against the rubric below.",
    "",
    "Original task:",
    `${taskText}`,
    "",
    "Rubric criteria:",
    checklist,
    "",
    "Agent output to grade:",
    `${outputText}`,
    "",
    "For every criterion, decide whether the output satisfies it. Preserve the exact criterion text and its required/optional designation in your answer.",
    "",
    "Return your judgement as JSON in this shape:",
    "{",
    '"criteria": [',
    "{",
    '"criterion": "exact criterion text",',
    '"satisfied": true,',
    '"required": true,',
    '"reasoning": "why it is or is not satisfied"',
    "}",
    "],",
    '"overallAssessment": "one or two sentence summary of what passed and what is missing"',
    "}"
  ];
  return promptLines.join("\n");
}
|
|
3155
|
+
/**
 * Render the human-readable explanation for a rubric grading result.
 *
 * @param {object} params
 * @param {number} params.score - Score produced for the run (a pass may be scaled, so it
 *   is not necessarily 1).
 * @param {object} params.analysis - Judge verdicts: `criteria` (each with `criterion`,
 *   `satisfied`, `required`, `reasoning`) and an optional `overallAssessment`.
 * @returns {string} A header line, one entry per criterion, then the overall assessment
 *   and, when the rubric is not satisfied, a hint naming how many criteria remain.
 */
function formatRubricReason({ score, analysis }) {
  const { criteria, overallAssessment } = analysis;
  const requiredCriteria = criteria.filter((c) => c.required);
  // Required criteria decide the verdict; when none are marked required, every criterion counts.
  const gating = requiredCriteria.length > 0 ? requiredCriteria : criteria;
  const unmet = gating.filter((c) => !c.satisfied);
  // A scaled pass can be below 1 (scale < 1), so a positive score with nothing unmet is also a pass.
  const complete = score >= 1 || (score > 0 && unmet.length === 0);
  const header = complete ? "\u2705 Rubric satisfied: every required criterion is met." : "\u274C Rubric not yet satisfied.";
  const lines = criteria.map((c) => {
    const mark = c.satisfied ? "\u2705" : "\u274C";
    const tag = c.required ? "required" : "optional";
    return `${mark} [${tag}] ${c.criterion}\n\u2192 ${c.reasoning}`;
  });
  let footer = "";
  if (!complete && unmet.length > 0) {
    // Only describe the outstanding criteria as "required" when some actually are.
    const label = requiredCriteria.length > 0 ? "unmet required" : "unmet";
    const noun = unmet.length === 1 ? "criterion" : "criteria";
    footer = `\n\nTo finish, address the ${unmet.length} ${label} ${noun} above.`;
  }
  const assessment = overallAssessment ? `\n\n${overallAssessment}` : "";
  return `${header}\n\n${lines.join("\n")}${assessment}${footer}`;
}
|
|
3175
|
+
|
|
3176
|
+
// src/scorers/llm/rubric/index.ts
|
|
3177
|
+
/**
 * JSON Schema (draft 2020-12) for the judge's analysis output: a list of
 * per-criterion verdicts plus an overall assessment, all fields mandatory.
 */
var analyzeOutputSchema6 = {
  $schema: "https://json-schema.org/draft/2020-12/schema",
  type: "object",
  properties: {
    criteria: {
      type: "array",
      items: {
        type: "object",
        properties: {
          criterion: { type: "string" },
          satisfied: { type: "boolean" },
          required: { type: "boolean" },
          reasoning: { type: "string" }
        },
        required: ["criterion", "satisfied", "required", "reasoning"]
      }
    },
    overallAssessment: { type: "string" }
  },
  required: ["criteria", "overallAssessment"]
};
|
|
3216
|
+
/**
 * Turn a multi-line rubric string into a list of required criteria.
 *
 * Each non-empty line becomes one criterion. A leading list marker ("-", "*",
 * "•", "1." or "1)") is removed, but only when it is followed by whitespace or
 * ends the line, so criteria that merely start with such characters
 * ("1.5 seconds max latency", "-1 is returned on error") keep their text.
 *
 * @param {string} rubric - Rubric text, one criterion per line.
 * @returns {Array<{description: string, required: boolean}>} Criteria in line
 *   order, each marked required.
 */
function parseRubricString(rubric) {
  const listMarker = /^\s*(?:[-*\u2022]|\d+[.)])(?:\s+|$)/;
  return rubric
    .split("\n")
    .map((line) => line.replace(listMarker, "").trim())
    .filter((line) => line.length > 0)
    .map((description) => ({ description, required: true }));
}
|
|
3219
|
+
/**
 * Coerce a rubric given in any supported form into a list of criterion objects.
 *
 * @param {string|Array|undefined|null} rubric - A newline-separated rubric
 *   string, or an array whose entries are criterion objects or plain strings.
 * @returns {Array} Criterion objects. Falsy or non-array input yields an empty
 *   list; string entries become required criteria; empty and nullish entries
 *   are dropped so later property reads on each entry cannot fail.
 */
function normalizeRubric(rubric) {
  if (!rubric) return [];
  if (typeof rubric === "string") return parseRubricString(rubric);
  if (!Array.isArray(rubric)) return [];
  const normalized = [];
  for (const entry of rubric) {
    if (typeof entry === "string") {
      const description = entry.trim();
      if (description.length > 0) normalized.push({ description, required: true });
    } else if (entry) {
      normalized.push(entry);
    }
  }
  return normalized;
}
|
|
3224
|
+
/**
 * Choose the rubric to grade a run against.
 *
 * A non-empty static rubric always wins. Otherwise the run is searched for a
 * dynamic rubric, in order: `requestContext`, `additionalContext`, `input`;
 * the first one found is normalized and returned.
 *
 * @param {object} params
 * @param {Array} params.staticRubric - Criteria fixed when the scorer was created.
 * @param {object} params.run - The scorer run being graded.
 * @returns {Array} The criteria to use (empty when none could be found).
 */
function resolveRubric({
  staticRubric,
  run
}) {
  if (staticRubric.length > 0) {
    return staticRubric;
  }
  const lookupOrder = ["requestContext", "additionalContext", "input"];
  let dynamic;
  for (const key of lookupOrder) {
    dynamic = pickRubric(run[key]);
    if (dynamic !== null && dynamic !== void 0) {
      break;
    }
  }
  return normalizeRubric(dynamic);
}
|
|
3232
|
+
/**
 * Read a `rubric` value out of a context-like object.
 *
 * Sources exposing a `get` function are queried with `get("rubric")`; any other
 * object is read through its `rubric` property.
 *
 * @param {*} source - Candidate container; anything that is not a non-null object is ignored.
 * @returns {string|Array|undefined} The rubric when it is a string or an array, otherwise undefined.
 */
function pickRubric(source) {
  const isContainer = Boolean(source) && typeof source === "object";
  if (!isContainer) {
    return void 0;
  }
  const lookup = source.get;
  const candidate = typeof lookup === "function" ? lookup.call(source, "rubric") : source.rubric;
  const isUsable = typeof candidate === "string" || Array.isArray(candidate);
  return isUsable ? candidate : void 0;
}
|
|
3245
|
+
/**
 * Convert rubric criteria into the shape consumed by the analyze prompt.
 *
 * @param {Array<{description: string, required?: boolean}>} criteria - Rubric entries.
 * @returns {Array<{criterion: string, required: boolean}>} One entry per criterion;
 *   `required` is false only when the source entry sets it to exactly `false`.
 */
function toCriterionInputs(criteria) {
  const toInput = ({ description, required }) => {
    const isRequired = required !== false;
    return { criterion: description, required: isRequired };
  };
  return criteria.map(toInput);
}
|
|
3248
|
+
/**
 * Find the text to grade for a run.
 *
 * Preference order: the assistant message extracted from `run.output`, then a
 * string `run.input.currentText`, then `run.output` itself when it is a string.
 *
 * @param {object} run - The scorer run.
 * @returns {string} The text to grade, or "" when none of the sources apply.
 */
function getOutputText(run) {
  const assistantText = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output);
  if (assistantText) {
    return assistantText;
  }
  const carriesCurrentText = Boolean(run.input) && typeof run.input === "object" && typeof run.input.currentText === "string";
  if (carriesCurrentText) {
    return run.input.currentText;
  }
  if (typeof run.output === "string") {
    return run.output;
  }
  return "";
}
|
|
3256
|
+
/**
 * Find the task description for a run.
 *
 * A string `run.input.originalTask` takes priority; otherwise the user message
 * extracted from `run.input` is used.
 *
 * @param {object} run - The scorer run.
 * @returns {string} The task text, or "" when none is available.
 */
function getTaskText(run) {
  const carriesOriginalTask = Boolean(run.input) && typeof run.input === "object" && typeof run.input.originalTask === "string";
  if (carriesOriginalTask) {
    return run.input.originalTask;
  }
  const userMessage = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input);
  return userMessage ?? "";
}
|
|
3262
|
+
/**
 * Create an LLM-judged scorer that grades an output against a rubric.
 *
 * The judge returns a verdict per criterion. The score is all-or-nothing:
 * `1 * scale` when every gating criterion is satisfied, otherwise 0. Gating
 * criteria are the required ones, or all criteria when none is marked required.
 * When there is nothing to grade (no analysis, or no criteria), the run passes
 * by default with the same `1 * scale` value as a graded pass.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, passed through to the scorer's `judge` config.
 * @param {string|Array} [params.criteria] - Static rubric; when empty, the rubric is
 *   resolved from each run instead.
 * @param {{scale?: number}} [params.options] - `scale` multiplies the score (default 1).
 * @returns {*} The scorer built by `evals.createScorer(...)` with analyze, score and
 *   reason steps attached.
 */
function createRubricScorer({
  model,
  criteria,
  options
}) {
  const scale = options?.scale ?? 1;
  const staticRubric = normalizeRubric(criteria);
  const hasNothingToGrade = (analysis) => !analysis || analysis.criteria.length === 0;
  return evals.createScorer({
    id: "rubric-scorer",
    name: "Rubric (LLM)",
    description: "Grades an agent output against a rubric of criteria, returning 1 only when every required criterion is satisfied",
    judge: {
      model,
      instructions: RUBRIC_INSTRUCTIONS
    }
  }).analyze({
    description: "Judge the output against each rubric criterion",
    outputSchema: analyzeOutputSchema6,
    createPrompt: ({ run }) => {
      const rubric = resolveRubric({ staticRubric, run });
      if (rubric.length === 0) {
        return `No rubric was provided. Return exactly: {"criteria": [], "overallAssessment": "No rubric provided; nothing to grade."}`;
      }
      return createAnalyzePrompt6({
        originalTask: getTaskText(run),
        output: getOutputText(run),
        criteria: toCriterionInputs(rubric)
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    if (hasNothingToGrade(analysis)) {
      // Pass by default, on the same scale as a graded pass.
      return 1 * scale;
    }
    const requiredCriteria = analysis.criteria.filter((c) => c.required);
    const gating = requiredCriteria.length > 0 ? requiredCriteria : analysis.criteria;
    const allSatisfied = gating.every((c) => c.satisfied);
    return (allSatisfied ? 1 : 0) * scale;
  }).generateReason(({ results, score }) => {
    const analysis = results.analyzeStepResult;
    if (hasNothingToGrade(analysis)) {
      return "No rubric was provided, so the rubric check passed by default.";
    }
    // The formatter compares the score against 1, so hand it the unscaled value.
    const unitScore = scale !== 0 ? score / scale : score;
    return formatRubricReason({ score: unitScore, analysis });
  });
}
|
|
3308
|
+
|
|
3112
3309
|
// src/scorers/llm/trajectory/prompts.ts
|
|
3113
3310
|
var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
|
|
3114
3311
|
You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
|
|
@@ -3131,7 +3328,7 @@ OUTPUT REQUIREMENTS:
|
|
|
3131
3328
|
- Use provided JSON schema exactly as specified
|
|
3132
3329
|
- Be consistent in your evaluation standards
|
|
3133
3330
|
`;
|
|
3134
|
-
var
|
|
3331
|
+
var createAnalyzePrompt7 = ({
|
|
3135
3332
|
userInput,
|
|
3136
3333
|
agentResponse,
|
|
3137
3334
|
actualTrajectory,
|
|
@@ -3198,7 +3395,7 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
3198
3395
|
};
|
|
3199
3396
|
|
|
3200
3397
|
// src/scorers/llm/trajectory/index.ts
|
|
3201
|
-
var
|
|
3398
|
+
var analyzeOutputSchema7 = {
|
|
3202
3399
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3203
3400
|
"type": "object",
|
|
3204
3401
|
"properties": {
|
|
@@ -3340,11 +3537,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
3340
3537
|
};
|
|
3341
3538
|
}).analyze({
|
|
3342
3539
|
description: "Analyze the quality and appropriateness of the agent trajectory",
|
|
3343
|
-
outputSchema:
|
|
3540
|
+
outputSchema: analyzeOutputSchema7,
|
|
3344
3541
|
createPrompt: ({ run, results }) => {
|
|
3345
|
-
const userInput =
|
|
3346
|
-
const agentResponse =
|
|
3347
|
-
return
|
|
3542
|
+
const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3543
|
+
const agentResponse = chunkUNQXHPOD_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
|
|
3544
|
+
return createAnalyzePrompt7({
|
|
3348
3545
|
userInput,
|
|
3349
3546
|
agentResponse,
|
|
3350
3547
|
actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
|
|
@@ -3368,11 +3565,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
3368
3565
|
const necessityScore = necessarySteps / totalSteps;
|
|
3369
3566
|
const orderScore = orderedSteps / totalSteps;
|
|
3370
3567
|
const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
|
|
3371
|
-
return
|
|
3568
|
+
return chunkUNQXHPOD_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
|
|
3372
3569
|
}).generateReason({
|
|
3373
3570
|
description: "Generate human-readable explanation of trajectory evaluation",
|
|
3374
3571
|
createPrompt: ({ run, results, score }) => {
|
|
3375
|
-
const userInput =
|
|
3572
|
+
const userInput = chunkUNQXHPOD_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
3376
3573
|
const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
|
|
3377
3574
|
const missingSteps = results.analyzeStepResult?.missingSteps || [];
|
|
3378
3575
|
const extraSteps = results.analyzeStepResult?.extraSteps || [];
|
|
@@ -3435,18 +3632,18 @@ function createCompletenessScorer() {
|
|
|
3435
3632
|
type: "agent"
|
|
3436
3633
|
}).preprocess(async ({ run }) => {
|
|
3437
3634
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
3438
|
-
const content =
|
|
3635
|
+
const content = chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i);
|
|
3439
3636
|
return content === null || content === void 0;
|
|
3440
3637
|
});
|
|
3441
3638
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
3442
|
-
const content =
|
|
3639
|
+
const content = chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i);
|
|
3443
3640
|
return content === null || content === void 0;
|
|
3444
3641
|
});
|
|
3445
3642
|
if (isInputInvalid || isOutputInvalid) {
|
|
3446
3643
|
throw new Error("Inputs cannot be null or undefined");
|
|
3447
3644
|
}
|
|
3448
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
3449
|
-
const output = run.output?.map((i) =>
|
|
3645
|
+
const input = run.input?.inputMessages.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3646
|
+
const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3450
3647
|
const inputToProcess = input;
|
|
3451
3648
|
const outputToProcess = output;
|
|
3452
3649
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -3551,8 +3748,8 @@ function createTextualDifferenceScorer() {
|
|
|
3551
3748
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
3552
3749
|
type: "agent"
|
|
3553
3750
|
}).preprocess(async ({ run }) => {
|
|
3554
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3555
|
-
const output = run.output?.map((i) =>
|
|
3751
|
+
const input = run.input?.inputMessages?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3752
|
+
const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3556
3753
|
const ratio = calculateRatio(input, output);
|
|
3557
3754
|
const changes = countChanges(input, output);
|
|
3558
3755
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -3575,8 +3772,8 @@ function createKeywordCoverageScorer() {
|
|
|
3575
3772
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
3576
3773
|
type: "agent"
|
|
3577
3774
|
}).preprocess(async ({ run }) => {
|
|
3578
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3579
|
-
const output = run.output?.map((i) =>
|
|
3775
|
+
const input = run.input?.inputMessages?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3776
|
+
const output = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3580
3777
|
if (!input && !output) {
|
|
3581
3778
|
return {
|
|
3582
3779
|
result: {
|
|
@@ -3629,8 +3826,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
3629
3826
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
3630
3827
|
type: "agent"
|
|
3631
3828
|
}).preprocess(async ({ run }) => {
|
|
3632
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
3633
|
-
let processedOutput = run.output.map((i) =>
|
|
3829
|
+
let processedInput = run.input?.inputMessages.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3830
|
+
let processedOutput = run.output.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3634
3831
|
if (ignoreCase) {
|
|
3635
3832
|
processedInput = processedInput.toLowerCase();
|
|
3636
3833
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -3660,7 +3857,7 @@ function createToneScorer(config = {}) {
|
|
|
3660
3857
|
type: "agent"
|
|
3661
3858
|
}).preprocess(async ({ run }) => {
|
|
3662
3859
|
const sentiment = new Sentiment__default.default();
|
|
3663
|
-
const agentMessage = run.output?.map((i) =>
|
|
3860
|
+
const agentMessage = run.output?.map((i) => chunkUNQXHPOD_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3664
3861
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
3665
3862
|
if (referenceTone) {
|
|
3666
3863
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -3747,7 +3944,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3747
3944
|
if (isInputInvalid || isOutputInvalid) {
|
|
3748
3945
|
throw new Error("Input and output messages cannot be null or empty");
|
|
3749
3946
|
}
|
|
3750
|
-
const { tools: actualTools, toolCallInfos } =
|
|
3947
|
+
const { tools: actualTools, toolCallInfos } = chunkUNQXHPOD_cjs.extractToolCalls(run.output);
|
|
3751
3948
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
3752
3949
|
return {
|
|
3753
3950
|
expectedTool,
|
|
@@ -3822,7 +4019,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3822
4019
|
const itemExpectation = run.expectedTrajectory;
|
|
3823
4020
|
const effectiveOrdering = itemExpectation?.ordering ?? ordering;
|
|
3824
4021
|
const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
|
|
3825
|
-
const comparison =
|
|
4022
|
+
const comparison = chunkUNQXHPOD_cjs.compareTrajectories(
|
|
3826
4023
|
actualTrajectory,
|
|
3827
4024
|
{ steps: resolvedExpectedSteps },
|
|
3828
4025
|
{
|
|
@@ -3880,7 +4077,7 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3880
4077
|
const childConfig = expectedStep.children;
|
|
3881
4078
|
let accuracy;
|
|
3882
4079
|
if (childConfig.steps && childConfig.steps.length > 0) {
|
|
3883
|
-
accuracy =
|
|
4080
|
+
accuracy = chunkUNQXHPOD_cjs.compareTrajectories(
|
|
3884
4081
|
childTrajectory,
|
|
3885
4082
|
{ steps: childConfig.steps },
|
|
3886
4083
|
{
|
|
@@ -3890,18 +4087,18 @@ function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accu
|
|
|
3890
4087
|
);
|
|
3891
4088
|
}
|
|
3892
4089
|
const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
|
|
3893
|
-
const efficiency = hasEfficiencyConfig ?
|
|
4090
|
+
const efficiency = hasEfficiencyConfig ? chunkUNQXHPOD_cjs.checkTrajectoryEfficiency(childTrajectory, {
|
|
3894
4091
|
maxSteps: childConfig.maxSteps,
|
|
3895
4092
|
maxTotalTokens: childConfig.maxTotalTokens,
|
|
3896
4093
|
maxTotalDurationMs: childConfig.maxTotalDurationMs,
|
|
3897
4094
|
noRedundantCalls: childConfig.noRedundantCalls ?? true
|
|
3898
4095
|
}) : void 0;
|
|
3899
4096
|
const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
|
|
3900
|
-
const blacklist = hasBlacklistConfig ?
|
|
4097
|
+
const blacklist = hasBlacklistConfig ? chunkUNQXHPOD_cjs.checkTrajectoryBlacklist(childTrajectory, {
|
|
3901
4098
|
blacklistedTools: childConfig.blacklistedTools,
|
|
3902
4099
|
blacklistedSequences: childConfig.blacklistedSequences
|
|
3903
4100
|
}) : void 0;
|
|
3904
|
-
const toolFailures =
|
|
4101
|
+
const toolFailures = chunkUNQXHPOD_cjs.analyzeToolFailures(childTrajectory, {
|
|
3905
4102
|
maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
|
|
3906
4103
|
});
|
|
3907
4104
|
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
|
|
@@ -3966,7 +4163,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3966
4163
|
}
|
|
3967
4164
|
let accuracy;
|
|
3968
4165
|
if (config.steps && config.steps.length > 0) {
|
|
3969
|
-
accuracy =
|
|
4166
|
+
accuracy = chunkUNQXHPOD_cjs.compareTrajectories(
|
|
3970
4167
|
actualTrajectory,
|
|
3971
4168
|
{ steps: config.steps },
|
|
3972
4169
|
{
|
|
@@ -3976,18 +4173,18 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3976
4173
|
);
|
|
3977
4174
|
}
|
|
3978
4175
|
const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
|
|
3979
|
-
const efficiency = hasEfficiencyConfig ?
|
|
4176
|
+
const efficiency = hasEfficiencyConfig ? chunkUNQXHPOD_cjs.checkTrajectoryEfficiency(actualTrajectory, {
|
|
3980
4177
|
maxSteps: config.maxSteps,
|
|
3981
4178
|
maxTotalTokens: config.maxTotalTokens,
|
|
3982
4179
|
maxTotalDurationMs: config.maxTotalDurationMs,
|
|
3983
4180
|
noRedundantCalls: config.noRedundantCalls ?? true
|
|
3984
4181
|
}) : void 0;
|
|
3985
4182
|
const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
|
|
3986
|
-
const blacklist = hasBlacklistConfig ?
|
|
4183
|
+
const blacklist = hasBlacklistConfig ? chunkUNQXHPOD_cjs.checkTrajectoryBlacklist(actualTrajectory, {
|
|
3987
4184
|
blacklistedTools: config.blacklistedTools,
|
|
3988
4185
|
blacklistedSequences: config.blacklistedSequences
|
|
3989
4186
|
}) : void 0;
|
|
3990
|
-
const toolFailures =
|
|
4187
|
+
const toolFailures = chunkUNQXHPOD_cjs.analyzeToolFailures(actualTrajectory, {
|
|
3991
4188
|
maxRetriesPerTool: config.maxRetriesPerTool ?? 2
|
|
3992
4189
|
});
|
|
3993
4190
|
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
|
|
@@ -4121,6 +4318,7 @@ exports.createHallucinationScorer = createHallucinationScorer;
|
|
|
4121
4318
|
exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
|
|
4122
4319
|
exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
|
|
4123
4320
|
exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
|
|
4321
|
+
exports.createRubricScorer = createRubricScorer;
|
|
4124
4322
|
exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
|
|
4125
4323
|
exports.createToneScorer = createToneScorer;
|
|
4126
4324
|
exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
|