@mastra/evals 1.2.0-alpha.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +97 -0
- package/dist/{chunk-XRUR5PBK.cjs → chunk-AY4K3J4R.cjs} +44 -95
- package/dist/chunk-AY4K3J4R.cjs.map +1 -0
- package/dist/{chunk-EVBNIL5M.js → chunk-X4MKZ735.js} +44 -95
- package/dist/chunk-X4MKZ735.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/reference-evals-scorer-utils.md +9 -5
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +29 -15
- package/dist/scorers/code/trajectory/index.d.ts +18 -1
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +110 -121
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +39 -50
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +23 -23
- package/dist/scorers/utils.d.ts +1 -4
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +7 -7
- package/dist/chunk-EVBNIL5M.js.map +0 -1
- package/dist/chunk-XRUR5PBK.cjs.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var chunkXRUR5PBK_cjs = require('../../chunk-XRUR5PBK.cjs');
|
|
3
|
+
var chunkAY4K3J4R_cjs = require('../../chunk-AY4K3J4R.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var zod = require('zod');
|
|
6
6
|
var nlp = require('compromise');
|
|
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
|
|
|
239
239
|
description: "Extract relevant statements from the LLM output",
|
|
240
240
|
outputSchema: extractOutputSchema,
|
|
241
241
|
createPrompt: ({ run }) => {
|
|
242
|
-
const assistantMessage = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
242
|
+
const assistantMessage = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
243
243
|
return createExtractPrompt(assistantMessage);
|
|
244
244
|
}
|
|
245
245
|
}).analyze({
|
|
246
246
|
description: "Score the relevance of the statements to the input",
|
|
247
247
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
248
248
|
createPrompt: ({ run, results }) => {
|
|
249
|
-
const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
249
|
+
const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
250
250
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
251
251
|
}
|
|
252
252
|
}).generateScore(({ results }) => {
|
|
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
|
|
|
263
263
|
}
|
|
264
264
|
}
|
|
265
265
|
const score = relevancyCount / numberOfResults;
|
|
266
|
-
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * options.scale);
|
|
266
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * options.scale);
|
|
267
267
|
}).generateReason({
|
|
268
268
|
description: "Reason about the results",
|
|
269
269
|
createPrompt: ({ run, results, score }) => {
|
|
270
270
|
return createReasonPrompt({
|
|
271
|
-
input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
272
|
-
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
271
|
+
input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
272
|
+
output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
273
273
|
score,
|
|
274
274
|
results: results.analyzeStepResult.results,
|
|
275
275
|
scale: options.scale
|
|
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
|
|
|
466
466
|
groundTruth: ""
|
|
467
467
|
});
|
|
468
468
|
}
|
|
469
|
-
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
469
|
+
const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
470
470
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
471
471
|
return createExtractPrompt2({
|
|
472
472
|
output,
|
|
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
|
|
|
524
524
|
);
|
|
525
525
|
score -= extraInfoPenalty;
|
|
526
526
|
score = Math.max(0, Math.min(1, score));
|
|
527
|
-
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
527
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
528
528
|
}).generateReason({
|
|
529
529
|
description: "Generate explanation of similarity score",
|
|
530
530
|
createPrompt: ({ run, results, score }) => {
|
|
531
531
|
if (!run.groundTruth) {
|
|
532
532
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
533
533
|
}
|
|
534
|
-
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
534
|
+
const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
535
535
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
536
536
|
return createReasonPrompt2({
|
|
537
537
|
output,
|
|
@@ -717,7 +717,7 @@ function createFaithfulnessScorer({
|
|
|
717
717
|
claims: zod.z.array(zod.z.string())
|
|
718
718
|
}),
|
|
719
719
|
createPrompt: ({ run }) => {
|
|
720
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
720
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
721
721
|
return prompt;
|
|
722
722
|
}
|
|
723
723
|
}).analyze({
|
|
@@ -741,14 +741,14 @@ function createFaithfulnessScorer({
|
|
|
741
741
|
return 0;
|
|
742
742
|
}
|
|
743
743
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
744
|
-
return
|
|
744
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
|
|
745
745
|
}).generateReason({
|
|
746
746
|
description: "Reason about the results",
|
|
747
747
|
createPrompt: ({ run, results, score }) => {
|
|
748
748
|
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
749
749
|
const prompt = createFaithfulnessReasonPrompt({
|
|
750
|
-
input:
|
|
751
|
-
output:
|
|
750
|
+
input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
751
|
+
output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
752
752
|
context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
|
|
753
753
|
score,
|
|
754
754
|
scale: options?.scale || 1,
|
|
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
|
|
|
881
881
|
outputSchema: zod.z.object({
|
|
882
882
|
opinions: zod.z.array(zod.z.string())
|
|
883
883
|
}),
|
|
884
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
884
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
885
885
|
}).analyze({
|
|
886
886
|
description: "Score the relevance of the statements to the input",
|
|
887
887
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
888
888
|
createPrompt: ({ run, results }) => {
|
|
889
889
|
const prompt = createBiasAnalyzePrompt({
|
|
890
|
-
output:
|
|
890
|
+
output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
891
891
|
opinions: results.preprocessStepResult?.opinions || []
|
|
892
892
|
});
|
|
893
893
|
return prompt;
|
|
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
|
|
|
898
898
|
}
|
|
899
899
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
900
900
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
901
|
-
return
|
|
901
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
902
902
|
}).generateReason({
|
|
903
903
|
description: "Reason about the results",
|
|
904
904
|
createPrompt: ({ score, results }) => {
|
|
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
|
|
|
1117
1117
|
claims: zod.z.array(zod.z.string())
|
|
1118
1118
|
}),
|
|
1119
1119
|
createPrompt: ({ run }) => {
|
|
1120
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1120
|
+
const prompt = createHallucinationExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1121
1121
|
return prompt;
|
|
1122
1122
|
}
|
|
1123
1123
|
}).analyze({
|
|
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
|
|
|
1145
1145
|
return 0;
|
|
1146
1146
|
}
|
|
1147
1147
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1148
|
-
return
|
|
1148
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
|
|
1149
1149
|
}).generateReason({
|
|
1150
1150
|
description: "Reason about the results",
|
|
1151
1151
|
createPrompt: async ({ run, results, score }) => {
|
|
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
|
|
|
1156
1156
|
context = options?.context ?? [];
|
|
1157
1157
|
}
|
|
1158
1158
|
const prompt = createHallucinationReasonPrompt({
|
|
1159
|
-
input:
|
|
1160
|
-
output:
|
|
1159
|
+
input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1160
|
+
output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1161
1161
|
context,
|
|
1162
1162
|
score,
|
|
1163
1163
|
scale: options?.scale || 1,
|
|
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
|
|
|
1271
1271
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
1272
1272
|
createPrompt: ({ run }) => {
|
|
1273
1273
|
const prompt = createToxicityAnalyzePrompt({
|
|
1274
|
-
input:
|
|
1275
|
-
output:
|
|
1274
|
+
input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1275
|
+
output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1276
1276
|
});
|
|
1277
1277
|
return prompt;
|
|
1278
1278
|
}
|
|
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
}
|
|
1290
1290
|
const score = toxicityCount / numberOfVerdicts;
|
|
1291
|
-
return
|
|
1291
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1292
1292
|
}).generateReason({
|
|
1293
1293
|
description: "Reason about the results",
|
|
1294
1294
|
createPrompt: ({ results, score }) => {
|
|
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1422
1422
|
if (isInputInvalid || isOutputInvalid) {
|
|
1423
1423
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1424
1424
|
}
|
|
1425
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1425
|
+
const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
|
|
1426
1426
|
return {
|
|
1427
1427
|
actualTools,
|
|
1428
1428
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1432
1432
|
description: "Analyze the appropriateness of tool selections",
|
|
1433
1433
|
outputSchema: analyzeOutputSchema2,
|
|
1434
1434
|
createPrompt: ({ run, results }) => {
|
|
1435
|
-
const userInput =
|
|
1436
|
-
const agentResponse =
|
|
1435
|
+
const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1436
|
+
const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1437
1437
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1438
1438
|
return createAnalyzePrompt2({
|
|
1439
1439
|
userInput,
|
|
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1450
1450
|
}
|
|
1451
1451
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1452
1452
|
const totalToolCalls = evaluations.length;
|
|
1453
|
-
return
|
|
1453
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1454
1454
|
}).generateReason({
|
|
1455
1455
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1456
1456
|
createPrompt: ({ run, results, score }) => {
|
|
1457
|
-
const userInput =
|
|
1457
|
+
const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1458
1458
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1459
1459
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1460
1460
|
return createReasonPrompt3({
|
|
@@ -1659,8 +1659,8 @@ function createContextRelevanceScorerLLM({
|
|
|
1659
1659
|
description: "Analyze the relevance and utility of provided context",
|
|
1660
1660
|
outputSchema: analyzeOutputSchema3,
|
|
1661
1661
|
createPrompt: ({ run }) => {
|
|
1662
|
-
const userQuery =
|
|
1663
|
-
const agentResponse =
|
|
1662
|
+
const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1663
|
+
const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1664
1664
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1665
1665
|
if (context.length === 0) {
|
|
1666
1666
|
return createAnalyzePrompt3({
|
|
@@ -1708,11 +1708,11 @@ function createContextRelevanceScorerLLM({
|
|
|
1708
1708
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1709
1709
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1710
1710
|
const scaledScore = finalScore * (options.scale || 1);
|
|
1711
|
-
return
|
|
1711
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(scaledScore);
|
|
1712
1712
|
}).generateReason({
|
|
1713
1713
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1714
1714
|
createPrompt: ({ run, results, score }) => {
|
|
1715
|
-
const userQuery =
|
|
1715
|
+
const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1716
1716
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1717
1717
|
if (context.length === 0) {
|
|
1718
1718
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
@@ -1883,8 +1883,8 @@ function createContextPrecisionScorer({
|
|
|
1883
1883
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1884
1884
|
outputSchema: contextRelevanceOutputSchema,
|
|
1885
1885
|
createPrompt: ({ run }) => {
|
|
1886
|
-
const input =
|
|
1887
|
-
const output =
|
|
1886
|
+
const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1887
|
+
const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1888
1888
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1889
1889
|
if (context.length === 0) {
|
|
1890
1890
|
throw new Error("No context available for evaluation");
|
|
@@ -1917,12 +1917,12 @@ function createContextPrecisionScorer({
|
|
|
1917
1917
|
}
|
|
1918
1918
|
const map = sumPrecision / relevantCount;
|
|
1919
1919
|
const score = map * (options.scale || 1);
|
|
1920
|
-
return
|
|
1920
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
|
|
1921
1921
|
}).generateReason({
|
|
1922
1922
|
description: "Reason about the context precision results",
|
|
1923
1923
|
createPrompt: ({ run, results, score }) => {
|
|
1924
|
-
const input =
|
|
1925
|
-
const output =
|
|
1924
|
+
const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1925
|
+
const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1926
1926
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1927
1927
|
return createContextPrecisionReasonPrompt({
|
|
1928
1928
|
input,
|
|
@@ -2177,8 +2177,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2177
2177
|
description: "Analyze the impact of noise on agent response quality",
|
|
2178
2178
|
outputSchema: analyzeOutputSchema4,
|
|
2179
2179
|
createPrompt: ({ run }) => {
|
|
2180
|
-
const originalQuery =
|
|
2181
|
-
const noisyResponse =
|
|
2180
|
+
const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2181
|
+
const noisyResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2182
2182
|
if (!originalQuery || !noisyResponse) {
|
|
2183
2183
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2184
2184
|
}
|
|
@@ -2221,11 +2221,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2221
2221
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2222
2222
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2223
2223
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2224
|
-
return
|
|
2224
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
|
|
2225
2225
|
}).generateReason({
|
|
2226
2226
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2227
2227
|
createPrompt: ({ run, results, score }) => {
|
|
2228
|
-
const originalQuery =
|
|
2228
|
+
const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2229
2229
|
const analysisResult = results.analyzeStepResult;
|
|
2230
2230
|
if (!analysisResult) {
|
|
2231
2231
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -2550,9 +2550,9 @@ function createPromptAlignmentScorerLLM({
|
|
|
2550
2550
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
2551
2551
|
outputSchema: analyzeOutputSchema5,
|
|
2552
2552
|
createPrompt: ({ run }) => {
|
|
2553
|
-
const userPrompt =
|
|
2554
|
-
const systemPrompt =
|
|
2555
|
-
const agentResponse =
|
|
2553
|
+
const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2554
|
+
const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2555
|
+
const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2556
2556
|
if (evaluationMode === "user" && !userPrompt) {
|
|
2557
2557
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
2558
2558
|
}
|
|
@@ -2588,12 +2588,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
2588
2588
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
2589
2589
|
}
|
|
2590
2590
|
const finalScore = weightedScore * scale;
|
|
2591
|
-
return
|
|
2591
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
|
|
2592
2592
|
}).generateReason({
|
|
2593
2593
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
2594
2594
|
createPrompt: ({ run, results, score }) => {
|
|
2595
|
-
const userPrompt =
|
|
2596
|
-
const systemPrompt =
|
|
2595
|
+
const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2596
|
+
const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2597
2597
|
const analysis = results.analyzeStepResult;
|
|
2598
2598
|
if (!analysis) {
|
|
2599
2599
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -2744,7 +2744,8 @@ function formatExpectedSteps(steps, indent = 0) {
|
|
|
2744
2744
|
const prefix = " ".repeat(indent);
|
|
2745
2745
|
return steps.map((step, i) => {
|
|
2746
2746
|
const typeStr = step.stepType ? `[${step.stepType}] ` : "";
|
|
2747
|
-
const
|
|
2747
|
+
const { name: _, stepType: _t, children: _c, ...fields } = step;
|
|
2748
|
+
const dataStr = Object.keys(fields).length > 0 ? ` (${JSON.stringify(fields)})` : "";
|
|
2748
2749
|
let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
|
|
2749
2750
|
if (step.children?.steps && step.children.steps.length > 0) {
|
|
2750
2751
|
line += `
|
|
@@ -2773,22 +2774,15 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2773
2774
|
if (Array.isArray(staticExpectedTrajectory)) {
|
|
2774
2775
|
expectedSteps = staticExpectedTrajectory;
|
|
2775
2776
|
} else {
|
|
2776
|
-
|
|
2777
|
-
const
|
|
2778
|
-
const
|
|
2779
|
-
if (
|
|
2780
|
-
|
|
2781
|
-
if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
|
|
2782
|
-
data.output = s.toolResult;
|
|
2783
|
-
if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
|
|
2784
|
-
if (Object.keys(data).length > 0) result.data = data;
|
|
2785
|
-
if (s.children && s.children.length > 0) {
|
|
2786
|
-
result.children = {
|
|
2787
|
-
steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
|
|
2788
|
-
};
|
|
2777
|
+
const toExpectedStep = (s) => {
|
|
2778
|
+
const { durationMs: _, metadata: _m, children, ...rest } = s;
|
|
2779
|
+
const result = rest;
|
|
2780
|
+
if (children && children.length > 0) {
|
|
2781
|
+
result.children = { steps: children.map(toExpectedStep) };
|
|
2789
2782
|
}
|
|
2790
2783
|
return result;
|
|
2791
|
-
}
|
|
2784
|
+
};
|
|
2785
|
+
expectedSteps = staticExpectedTrajectory.steps.map(toExpectedStep);
|
|
2792
2786
|
}
|
|
2793
2787
|
} else if (run.expectedTrajectory) {
|
|
2794
2788
|
const expectation = run.expectedTrajectory;
|
|
@@ -2804,8 +2798,8 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2804
2798
|
description: "Analyze the quality and appropriateness of the agent trajectory",
|
|
2805
2799
|
outputSchema: analyzeOutputSchema6,
|
|
2806
2800
|
createPrompt: ({ run, results }) => {
|
|
2807
|
-
const userInput =
|
|
2808
|
-
const agentResponse =
|
|
2801
|
+
const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2802
|
+
const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
|
|
2809
2803
|
return createAnalyzePrompt6({
|
|
2810
2804
|
userInput,
|
|
2811
2805
|
agentResponse,
|
|
@@ -2830,11 +2824,11 @@ function createTrajectoryAccuracyScorerLLM({
|
|
|
2830
2824
|
const necessityScore = necessarySteps / totalSteps;
|
|
2831
2825
|
const orderScore = orderedSteps / totalSteps;
|
|
2832
2826
|
const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
|
|
2833
|
-
return
|
|
2827
|
+
return chunkAY4K3J4R_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
|
|
2834
2828
|
}).generateReason({
|
|
2835
2829
|
description: "Generate human-readable explanation of trajectory evaluation",
|
|
2836
2830
|
createPrompt: ({ run, results, score }) => {
|
|
2837
|
-
const userInput =
|
|
2831
|
+
const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2838
2832
|
const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
|
|
2839
2833
|
const missingSteps = results.analyzeStepResult?.missingSteps || [];
|
|
2840
2834
|
const extraSteps = results.analyzeStepResult?.extraSteps || [];
|
|
@@ -2897,18 +2891,18 @@ function createCompletenessScorer() {
|
|
|
2897
2891
|
type: "agent"
|
|
2898
2892
|
}).preprocess(async ({ run }) => {
|
|
2899
2893
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
2900
|
-
const content =
|
|
2894
|
+
const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
|
|
2901
2895
|
return content === null || content === void 0;
|
|
2902
2896
|
});
|
|
2903
2897
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
2904
|
-
const content =
|
|
2898
|
+
const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
|
|
2905
2899
|
return content === null || content === void 0;
|
|
2906
2900
|
});
|
|
2907
2901
|
if (isInputInvalid || isOutputInvalid) {
|
|
2908
2902
|
throw new Error("Inputs cannot be null or undefined");
|
|
2909
2903
|
}
|
|
2910
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
2911
|
-
const output = run.output?.map((i) =>
|
|
2904
|
+
const input = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2905
|
+
const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2912
2906
|
const inputToProcess = input;
|
|
2913
2907
|
const outputToProcess = output;
|
|
2914
2908
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -3013,8 +3007,8 @@ function createTextualDifferenceScorer() {
|
|
|
3013
3007
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
3014
3008
|
type: "agent"
|
|
3015
3009
|
}).preprocess(async ({ run }) => {
|
|
3016
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3017
|
-
const output = run.output?.map((i) =>
|
|
3010
|
+
const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3011
|
+
const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3018
3012
|
const ratio = calculateRatio(input, output);
|
|
3019
3013
|
const changes = countChanges(input, output);
|
|
3020
3014
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -3037,8 +3031,8 @@ function createKeywordCoverageScorer() {
|
|
|
3037
3031
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
3038
3032
|
type: "agent"
|
|
3039
3033
|
}).preprocess(async ({ run }) => {
|
|
3040
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
3041
|
-
const output = run.output?.map((i) =>
|
|
3034
|
+
const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3035
|
+
const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3042
3036
|
if (!input && !output) {
|
|
3043
3037
|
return {
|
|
3044
3038
|
result: {
|
|
@@ -3091,8 +3085,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
3091
3085
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
3092
3086
|
type: "agent"
|
|
3093
3087
|
}).preprocess(async ({ run }) => {
|
|
3094
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
3095
|
-
let processedOutput = run.output.map((i) =>
|
|
3088
|
+
let processedInput = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3089
|
+
let processedOutput = run.output.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3096
3090
|
if (ignoreCase) {
|
|
3097
3091
|
processedInput = processedInput.toLowerCase();
|
|
3098
3092
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -3122,7 +3116,7 @@ function createToneScorer(config = {}) {
|
|
|
3122
3116
|
type: "agent"
|
|
3123
3117
|
}).preprocess(async ({ run }) => {
|
|
3124
3118
|
const sentiment = new Sentiment__default.default();
|
|
3125
|
-
const agentMessage = run.output?.map((i) =>
|
|
3119
|
+
const agentMessage = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3126
3120
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
3127
3121
|
if (referenceTone) {
|
|
3128
3122
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -3209,7 +3203,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3209
3203
|
if (isInputInvalid || isOutputInvalid) {
|
|
3210
3204
|
throw new Error("Input and output messages cannot be null or empty");
|
|
3211
3205
|
}
|
|
3212
|
-
const { tools: actualTools, toolCallInfos } =
|
|
3206
|
+
const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
|
|
3213
3207
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
3214
3208
|
return {
|
|
3215
3209
|
expectedTool,
|
|
@@ -3235,18 +3229,11 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
3235
3229
|
});
|
|
3236
3230
|
}
|
|
3237
3231
|
function trajectoryStepToExpectedStep(step) {
|
|
3238
|
-
const
|
|
3239
|
-
const
|
|
3240
|
-
if (
|
|
3241
|
-
if (step.toolArgs !== void 0) data.input = step.toolArgs;
|
|
3242
|
-
if (step.toolResult !== void 0) data.output = step.toolResult;
|
|
3243
|
-
} else if (step.stepType === "workflow_step") {
|
|
3244
|
-
if (step.output !== void 0) data.output = step.output;
|
|
3245
|
-
}
|
|
3246
|
-
if (Object.keys(data).length > 0) result.data = data;
|
|
3247
|
-
if (step.children && step.children.length > 0) {
|
|
3232
|
+
const { durationMs: _, metadata: _m, children, ...rest } = step;
|
|
3233
|
+
const result = rest;
|
|
3234
|
+
if (children && children.length > 0) {
|
|
3248
3235
|
result.children = {
|
|
3249
|
-
steps:
|
|
3236
|
+
steps: children.map(trajectoryStepToExpectedStep)
|
|
3250
3237
|
};
|
|
3251
3238
|
}
|
|
3252
3239
|
return result;
|
|
@@ -3257,15 +3244,14 @@ function expectationToExpectedSteps(expectation) {
|
|
|
3257
3244
|
}
|
|
3258
3245
|
function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
3259
3246
|
const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
|
|
3260
|
-
const { ordering
|
|
3261
|
-
const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
|
|
3247
|
+
const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;
|
|
3262
3248
|
const staticExpectedSteps = staticExpectedTrajectory ? Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0] || false) ? staticExpectedTrajectory : "steps" in staticExpectedTrajectory ? staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep) : void 0 : void 0;
|
|
3263
3249
|
const getDescription = () => {
|
|
3264
3250
|
if (staticExpectedSteps) {
|
|
3265
3251
|
const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
|
|
3266
|
-
return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${
|
|
3252
|
+
return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${ordering} ordering)`;
|
|
3267
3253
|
}
|
|
3268
|
-
return `Evaluates trajectory accuracy against expected trajectory from dataset items (${
|
|
3254
|
+
return `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;
|
|
3269
3255
|
};
|
|
3270
3256
|
return evals.createScorer({
|
|
3271
3257
|
id: "code-trajectory-accuracy-scorer",
|
|
@@ -3290,15 +3276,13 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3290
3276
|
};
|
|
3291
3277
|
}
|
|
3292
3278
|
const itemExpectation = run.expectedTrajectory;
|
|
3293
|
-
const effectiveOrdering = itemExpectation?.ordering ??
|
|
3294
|
-
const effectiveCompareData = itemExpectation?.compareStepData ?? compareStepData;
|
|
3279
|
+
const effectiveOrdering = itemExpectation?.ordering ?? ordering;
|
|
3295
3280
|
const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
|
|
3296
|
-
const comparison =
|
|
3281
|
+
const comparison = chunkAY4K3J4R_cjs.compareTrajectories(
|
|
3297
3282
|
actualTrajectory,
|
|
3298
3283
|
{ steps: resolvedExpectedSteps },
|
|
3299
3284
|
{
|
|
3300
3285
|
ordering: effectiveOrdering,
|
|
3301
|
-
compareStepData: effectiveCompareData,
|
|
3302
3286
|
allowRepeatedSteps: effectiveAllowRepeated
|
|
3303
3287
|
}
|
|
3304
3288
|
);
|
|
@@ -3317,7 +3301,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
|
|
|
3317
3301
|
return preprocessResult.comparison.score;
|
|
3318
3302
|
});
|
|
3319
3303
|
}
|
|
3320
|
-
function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
3304
|
+
function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
|
|
3321
3305
|
const results = [];
|
|
3322
3306
|
const matchedIndices = /* @__PURE__ */ new Set();
|
|
3323
3307
|
for (const expectedStep of expectedSteps) {
|
|
@@ -3352,47 +3336,47 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
|
3352
3336
|
const childConfig = expectedStep.children;
|
|
3353
3337
|
let accuracy;
|
|
3354
3338
|
if (childConfig.steps && childConfig.steps.length > 0) {
|
|
3355
|
-
accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
|
|
3339
|
+
accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
|
|
3356
3340
|
childTrajectory,
|
|
3357
3341
|
{ steps: childConfig.steps },
|
|
3358
3342
|
{
|
|
3359
3343
|
ordering: childConfig.ordering ?? "relaxed",
|
|
3360
|
-
compareStepData: childConfig.compareStepData ?? false,
|
|
3361
3344
|
allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
|
|
3362
3345
|
}
|
|
3363
3346
|
);
|
|
3364
3347
|
}
|
|
3365
3348
|
const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
|
|
3366
|
-
const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(childTrajectory, {
|
|
3349
|
+
const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(childTrajectory, {
|
|
3367
3350
|
maxSteps: childConfig.maxSteps,
|
|
3368
3351
|
maxTotalTokens: childConfig.maxTotalTokens,
|
|
3369
3352
|
maxTotalDurationMs: childConfig.maxTotalDurationMs,
|
|
3370
3353
|
noRedundantCalls: childConfig.noRedundantCalls ?? true
|
|
3371
3354
|
}) : void 0;
|
|
3372
3355
|
const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
|
|
3373
|
-
const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(childTrajectory, {
|
|
3356
|
+
const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(childTrajectory, {
|
|
3374
3357
|
blacklistedTools: childConfig.blacklistedTools,
|
|
3375
3358
|
blacklistedSequences: childConfig.blacklistedSequences
|
|
3376
3359
|
}) : void 0;
|
|
3377
|
-
const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(childTrajectory, {
|
|
3360
|
+
const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(childTrajectory, {
|
|
3378
3361
|
maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
|
|
3379
3362
|
});
|
|
3380
|
-
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
|
|
3363
|
+
const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
|
|
3381
3364
|
const scores = [];
|
|
3382
|
-
if (accuracy) scores.push({ weight:
|
|
3383
|
-
if (efficiency) scores.push({ weight:
|
|
3384
|
-
if (toolFailures && toolFailures.patterns.length > 0)
|
|
3365
|
+
if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
|
|
3366
|
+
if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
|
|
3367
|
+
if (toolFailures && toolFailures.patterns.length > 0)
|
|
3368
|
+
scores.push({ weight: weights.toolFailures, value: toolFailures.score });
|
|
3385
3369
|
if (blacklist) {
|
|
3386
3370
|
if (blacklist.score === 0) {
|
|
3387
3371
|
results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
|
|
3388
3372
|
continue;
|
|
3389
3373
|
}
|
|
3390
|
-
scores.push({ weight:
|
|
3374
|
+
scores.push({ weight: weights.blacklist, value: blacklist.score });
|
|
3391
3375
|
}
|
|
3392
3376
|
let levelScore = 1;
|
|
3393
3377
|
if (scores.length > 0) {
|
|
3394
3378
|
const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
|
|
3395
|
-
levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
|
|
3379
|
+
levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
|
|
3396
3380
|
}
|
|
3397
3381
|
let finalScore = levelScore;
|
|
3398
3382
|
if (nested.length > 0) {
|
|
@@ -3417,7 +3401,13 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
|
|
|
3417
3401
|
return results;
|
|
3418
3402
|
}
|
|
3419
3403
|
function createTrajectoryScorerCode(options = {}) {
|
|
3420
|
-
const { defaults = {} } = options;
|
|
3404
|
+
const { defaults = {}, weights: userWeights = {} } = options;
|
|
3405
|
+
const w = {
|
|
3406
|
+
accuracy: Math.max(0, userWeights.accuracy ?? 0.4),
|
|
3407
|
+
efficiency: Math.max(0, userWeights.efficiency ?? 0.3),
|
|
3408
|
+
toolFailures: Math.max(0, userWeights.toolFailures ?? 0.2),
|
|
3409
|
+
blacklist: Math.max(0, userWeights.blacklist ?? 0.1)
|
|
3410
|
+
};
|
|
3421
3411
|
return evals.createScorer({
|
|
3422
3412
|
id: "code-trajectory-scorer",
|
|
3423
3413
|
name: "Trajectory Scorer",
|
|
@@ -3432,32 +3422,31 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3432
3422
|
}
|
|
3433
3423
|
let accuracy;
|
|
3434
3424
|
if (config.steps && config.steps.length > 0) {
|
|
3435
|
-
accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
|
|
3425
|
+
accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
|
|
3436
3426
|
actualTrajectory,
|
|
3437
3427
|
{ steps: config.steps },
|
|
3438
3428
|
{
|
|
3439
3429
|
ordering: config.ordering ?? "relaxed",
|
|
3440
|
-
compareStepData: config.compareStepData ?? false,
|
|
3441
3430
|
allowRepeatedSteps: config.allowRepeatedSteps ?? true
|
|
3442
3431
|
}
|
|
3443
3432
|
);
|
|
3444
3433
|
}
|
|
3445
3434
|
const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
|
|
3446
|
-
const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(actualTrajectory, {
|
|
3435
|
+
const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(actualTrajectory, {
|
|
3447
3436
|
maxSteps: config.maxSteps,
|
|
3448
3437
|
maxTotalTokens: config.maxTotalTokens,
|
|
3449
3438
|
maxTotalDurationMs: config.maxTotalDurationMs,
|
|
3450
3439
|
noRedundantCalls: config.noRedundantCalls ?? true
|
|
3451
3440
|
}) : void 0;
|
|
3452
3441
|
const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
|
|
3453
|
-
const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(actualTrajectory, {
|
|
3442
|
+
const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(actualTrajectory, {
|
|
3454
3443
|
blacklistedTools: config.blacklistedTools,
|
|
3455
3444
|
blacklistedSequences: config.blacklistedSequences
|
|
3456
3445
|
}) : void 0;
|
|
3457
|
-
const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(actualTrajectory, {
|
|
3446
|
+
const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(actualTrajectory, {
|
|
3458
3447
|
maxRetriesPerTool: config.maxRetriesPerTool ?? 2
|
|
3459
3448
|
});
|
|
3460
|
-
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
|
|
3449
|
+
const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
|
|
3461
3450
|
return {
|
|
3462
3451
|
accuracy,
|
|
3463
3452
|
efficiency,
|
|
@@ -3473,16 +3462,16 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3473
3462
|
}
|
|
3474
3463
|
const scores = [];
|
|
3475
3464
|
if (accuracy) {
|
|
3476
|
-
scores.push({ weight:
|
|
3465
|
+
scores.push({ weight: w.accuracy, value: accuracy.score });
|
|
3477
3466
|
}
|
|
3478
3467
|
if (efficiency) {
|
|
3479
|
-
scores.push({ weight:
|
|
3468
|
+
scores.push({ weight: w.efficiency, value: efficiency.score });
|
|
3480
3469
|
}
|
|
3481
3470
|
if (toolFailures && toolFailures.patterns.length > 0) {
|
|
3482
|
-
scores.push({ weight:
|
|
3471
|
+
scores.push({ weight: w.toolFailures, value: toolFailures.score });
|
|
3483
3472
|
}
|
|
3484
3473
|
if (blacklist) {
|
|
3485
|
-
scores.push({ weight:
|
|
3474
|
+
scores.push({ weight: w.blacklist, value: blacklist.score });
|
|
3486
3475
|
}
|
|
3487
3476
|
if (scores.length === 0 && !nested) {
|
|
3488
3477
|
return 1;
|
|
@@ -3490,7 +3479,7 @@ function createTrajectoryScorerCode(options = {}) {
|
|
|
3490
3479
|
let levelScore = 1;
|
|
3491
3480
|
if (scores.length > 0) {
|
|
3492
3481
|
const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
|
|
3493
|
-
levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
|
|
3482
|
+
levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
|
|
3494
3483
|
}
|
|
3495
3484
|
if (nested && nested.length > 0) {
|
|
3496
3485
|
const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
|