@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunkXRUR5PBK_cjs = require('../../chunk-XRUR5PBK.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var zod = require('zod');
|
|
6
6
|
var nlp = require('compromise');
|
|
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
|
|
|
239
239
|
description: "Extract relevant statements from the LLM output",
|
|
240
240
|
outputSchema: extractOutputSchema,
|
|
241
241
|
createPrompt: ({ run }) => {
|
|
242
|
-
const assistantMessage =
|
|
242
|
+
const assistantMessage = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
243
243
|
return createExtractPrompt(assistantMessage);
|
|
244
244
|
}
|
|
245
245
|
}).analyze({
|
|
246
246
|
description: "Score the relevance of the statements to the input",
|
|
247
247
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
248
248
|
createPrompt: ({ run, results }) => {
|
|
249
|
-
const input =
|
|
249
|
+
const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
250
250
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
251
251
|
}
|
|
252
252
|
}).generateScore(({ results }) => {
|
|
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
|
|
|
263
263
|
}
|
|
264
264
|
}
|
|
265
265
|
const score = relevancyCount / numberOfResults;
|
|
266
|
-
return
|
|
266
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * options.scale);
|
|
267
267
|
}).generateReason({
|
|
268
268
|
description: "Reason about the results",
|
|
269
269
|
createPrompt: ({ run, results, score }) => {
|
|
270
270
|
return createReasonPrompt({
|
|
271
|
-
input:
|
|
272
|
-
output:
|
|
271
|
+
input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
272
|
+
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
273
273
|
score,
|
|
274
274
|
results: results.analyzeStepResult.results,
|
|
275
275
|
scale: options.scale
|
|
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
|
|
|
466
466
|
groundTruth: ""
|
|
467
467
|
});
|
|
468
468
|
}
|
|
469
|
-
const output =
|
|
469
|
+
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
470
470
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
471
471
|
return createExtractPrompt2({
|
|
472
472
|
output,
|
|
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
|
|
|
524
524
|
);
|
|
525
525
|
score -= extraInfoPenalty;
|
|
526
526
|
score = Math.max(0, Math.min(1, score));
|
|
527
|
-
return
|
|
527
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
528
528
|
}).generateReason({
|
|
529
529
|
description: "Generate explanation of similarity score",
|
|
530
530
|
createPrompt: ({ run, results, score }) => {
|
|
531
531
|
if (!run.groundTruth) {
|
|
532
532
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
533
533
|
}
|
|
534
|
-
const output =
|
|
534
|
+
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
535
535
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
536
536
|
return createReasonPrompt2({
|
|
537
537
|
output,
|
|
@@ -717,7 +717,7 @@ function createFaithfulnessScorer({
|
|
|
717
717
|
claims: zod.z.array(zod.z.string())
|
|
718
718
|
}),
|
|
719
719
|
createPrompt: ({ run }) => {
|
|
720
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
720
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
721
721
|
return prompt;
|
|
722
722
|
}
|
|
723
723
|
}).analyze({
|
|
@@ -741,14 +741,14 @@ function createFaithfulnessScorer({
|
|
|
741
741
|
return 0;
|
|
742
742
|
}
|
|
743
743
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
744
|
-
return
|
|
744
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
|
|
745
745
|
}).generateReason({
|
|
746
746
|
description: "Reason about the results",
|
|
747
747
|
createPrompt: ({ run, results, score }) => {
|
|
748
748
|
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
749
749
|
const prompt = createFaithfulnessReasonPrompt({
|
|
750
|
-
input:
|
|
751
|
-
output:
|
|
750
|
+
input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
751
|
+
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
752
752
|
context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
|
|
753
753
|
score,
|
|
754
754
|
scale: options?.scale || 1,
|
|
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
|
|
|
881
881
|
outputSchema: zod.z.object({
|
|
882
882
|
opinions: zod.z.array(zod.z.string())
|
|
883
883
|
}),
|
|
884
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
884
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
885
885
|
}).analyze({
|
|
886
886
|
description: "Score the relevance of the statements to the input",
|
|
887
887
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
888
888
|
createPrompt: ({ run, results }) => {
|
|
889
889
|
const prompt = createBiasAnalyzePrompt({
|
|
890
|
-
output:
|
|
890
|
+
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
891
891
|
opinions: results.preprocessStepResult?.opinions || []
|
|
892
892
|
});
|
|
893
893
|
return prompt;
|
|
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
|
|
|
898
898
|
}
|
|
899
899
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
900
900
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
901
|
-
return
|
|
901
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
902
902
|
}).generateReason({
|
|
903
903
|
description: "Reason about the results",
|
|
904
904
|
createPrompt: ({ score, results }) => {
|
|
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
|
|
|
1117
1117
|
claims: zod.z.array(zod.z.string())
|
|
1118
1118
|
}),
|
|
1119
1119
|
createPrompt: ({ run }) => {
|
|
1120
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1120
|
+
const prompt = createHallucinationExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1121
1121
|
return prompt;
|
|
1122
1122
|
}
|
|
1123
1123
|
}).analyze({
|
|
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
|
|
|
1145
1145
|
return 0;
|
|
1146
1146
|
}
|
|
1147
1147
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1148
|
-
return
|
|
1148
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
|
|
1149
1149
|
}).generateReason({
|
|
1150
1150
|
description: "Reason about the results",
|
|
1151
1151
|
createPrompt: async ({ run, results, score }) => {
|
|
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
|
|
|
1156
1156
|
context = options?.context ?? [];
|
|
1157
1157
|
}
|
|
1158
1158
|
const prompt = createHallucinationReasonPrompt({
|
|
1159
|
-
input:
|
|
1160
|
-
output:
|
|
1159
|
+
input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1160
|
+
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1161
1161
|
context,
|
|
1162
1162
|
score,
|
|
1163
1163
|
scale: options?.scale || 1,
|
|
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
|
|
|
1271
1271
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
1272
1272
|
createPrompt: ({ run }) => {
|
|
1273
1273
|
const prompt = createToxicityAnalyzePrompt({
|
|
1274
|
-
input:
|
|
1275
|
-
output:
|
|
1274
|
+
input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1275
|
+
output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1276
1276
|
});
|
|
1277
1277
|
return prompt;
|
|
1278
1278
|
}
|
|
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
}
|
|
1290
1290
|
const score = toxicityCount / numberOfVerdicts;
|
|
1291
|
-
return
|
|
1291
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1292
1292
|
}).generateReason({
|
|
1293
1293
|
description: "Reason about the results",
|
|
1294
1294
|
createPrompt: ({ results, score }) => {
|
|
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1422
1422
|
if (isInputInvalid || isOutputInvalid) {
|
|
1423
1423
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1424
1424
|
}
|
|
1425
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1425
|
+
const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
|
|
1426
1426
|
return {
|
|
1427
1427
|
actualTools,
|
|
1428
1428
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1432
1432
|
description: "Analyze the appropriateness of tool selections",
|
|
1433
1433
|
outputSchema: analyzeOutputSchema2,
|
|
1434
1434
|
createPrompt: ({ run, results }) => {
|
|
1435
|
-
const userInput =
|
|
1436
|
-
const agentResponse =
|
|
1435
|
+
const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1436
|
+
const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1437
1437
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1438
1438
|
return createAnalyzePrompt2({
|
|
1439
1439
|
userInput,
|
|
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1450
1450
|
}
|
|
1451
1451
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1452
1452
|
const totalToolCalls = evaluations.length;
|
|
1453
|
-
return
|
|
1453
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1454
1454
|
}).generateReason({
|
|
1455
1455
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1456
1456
|
createPrompt: ({ run, results, score }) => {
|
|
1457
|
-
const userInput =
|
|
1457
|
+
const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1458
1458
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1459
1459
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1460
1460
|
return createReasonPrompt3({
|
|
@@ -1659,8 +1659,8 @@ function createContextRelevanceScorerLLM({
|
|
|
1659
1659
|
description: "Analyze the relevance and utility of provided context",
|
|
1660
1660
|
outputSchema: analyzeOutputSchema3,
|
|
1661
1661
|
createPrompt: ({ run }) => {
|
|
1662
|
-
const userQuery =
|
|
1663
|
-
const agentResponse =
|
|
1662
|
+
const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1663
|
+
const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1664
1664
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1665
1665
|
if (context.length === 0) {
|
|
1666
1666
|
return createAnalyzePrompt3({
|
|
@@ -1708,11 +1708,11 @@ function createContextRelevanceScorerLLM({
|
|
|
1708
1708
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1709
1709
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1710
1710
|
const scaledScore = finalScore * (options.scale || 1);
|
|
1711
|
-
return
|
|
1711
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(scaledScore);
|
|
1712
1712
|
}).generateReason({
|
|
1713
1713
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1714
1714
|
createPrompt: ({ run, results, score }) => {
|
|
1715
|
-
const userQuery =
|
|
1715
|
+
const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1716
1716
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1717
1717
|
if (context.length === 0) {
|
|
1718
1718
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
@@ -1883,8 +1883,8 @@ function createContextPrecisionScorer({
|
|
|
1883
1883
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1884
1884
|
outputSchema: contextRelevanceOutputSchema,
|
|
1885
1885
|
createPrompt: ({ run }) => {
|
|
1886
|
-
const input =
|
|
1887
|
-
const output =
|
|
1886
|
+
const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1887
|
+
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1888
1888
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1889
1889
|
if (context.length === 0) {
|
|
1890
1890
|
throw new Error("No context available for evaluation");
|
|
@@ -1917,12 +1917,12 @@ function createContextPrecisionScorer({
|
|
|
1917
1917
|
}
|
|
1918
1918
|
const map = sumPrecision / relevantCount;
|
|
1919
1919
|
const score = map * (options.scale || 1);
|
|
1920
|
-
return
|
|
1920
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
|
|
1921
1921
|
}).generateReason({
|
|
1922
1922
|
description: "Reason about the context precision results",
|
|
1923
1923
|
createPrompt: ({ run, results, score }) => {
|
|
1924
|
-
const input =
|
|
1925
|
-
const output =
|
|
1924
|
+
const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1925
|
+
const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1926
1926
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1927
1927
|
return createContextPrecisionReasonPrompt({
|
|
1928
1928
|
input,
|
|
@@ -2177,8 +2177,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2177
2177
|
description: "Analyze the impact of noise on agent response quality",
|
|
2178
2178
|
outputSchema: analyzeOutputSchema4,
|
|
2179
2179
|
createPrompt: ({ run }) => {
|
|
2180
|
-
const originalQuery =
|
|
2181
|
-
const noisyResponse =
|
|
2180
|
+
const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2181
|
+
const noisyResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2182
2182
|
if (!originalQuery || !noisyResponse) {
|
|
2183
2183
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2184
2184
|
}
|
|
@@ -2221,11 +2221,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2221
2221
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2222
2222
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2223
2223
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2224
|
-
return
|
|
2224
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
|
|
2225
2225
|
}).generateReason({
|
|
2226
2226
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2227
2227
|
createPrompt: ({ run, results, score }) => {
|
|
2228
|
-
const originalQuery =
|
|
2228
|
+
const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2229
2229
|
const analysisResult = results.analyzeStepResult;
|
|
2230
2230
|
if (!analysisResult) {
|
|
2231
2231
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -2550,9 +2550,9 @@ function createPromptAlignmentScorerLLM({
|
|
|
2550
2550
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
2551
2551
|
outputSchema: analyzeOutputSchema5,
|
|
2552
2552
|
createPrompt: ({ run }) => {
|
|
2553
|
-
const userPrompt =
|
|
2554
|
-
const systemPrompt =
|
|
2555
|
-
const agentResponse =
|
|
2553
|
+
const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2554
|
+
const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2555
|
+
const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2556
2556
|
if (evaluationMode === "user" && !userPrompt) {
|
|
2557
2557
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
2558
2558
|
}
|
|
@@ -2588,12 +2588,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
2588
2588
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
2589
2589
|
}
|
|
2590
2590
|
const finalScore = weightedScore * scale;
|
|
2591
|
-
return
|
|
2591
|
+
return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
|
|
2592
2592
|
}).generateReason({
|
|
2593
2593
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
2594
2594
|
createPrompt: ({ run, results, score }) => {
|
|
2595
|
-
const userPrompt =
|
|
2596
|
-
const systemPrompt =
|
|
2595
|
+
const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2596
|
+
const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2597
2597
|
const analysis = results.analyzeStepResult;
|
|
2598
2598
|
if (!analysis) {
|
|
2599
2599
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -2609,6 +2609,245 @@ function createPromptAlignmentScorerLLM({
|
|
|
2609
2609
|
}
|
|
2610
2610
|
});
|
|
2611
2611
|
}
|
|
2612
|
+
|
|
2613
|
+
// src/scorers/llm/trajectory/prompts.ts
|
|
2614
|
+
// Instruction text supplied as the judge's `instructions` by the LLM trajectory scorer.
// Assembled one line per entry; the leading and trailing empty entries give the
// string its opening and closing newline.
var TRAJECTORY_EVALUATION_INSTRUCTIONS = [
  "",
  "You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.",
  "",
  "CORE RESPONSIBILITIES:",
  "- Analyze the full sequence of actions the agent took",
  "- Evaluate whether each step was necessary and well-ordered",
  "- Identify unnecessary, redundant, or missing steps",
  "- Assess the overall quality of the agent's action path",
  "",
  "EVALUATION PHILOSOPHY:",
  "- Consider both the individual steps AND the overall flow",
  "- A good trajectory is efficient, logical, and complete",
  "- Redundant steps reduce quality even if the final result is correct",
  "- Missing critical steps are a significant issue",
  "- Order matters: logical dependencies should be respected",
  "",
  "OUTPUT REQUIREMENTS:",
  "- Provide clear reasoning for your trajectory assessment",
  "- Use provided JSON schema exactly as specified",
  "- Be consistent in your evaluation standards",
  ""
].join("\n");
|
|
2635
|
+
/**
 * Builds the analysis prompt for the trajectory judge.
 *
 * The prompt always contains the user request, the agent's final response and
 * the actual trajectory text. When `expectedTrajectory` is truthy, the expected
 * trajectory and comparison-oriented criteria are appended; otherwise a
 * task-based criteria list is appended instead.
 *
 * @param {object} params
 * @param {string} params.userInput - Text of the user request.
 * @param {string} params.agentResponse - Text of the agent's final response.
 * @param {string} params.actualTrajectory - Pre-formatted actual step list.
 * @param {string} [params.expectedTrajectory] - Pre-formatted expected step list.
 * @returns {string} The assembled prompt.
 */
var createAnalyzePrompt6 = ({
  userInput,
  agentResponse,
  actualTrajectory,
  expectedTrajectory
}) => {
  const header = [
    "",
    "You are evaluating whether an AI agent took an appropriate sequence of actions to fulfill a user request.",
    "",
    `USER REQUEST: "${userInput}"`,
    `AGENT FINAL RESPONSE: "${agentResponse}"`,
    "",
    "ACTUAL TRAJECTORY (sequence of actions the agent took):",
    `${actualTrajectory}`,
    ""
  ].join("\n");

  const criteria = expectedTrajectory
    ? [
        "",
        "EXPECTED TRAJECTORY (the ideal sequence):",
        `${expectedTrajectory}`,
        "",
        "EVALUATION CRITERIA:",
        "1. STEP PRESENCE: Did the agent perform all expected steps?",
        "2. STEP ORDER: Were the steps in a logical order? (Expected order is a guideline, not absolute)",
        "3. EXTRA STEPS: Did the agent take unnecessary steps not in the expected trajectory?",
        "4. MISSING STEPS: Are any expected steps missing from the actual trajectory?",
        "5. STEP QUALITY: For each step that matches, was it executed appropriately?",
        "",
        "For each actual step, evaluate:",
        "- Does it correspond to an expected step?",
        "- Was it necessary for the task?",
        "- Was it in the right position in the sequence?",
        ""
      ].join("\n")
    : [
        "",
        "EVALUATION CRITERIA (no expected trajectory provided - evaluate based on the task):",
        "1. COMPLETENESS: Did the agent take all necessary steps to fulfill the request?",
        "2. EFFICIENCY: Were there any redundant or unnecessary steps?",
        "3. ORDERING: Were the steps in a logical order given their dependencies?",
        "4. APPROPRIATENESS: Was each step appropriate for the task?",
        ""
      ].join("\n");

  const footer = "\nEvaluate each step and the overall trajectory quality.\n";
  return header + criteria + footer;
};
|
|
2681
|
+
/**
 * Builds the prompt asking the judge for a one-sentence explanation of a
 * trajectory score.
 *
 * @param {object} params
 * @param {string} params.userInput - Text of the user request.
 * @param {number} params.score - Score being explained; rendered as `<score>/1`.
 * @param {Array} params.stepEvaluations - Per-step evaluations, embedded as JSON.
 * @param {Array} params.missingSteps - Missing step names, embedded as JSON.
 * @param {Array} params.extraSteps - Extra step names, embedded as JSON.
 * @returns {string} The assembled prompt.
 */
var createReasonPrompt7 = ({
  userInput,
  score,
  stepEvaluations,
  missingSteps,
  extraSteps
}) => {
  const lines = [
    "",
    "Explain this trajectory evaluation in ONE SENTENCE.",
    "",
    `User Request: "${userInput}"`,
    `Score: ${score}/1`,
    `Steps Evaluated: ${JSON.stringify(stepEvaluations)}`,
    `Missing Steps: ${JSON.stringify(missingSteps)}`,
    `Extra/Unnecessary Steps: ${JSON.stringify(extraSteps)}`,
    "",
    "Provide a single, concise sentence explaining why this score was given.",
    ""
  ];
  return lines.join("\n");
};
|
|
2700
|
+
|
|
2701
|
+
// src/scorers/llm/trajectory/index.ts
|
|
2702
|
+
// Zod schema used as the `outputSchema` of the trajectory scorer's analyze step:
// a list of per-step evaluations, optional lists of missing and extra step
// names, and a required overall assessment string.
var analyzeOutputSchema6 = (() => {
  const { z } = zod;

  // Shape of a single evaluated step.
  const stepEvaluation = z.object({
    stepName: z.string().describe("Name of the step (tool name or action)"),
    wasNecessary: z.boolean().describe("Whether this step was necessary for the task"),
    wasInOrder: z.boolean().describe("Whether this step was in a logical position in the sequence"),
    reasoning: z.string().describe("Brief explanation of the evaluation")
  });

  return z.object({
    stepEvaluations: z.array(stepEvaluation),
    missingSteps: z.array(z.string()).optional().describe("Steps that should have been taken but were not"),
    extraSteps: z.array(z.string()).optional().describe("Steps that were unnecessary or redundant"),
    overallAssessment: z.string().describe("Brief overall assessment of the trajectory quality")
  });
})();
|
|
2715
|
+
/**
 * Renders the type-specific detail suffix for one trajectory step.
 *
 * - `tool_call` / `mcp_tool_call`: JSON of `toolArgs` and/or `toolResult`,
 *   whichever are not `undefined`.
 * - `model_generation`: the `modelId`, when truthy.
 * - `workflow_step`: JSON of `output`, when not `undefined`.
 * - anything else: no suffix.
 *
 * @param {object} step - Trajectory step; `stepType` selects the rendering.
 * @returns {string} A suffix such as ` (args: {...})`, or `""` when there is
 *   nothing to show.
 */
function formatStepDetails(step) {
  const kind = step.stepType;

  if (kind === "tool_call" || kind === "mcp_tool_call") {
    const details = [
      ["args", step.toolArgs],
      ["result", step.toolResult]
    ]
      .filter(([, value]) => value !== void 0)
      .map(([label, value]) => `${label}: ${JSON.stringify(value)}`);
    return details.length > 0 ? ` (${details.join(", ")})` : "";
  }

  if (kind === "model_generation") {
    return step.modelId ? ` (model: ${step.modelId})` : "";
  }

  if (kind === "workflow_step") {
    return step.output !== void 0 ? ` (output: ${JSON.stringify(step.output)})` : "";
  }

  return "";
}
|
|
2732
|
+
/**
 * Renders a trajectory as a numbered, newline-separated list of steps.
 *
 * Each line has the form `<n>. [<stepType>] <name><details>`, where the details
 * suffix comes from `formatStepDetails`. A step with a non-empty `children`
 * array is followed by its children rendered the same way, one indent level
 * deeper.
 *
 * @param {{ steps: Array<object> }} trajectory - Object whose `steps` are rendered.
 * @param {number} [indent=0] - Nesting depth; one space of prefix per level.
 * @returns {string} The formatted list (`""` when there are no steps).
 */
function formatTrajectory(trajectory, indent = 0) {
  const pad = " ".repeat(indent);

  const renderStep = (step, index) => {
    const head = `${pad}${index + 1}. [${step.stepType}] ${step.name}${formatStepDetails(step)}`;
    const nested = step.children;
    return nested && nested.length > 0
      ? `${head}\n${formatTrajectory({ steps: nested }, indent + 1)}`
      : head;
  };

  return trajectory.steps.map(renderStep).join("\n");
}
|
|
2743
|
+
/**
 * Renders expected steps as a numbered, newline-separated list.
 *
 * Each line has the form `<n>. [<stepType>] <name> (data: <json>)`; the type
 * tag and the data suffix are omitted when `stepType` / `data` are falsy.
 * A step whose `children.steps` is non-empty is followed by those steps
 * rendered the same way, one indent level deeper.
 *
 * @param {Array<object>} steps - Expected steps to render.
 * @param {number} [indent=0] - Nesting depth; one space of prefix per level.
 * @returns {string} The formatted list (`""` when `steps` is empty).
 */
function formatExpectedSteps(steps, indent = 0) {
  const pad = " ".repeat(indent);

  const renderStep = (step, index) => {
    const typeLabel = step.stepType ? `[${step.stepType}] ` : "";
    const dataLabel = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
    const head = `${pad}${index + 1}. ${typeLabel}${step.name}${dataLabel}`;
    const nested = step.children?.steps;
    return nested && nested.length > 0
      ? `${head}\n${formatExpectedSteps(nested, indent + 1)}`
      : head;
  };

  return steps.map(renderStep).join("\n");
}
|
|
2756
|
+
/**
 * Creates an LLM-judged scorer (type "trajectory") that rates the sequence of
 * steps an agent took.
 *
 * Pipeline:
 *  1. preprocess - formats the actual trajectory (`run.output`) and, when
 *     available, the expected steps as text.
 *  2. analyze - asks the judge to evaluate each step against `analyzeOutputSchema6`.
 *  3. generateScore - combines the judge's per-step verdicts into a 0..1 score.
 *  4. generateReason - asks the judge for a one-sentence explanation.
 *
 * @param {object} params
 * @param {*} params.model - Passed through as the judge model.
 * @param {Array<object>|{steps: Array<object>}} [params.expectedTrajectory] -
 *   Optional static expectation. An array is used as the expected steps as-is;
 *   an object with `steps` is treated as a recorded trajectory and converted
 *   to expected steps. When omitted, `run.expectedTrajectory.steps` is used if
 *   it is non-empty.
 * @returns The scorer produced by the `createScorer` builder chain.
 */
function createTrajectoryAccuracyScorerLLM({
  model,
  expectedTrajectory: staticExpectedTrajectory
}) {
  // Converts one recorded trajectory step into the expected-step shape that
  // formatExpectedSteps renders ({ name, stepType, data?, children?: { steps } }).
  // Recurses through children so nested steps keep their data and their own
  // descendants instead of being flattened to name/stepType one level down.
  const toExpectedStep = (s) => {
    const result = { name: s.name, stepType: s.stepType };
    const data = {};
    const isToolCall = s.stepType === "tool_call" || s.stepType === "mcp_tool_call";
    if (isToolCall && s.toolArgs !== void 0) data.input = s.toolArgs;
    if (isToolCall && s.toolResult !== void 0) data.output = s.toolResult;
    if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
    if (Object.keys(data).length > 0) result.data = data;
    if (s.children && s.children.length > 0) {
      result.children = { steps: s.children.map((child) => toExpectedStep(child)) };
    }
    return result;
  };

  return evals.createScorer({
    id: "llm-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy (LLM)",
    description: staticExpectedTrajectory ? "Evaluates the trajectory against an expected trajectory using LLM analysis" : "Evaluates the quality and appropriateness of the trajectory using LLM analysis",
    judge: {
      model,
      instructions: TRAJECTORY_EVALUATION_INSTRUCTIONS
    },
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    let expectedSteps;
    if (staticExpectedTrajectory) {
      // The static expectation takes precedence over anything on the run.
      expectedSteps = Array.isArray(staticExpectedTrajectory)
        ? staticExpectedTrajectory
        : staticExpectedTrajectory.steps.map((s) => toExpectedStep(s));
    } else if (run.expectedTrajectory) {
      const expectation = run.expectedTrajectory;
      // An empty step list is treated the same as no expectation at all.
      expectedSteps = expectation.steps && expectation.steps.length > 0 ? expectation.steps : void 0;
    }
    return {
      actualTrajectory,
      actualTrajectoryFormatted: formatTrajectory(actualTrajectory),
      expectedTrajectoryFormatted: expectedSteps ? formatExpectedSteps(expectedSteps) : void 0,
      hasSteps: actualTrajectory.steps.length > 0
    };
  }).analyze({
    description: "Analyze the quality and appropriateness of the agent trajectory",
    outputSchema: analyzeOutputSchema6,
    createPrompt: ({ run, results }) => {
      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
      return createAnalyzePrompt6({
        userInput,
        agentResponse,
        // `||` rather than `??`: formatTrajectory yields "" for a trajectory with
        // no steps, and that empty string must also trigger the placeholder.
        actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted || "No steps taken",
        expectedTrajectory: results.preprocessStepResult?.expectedTrajectoryFormatted
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    const stepEvaluations = analysis?.stepEvaluations || [];
    const missingSteps = analysis?.missingSteps || [];
    if (stepEvaluations.length === 0) {
      // Nothing was evaluated: score from the missing/extra lists alone.
      const extraSteps = analysis?.extraSteps || [];
      if (missingSteps.length > 0) return 0;
      if (extraSteps.length > 0) return 0.5;
      return 1;
    }
    const totalSteps = stepEvaluations.length;
    const necessarySteps = stepEvaluations.filter((e) => e.wasNecessary).length;
    const orderedSteps = stepEvaluations.filter((e) => e.wasInOrder).length;
    // Fraction of missing steps relative to everything that should have existed.
    const missingPenalty = missingSteps.length > 0 ? missingSteps.length / (totalSteps + missingSteps.length) : 0;
    const necessityScore = necessarySteps / totalSteps;
    const orderScore = orderedSteps / totalSteps;
    // NOTE(review): with weights 0.6 + 0.3 and a subtractive 0.1 penalty, a
    // flawless trajectory scores at most 0.9, while the no-evaluations branch
    // above can return 1. Confirm whether the last term was meant to be
    // `(1 - missingPenalty) * 0.1`; left unchanged to keep existing scores stable.
    const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
    return chunkXRUR5PBK_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
  }).generateReason({
    description: "Generate human-readable explanation of trajectory evaluation",
    createPrompt: ({ run, results, score }) => {
      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
      const missingSteps = results.analyzeStepResult?.missingSteps || [];
      const extraSteps = results.analyzeStepResult?.extraSteps || [];
      return createReasonPrompt7({
        userInput,
        score,
        stepEvaluations,
        missingSteps,
        extraSteps
      });
    }
  });
}
|
|
2612
2851
|
/**
 * Folds a string for accent- and case-insensitive comparison.
 *
 * The input is canonically decomposed (NFD), code points in the
 * U+0300–U+036F combining-marks range are removed, and the remainder is
 * lower-cased.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The lower-cased text without those combining marks.
 */
function normalizeString(str) {
  const decomposed = str.normalize("NFD");
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  return withoutMarks.toLowerCase();
}
|
|
@@ -2658,18 +2897,18 @@ function createCompletenessScorer() {
|
|
|
2658
2897
|
type: "agent"
|
|
2659
2898
|
}).preprocess(async ({ run }) => {
|
|
2660
2899
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
2661
|
-
const content =
|
|
2900
|
+
const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
|
|
2662
2901
|
return content === null || content === void 0;
|
|
2663
2902
|
});
|
|
2664
2903
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
2665
|
-
const content =
|
|
2904
|
+
const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
|
|
2666
2905
|
return content === null || content === void 0;
|
|
2667
2906
|
});
|
|
2668
2907
|
if (isInputInvalid || isOutputInvalid) {
|
|
2669
2908
|
throw new Error("Inputs cannot be null or undefined");
|
|
2670
2909
|
}
|
|
2671
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
2672
|
-
const output = run.output?.map((i) =>
|
|
2910
|
+
const input = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2911
|
+
const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2673
2912
|
const inputToProcess = input;
|
|
2674
2913
|
const outputToProcess = output;
|
|
2675
2914
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -2774,8 +3013,8 @@ function createTextualDifferenceScorer() {
|
|
|
2774
3013
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
2775
3014
|
type: "agent"
|
|
2776
3015
|
}).preprocess(async ({ run }) => {
|
|
2777
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
2778
|
-
const output = run.output?.map((i) =>
|
|
3016
|
+
const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3017
|
+
const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2779
3018
|
const ratio = calculateRatio(input, output);
|
|
2780
3019
|
const changes = countChanges(input, output);
|
|
2781
3020
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -2798,8 +3037,8 @@ function createKeywordCoverageScorer() {
|
|
|
2798
3037
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
2799
3038
|
type: "agent"
|
|
2800
3039
|
}).preprocess(async ({ run }) => {
|
|
2801
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
2802
|
-
const output = run.output?.map((i) =>
|
|
3040
|
+
const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3041
|
+
const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2803
3042
|
if (!input && !output) {
|
|
2804
3043
|
return {
|
|
2805
3044
|
result: {
|
|
@@ -2852,8 +3091,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
2852
3091
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
2853
3092
|
type: "agent"
|
|
2854
3093
|
}).preprocess(async ({ run }) => {
|
|
2855
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
2856
|
-
let processedOutput = run.output.map((i) =>
|
|
3094
|
+
let processedInput = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
3095
|
+
let processedOutput = run.output.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2857
3096
|
if (ignoreCase) {
|
|
2858
3097
|
processedInput = processedInput.toLowerCase();
|
|
2859
3098
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -2883,7 +3122,7 @@ function createToneScorer(config = {}) {
|
|
|
2883
3122
|
type: "agent"
|
|
2884
3123
|
}).preprocess(async ({ run }) => {
|
|
2885
3124
|
const sentiment = new Sentiment__default.default();
|
|
2886
|
-
const agentMessage = run.output?.map((i) =>
|
|
3125
|
+
const agentMessage = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2887
3126
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
2888
3127
|
if (referenceTone) {
|
|
2889
3128
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -2970,7 +3209,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
2970
3209
|
if (isInputInvalid || isOutputInvalid) {
|
|
2971
3210
|
throw new Error("Input and output messages cannot be null or empty");
|
|
2972
3211
|
}
|
|
2973
|
-
const { tools: actualTools, toolCallInfos } =
|
|
3212
|
+
const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
|
|
2974
3213
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
2975
3214
|
return {
|
|
2976
3215
|
expectedTool,
|
|
@@ -2995,6 +3234,343 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
2995
3234
|
});
|
|
2996
3235
|
});
|
|
2997
3236
|
}
|
|
3237
|
+
/**
 * Converts a recorded trajectory step into an expected-step descriptor.
 *
 * The descriptor always carries `name` and `stepType`. A `data` object is
 * attached only when there is something to compare:
 *   - tool_call / mcp_tool_call: `toolArgs` -> `data.input`,
 *     `toolResult` -> `data.output`
 *   - workflow_step: `output` -> `data.output`
 * Child steps, when present, are converted recursively into
 * `children.steps`.
 *
 * @param {object} step - A trajectory step.
 * @returns {object} `{ name, stepType, data?, children? }`.
 */
function trajectoryStepToExpectedStep(step) {
  const { name, stepType, children } = step;
  const data = {};
  switch (stepType) {
    case "tool_call":
    case "mcp_tool_call":
      if (step.toolArgs !== void 0) data.input = step.toolArgs;
      if (step.toolResult !== void 0) data.output = step.toolResult;
      break;
    case "workflow_step":
      if (step.output !== void 0) data.output = step.output;
      break;
    default:
      // Other step types contribute no comparable data.
      break;
  }
  const expected = { name, stepType };
  if (Object.keys(data).length > 0) {
    expected.data = data;
  }
  if (children && children.length > 0) {
    expected.children = { steps: children.map(trajectoryStepToExpectedStep) };
  }
  return expected;
}
|
|
3254
|
+
/**
 * Extracts the expected step list from a trajectory expectation.
 *
 * @param {object} expectation - Expectation object with an optional `steps` list.
 * @returns {Array|undefined} `expectation.steps` as-is, or `undefined` when
 *   it is missing or has length 0.
 */
function expectationToExpectedSteps(expectation) {
  const { steps } = expectation;
  const hasSteps = Boolean(steps) && steps.length !== 0;
  return hasSteps ? expectation.steps : void 0;
}
|
|
3258
|
+
/**
 * Creates a code-based trajectory accuracy scorer.
 *
 * Expected steps come from `options.expectedTrajectory` when it resolves to a
 * step list, otherwise from each run's `expectedTrajectory` (dataset item).
 * The actual trajectory (`run.output`) is compared against them with
 * `compareTrajectories`, and the comparison's `score` is the scorer's score.
 * When no expected steps can be resolved the score is 0 and the preprocess
 * result carries an `error` message.
 *
 * @param {object} [options]
 * @param {Array|object} [options.expectedTrajectory] - Either an array of
 *   expected steps, or an object with a `steps` array of trajectory steps
 *   (converted via `trajectoryStepToExpectedStep`).
 * @param {object} [options.comparisonOptions]
 * @param {string} [options.comparisonOptions.ordering] - Explicit ordering mode;
 *   takes precedence over `strictOrder`.
 * @param {boolean} [options.comparisonOptions.strictOrder] - Shorthand: truthy
 *   selects "strict", otherwise "relaxed".
 * @param {boolean} [options.comparisonOptions.compareStepData=false]
 * @param {boolean} [options.comparisonOptions.allowRepeatedSteps=true]
 * @returns The scorer produced by `evals.createScorer(...)`.
 */
function createTrajectoryAccuracyScorerCode(options = {}) {
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");

  // Resolve the statically configured expectation into a list of expected steps.
  const resolveStaticExpectedSteps = (expected) => {
    if (!expected) return void 0;
    // A non-empty array whose first element has no `steps` key is treated as
    // a ready-made list of expected steps and used unchanged.
    if (Array.isArray(expected) && expected.length > 0 && !("steps" in expected[0])) {
      return expected;
    }
    // Otherwise an object exposing `steps` is treated as a trajectory whose
    // steps are converted into expected-step descriptors.
    if ("steps" in expected) {
      return expected.steps.map(trajectoryStepToExpectedStep);
    }
    return void 0;
  };
  const staticExpectedSteps = resolveStaticExpectedSteps(staticExpectedTrajectory);

  const description = staticExpectedSteps
    ? `Evaluates whether the trajectory matches the expected path: [${staticExpectedSteps.map((s) => s.name).join(" \u2192 ")}] (${resolvedOrdering} ordering)`
    : `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;

  return evals.createScorer({
    id: "code-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy Scorer",
    description,
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    const itemExpectation = run.expectedTrajectory;

    // Static steps win; the dataset item's steps are only a fallback.
    let resolvedExpectedSteps = staticExpectedSteps;
    if (!resolvedExpectedSteps && itemExpectation) {
      resolvedExpectedSteps = expectationToExpectedSteps(itemExpectation);
    }

    if (!resolvedExpectedSteps || resolvedExpectedSteps.length === 0) {
      return {
        actualTrajectory,
        expectedTrajectory: void 0,
        comparison: void 0,
        actualStepNames: actualTrajectory.steps.map((s) => s.name),
        expectedStepNames: [],
        error: "No expected trajectory provided (pass via options or dataset item expectedTrajectory)"
      };
    }

    // Per-item comparison settings override the factory-level ones, even when
    // the expected steps themselves came from the static options.
    const comparison = chunkXRUR5PBK_cjs.compareTrajectories(
      actualTrajectory,
      { steps: resolvedExpectedSteps },
      {
        ordering: itemExpectation?.ordering ?? resolvedOrdering,
        compareStepData: itemExpectation?.compareStepData ?? compareStepData,
        allowRepeatedSteps: itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps
      }
    );
    return {
      actualTrajectory,
      expectedTrajectory: { steps: resolvedExpectedSteps },
      comparison,
      actualStepNames: actualTrajectory.steps.map((s) => s.name),
      expectedStepNames: resolvedExpectedSteps.map((s) => s.name)
    };
  }).generateScore(({ results }) => {
    // No comparison means no expectation was available: score 0.
    const comparison = results.preprocessStepResult?.comparison;
    return comparison ? comparison.score : 0;
  });
}
|
|
3320
|
+
/**
 * Evaluates expectations declared on the `children` of expected steps against
 * the children of the matching actual steps, recursively.
 *
 * Only expected steps that carry a `children` config produce a result. Each
 * is paired with the first not-yet-matched actual step that has the same
 * `name` (and the same `stepType` when the expectation specifies one).
 *
 * Scoring per expected step:
 *   - no matching actual step, or the match has no children: score 0
 *   - a blacklist score of 0 at this level or in any nested result: score 0
 *   - otherwise a weighted average of the dimensions that were evaluated
 *     (accuracy 0.4, efficiency 0.3, tool failures 0.2, blacklist 0.1;
 *     weights are renormalized over the dimensions present; 1 when none),
 *     blended 70/30 with the mean of nested scores when there are any, and
 *     rounded to two decimals.
 *
 * @param {Array<object>} expectedSteps - Expected steps, possibly with `children` configs.
 * @param {Array<object>} actualSteps - Actual steps at the same level.
 * @returns {Array<object>} One result per expected step with `children`:
 *   `{ stepName, score, accuracy?, efficiency?, blacklist?, toolFailures?, nested? }`.
 *   `nested` is `undefined` whenever there are no nested results.
 */
function evaluateNestedExpectations(expectedSteps, actualSteps) {
  const results = [];
  // Indices of actual steps already paired, so repeated names pair one-to-one.
  const matchedIndices = /* @__PURE__ */ new Set();
  for (const expectedStep of expectedSteps) {
    const childConfig = expectedStep.children;
    if (!childConfig) continue;
    const matchIndex = actualSteps.findIndex(
      (s, i) => !matchedIndices.has(i) && s.name === expectedStep.name && (!expectedStep.stepType || s.stepType === expectedStep.stepType)
    );
    const actualStep = matchIndex >= 0 ? actualSteps[matchIndex] : void 0;
    if (matchIndex >= 0) matchedIndices.add(matchIndex);

    if (!actualStep?.children || actualStep.children.length === 0) {
      // Nothing to compare against: every expected child step is missing.
      const expectedStepCount = childConfig.steps?.length ?? 0;
      results.push({
        stepName: expectedStep.name,
        score: 0,
        accuracy: expectedStepCount > 0 ? {
          score: 0,
          matchedSteps: 0,
          totalExpectedSteps: expectedStepCount,
          totalActualSteps: 0,
          missingSteps: childConfig.steps.map((s) => s.name),
          extraSteps: [],
          outOfOrderSteps: [],
          repeatedSteps: []
        } : void 0
      });
      continue;
    }

    const childTrajectory = {
      steps: actualStep.children,
      totalDurationMs: actualStep.durationMs
    };

    let accuracy;
    if (childConfig.steps && childConfig.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        childTrajectory,
        { steps: childConfig.steps },
        {
          ordering: childConfig.ordering ?? "relaxed",
          compareStepData: childConfig.compareStepData ?? false,
          allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
        }
      );
    }

    // Efficiency and blacklist checks run only when configured.
    const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(childTrajectory, {
      maxSteps: childConfig.maxSteps,
      maxTotalTokens: childConfig.maxTotalTokens,
      maxTotalDurationMs: childConfig.maxTotalDurationMs,
      noRedundantCalls: childConfig.noRedundantCalls ?? true
    }) : void 0;
    const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(childTrajectory, {
      blacklistedTools: childConfig.blacklistedTools,
      blacklistedSequences: childConfig.blacklistedSequences
    }) : void 0;
    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(childTrajectory, {
      maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
    });

    const nestedResults = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
    // Report `nested` consistently: undefined whenever nothing nested was evaluated.
    const nested = nestedResults.length > 0 ? nestedResults : void 0;

    // A blacklist violation here or at any nested level vetoes the whole step.
    const violatedHere = Boolean(blacklist) && blacklist.score === 0;
    const violatedBelow = nestedResults.some((r) => r.blacklist && r.blacklist.score === 0);
    if (violatedHere || violatedBelow) {
      results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
      continue;
    }

    const scores = [];
    if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
    if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
    if (blacklist) scores.push({ weight: 0.1, value: blacklist.score });

    let levelScore = 1;
    if (scores.length > 0) {
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }

    let finalScore = levelScore;
    if (nestedResults.length > 0) {
      const nestedAvg = nestedResults.reduce((sum, r) => sum + r.score, 0) / nestedResults.length;
      finalScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }

    results.push({
      stepName: expectedStep.name,
      score: Math.round(finalScore * 100) / 100,
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested
    });
  }
  return results;
}
|
|
3419
|
+
/**
 * Creates a code-based, multi-dimensional trajectory scorer.
 *
 * For each run, the effective config is `options.defaults` overlaid with the
 * run's `expectedTrajectory`. Depending on what the config specifies, the
 * actual trajectory (`run.output`) is checked for:
 *   - accuracy     (when `steps` is non-empty)                    weight 0.4
 *   - efficiency   (when any budget / `noRedundantCalls` is set)  weight 0.3
 *   - tool failures (always analyzed; scored only if patterns were found) weight 0.2
 *   - blacklist    (when blacklisted tools/sequences are set)     weight 0.1
 * Weights are renormalized over the dimensions that were evaluated. A
 * blacklist score of 0, at the top level or in any nested result, forces the
 * final score to 0. When nested results exist, the level score is blended
 * 70/30 with their mean. The final score is rounded to two decimals; with
 * nothing to evaluate the score is 1.
 *
 * @param {object} [options]
 * @param {object} [options.defaults] - Default expectation config applied to every run.
 * @returns The scorer produced by `evals.createScorer(...)`.
 */
function createTrajectoryScorerCode(options = {}) {
  const { defaults = {} } = options;
  return evals.createScorer({
    id: "code-trajectory-scorer",
    name: "Trajectory Scorer",
    description: "Multi-dimensional trajectory evaluation: accuracy, efficiency, blacklist, and tool failures",
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    const itemExpectation = run.expectedTrajectory ?? {};
    const config = { ...defaults, ...itemExpectation };
    // The spread above already copies an own enumerable `steps`; this explicit
    // assignment additionally covers a `steps` value the spread would not copy
    // (e.g. inherited), so item steps always replace default steps wholesale.
    if (itemExpectation.steps !== void 0) {
      config.steps = itemExpectation.steps;
    }

    let accuracy;
    if (config.steps && config.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        actualTrajectory,
        { steps: config.steps },
        {
          ordering: config.ordering ?? "relaxed",
          compareStepData: config.compareStepData ?? false,
          allowRepeatedSteps: config.allowRepeatedSteps ?? true
        }
      );
    }

    const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(actualTrajectory, {
      maxSteps: config.maxSteps,
      maxTotalTokens: config.maxTotalTokens,
      maxTotalDurationMs: config.maxTotalDurationMs,
      noRedundantCalls: config.noRedundantCalls ?? true
    }) : void 0;

    const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(actualTrajectory, {
      blacklistedTools: config.blacklistedTools,
      blacklistedSequences: config.blacklistedSequences
    }) : void 0;

    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(actualTrajectory, {
      maxRetriesPerTool: config.maxRetriesPerTool ?? 2
    });

    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
    return {
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested && nested.length > 0 ? nested : void 0,
      config
    };
  }).generateScore(({ results }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    // A top-level blacklist violation is a hard failure.
    if (blacklist && blacklist.score === 0) {
      return 0;
    }
    const scores = [];
    if (accuracy) {
      scores.push({ weight: 0.4, value: accuracy.score });
    }
    if (efficiency) {
      scores.push({ weight: 0.3, value: efficiency.score });
    }
    if (toolFailures && toolFailures.patterns.length > 0) {
      scores.push({ weight: 0.2, value: toolFailures.score });
    }
    if (blacklist) {
      scores.push({ weight: 0.1, value: blacklist.score });
    }
    if (scores.length === 0 && !nested) {
      return 1;
    }
    let levelScore = 1;
    if (scores.length > 0) {
      // Renormalize so the evaluated dimensions' weights sum to 1.
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }
    if (nested && nested.length > 0) {
      const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
      if (hasNestedBlacklistViolation) {
        return 0;
      }
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      levelScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    return Math.round(levelScore * 100) / 100;
  }).generateReason(({ results, score }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    const parts = [];
    parts.push(`Score: ${score}`);

    if (blacklist && blacklist.score === 0) {
      const violations = [];
      if (blacklist.violatedTools.length > 0) {
        violations.push(`forbidden tools used: ${blacklist.violatedTools.join(", ")}`);
      }
      if (blacklist.violatedSequences.length > 0) {
        violations.push(`forbidden sequences: ${blacklist.violatedSequences.map((s) => s.join(" \u2192 ")).join("; ")}`);
      }
      // Avoid the dangling "Blacklist violation: ." when no detail is available.
      parts.push(violations.length > 0 ? `Blacklist violation: ${violations.join(". ")}.` : "Blacklist violation.");
      return parts.join("\n");
    }

    if (nested && nested.some((r) => r.blacklist && r.blacklist.score === 0)) {
      const violating = nested.filter((r) => r.blacklist && r.blacklist.score === 0).map((r) => r.stepName);
      parts.push(`Nested blacklist violation in: ${violating.join(", ")}.`);
      return parts.join("\n");
    }

    if (accuracy) {
      const details = [`${accuracy.matchedSteps}/${accuracy.totalExpectedSteps} expected steps matched`];
      if (accuracy.missingSteps.length > 0) {
        details.push(`missing: ${accuracy.missingSteps.join(", ")}`);
      }
      if (accuracy.extraSteps.length > 0) {
        details.push(`extra: ${accuracy.extraSteps.join(", ")}`);
      }
      if (accuracy.outOfOrderSteps.length > 0) {
        details.push(`out of order: ${accuracy.outOfOrderSteps.join(", ")}`);
      }
      parts.push(`Accuracy (${accuracy.score}): ${details.join(". ")}.`);
    }

    if (efficiency) {
      const details = [];
      if (efficiency.overStepBudget) {
        details.push(`over step budget (${efficiency.totalSteps} steps)`);
      }
      if (efficiency.overTokenBudget) {
        details.push(`over token budget (${efficiency.totalTokens} tokens)`);
      }
      if (efficiency.overDurationBudget) {
        details.push(`over duration budget (${efficiency.totalDurationMs}ms)`);
      }
      if (efficiency.redundantCalls.length > 0) {
        details.push(`redundant calls: ${efficiency.redundantCalls.map((c) => c.name).join(", ")}`);
      }
      if (details.length > 0) {
        parts.push(`Efficiency (${efficiency.score}): ${details.join(". ")}.`);
      } else {
        parts.push(`Efficiency (${efficiency.score}): all budgets met, no redundant calls.`);
      }
    }

    if (toolFailures && toolFailures.patterns.length > 0) {
      const details = [];
      if (toolFailures.totalRetries > 0) {
        details.push(`${toolFailures.totalRetries} total retries`);
      }
      if (toolFailures.excessiveRetryTools.length > 0) {
        details.push(`excessive retries: ${toolFailures.excessiveRetryTools.join(", ")}`);
      }
      // Patterns can exist without retry details; fall back to a count so the
      // sentence is never the malformed "Tool failures (x): .".
      if (details.length === 0) {
        details.push(`${toolFailures.patterns.length} failure pattern(s) detected`);
      }
      parts.push(`Tool failures (${toolFailures.score}): ${details.join(". ")}.`);
    }

    if (nested && nested.length > 0) {
      const nestedSummary = nested.map((r) => `${r.stepName}: ${r.score}`).join(", ");
      parts.push(`Nested scores: ${nestedSummary}.`);
    }
    return parts.join("\n");
  });
}
|
|
2998
3574
|
|
|
2999
3575
|
exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
|
|
3000
3576
|
exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
|
|
@@ -3017,5 +3593,8 @@ exports.createToneScorer = createToneScorer;
|
|
|
3017
3593
|
exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
|
|
3018
3594
|
exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
|
|
3019
3595
|
exports.createToxicityScorer = createToxicityScorer;
|
|
3596
|
+
exports.createTrajectoryAccuracyScorerCode = createTrajectoryAccuracyScorerCode;
|
|
3597
|
+
exports.createTrajectoryAccuracyScorerLLM = createTrajectoryAccuracyScorerLLM;
|
|
3598
|
+
exports.createTrajectoryScorerCode = createTrajectoryScorerCode;
|
|
3020
3599
|
//# sourceMappingURL=index.cjs.map
|
|
3021
3600
|
//# sourceMappingURL=index.cjs.map
|