@mastra/evals 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/dist/{chunk-6EA6D7JG.js → chunk-OEOE7ZHN.js} +21 -3
- package/dist/chunk-OEOE7ZHN.js.map +1 -0
- package/dist/{chunk-DSXZHUHI.cjs → chunk-W3U7MMDX.cjs} +21 -2
- package/dist/chunk-W3U7MMDX.cjs.map +1 -0
- package/dist/docs/README.md +1 -1
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/SOURCE_MAP.json +1 -1
- package/dist/docs/evals/03-reference.md +84 -10
- package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/index.d.ts +1 -0
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +19 -2
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/prebuilt/index.cjs +75 -63
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +17 -5
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +20 -16
- package/dist/scorers/utils.d.ts +39 -0
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +4 -4
- package/dist/chunk-6EA6D7JG.js.map +0 -1
- package/dist/chunk-DSXZHUHI.cjs.map +0 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunkW3U7MMDX_cjs = require('../../chunk-W3U7MMDX.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
5
|
var zod = require('zod');
|
|
6
6
|
var nlp = require('compromise');
|
|
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
|
|
|
239
239
|
description: "Extract relevant statements from the LLM output",
|
|
240
240
|
outputSchema: extractOutputSchema,
|
|
241
241
|
createPrompt: ({ run }) => {
|
|
242
|
-
const assistantMessage =
|
|
242
|
+
const assistantMessage = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
243
243
|
return createExtractPrompt(assistantMessage);
|
|
244
244
|
}
|
|
245
245
|
}).analyze({
|
|
246
246
|
description: "Score the relevance of the statements to the input",
|
|
247
247
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
248
248
|
createPrompt: ({ run, results }) => {
|
|
249
|
-
const input =
|
|
249
|
+
const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
250
250
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
251
251
|
}
|
|
252
252
|
}).generateScore(({ results }) => {
|
|
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
|
|
|
263
263
|
}
|
|
264
264
|
}
|
|
265
265
|
const score = relevancyCount / numberOfResults;
|
|
266
|
-
return
|
|
266
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * options.scale);
|
|
267
267
|
}).generateReason({
|
|
268
268
|
description: "Reason about the results",
|
|
269
269
|
createPrompt: ({ run, results, score }) => {
|
|
270
270
|
return createReasonPrompt({
|
|
271
|
-
input:
|
|
272
|
-
output:
|
|
271
|
+
input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
272
|
+
output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
273
273
|
score,
|
|
274
274
|
results: results.analyzeStepResult.results,
|
|
275
275
|
scale: options.scale
|
|
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
|
|
|
466
466
|
groundTruth: ""
|
|
467
467
|
});
|
|
468
468
|
}
|
|
469
|
-
const output =
|
|
469
|
+
const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
470
470
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
471
471
|
return createExtractPrompt2({
|
|
472
472
|
output,
|
|
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
|
|
|
524
524
|
);
|
|
525
525
|
score -= extraInfoPenalty;
|
|
526
526
|
score = Math.max(0, Math.min(1, score));
|
|
527
|
-
return
|
|
527
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * mergedOptions.scale);
|
|
528
528
|
}).generateReason({
|
|
529
529
|
description: "Generate explanation of similarity score",
|
|
530
530
|
createPrompt: ({ run, results, score }) => {
|
|
531
531
|
if (!run.groundTruth) {
|
|
532
532
|
return "No ground truth was provided for comparison. Score is 0 by default.";
|
|
533
533
|
}
|
|
534
|
-
const output =
|
|
534
|
+
const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
535
535
|
const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
|
|
536
536
|
return createReasonPrompt2({
|
|
537
537
|
output,
|
|
@@ -715,7 +715,7 @@ function createFaithfulnessScorer({
|
|
|
715
715
|
description: "Extract relevant statements from the LLM output",
|
|
716
716
|
outputSchema: zod.z.array(zod.z.string()),
|
|
717
717
|
createPrompt: ({ run }) => {
|
|
718
|
-
const prompt = createFaithfulnessExtractPrompt({ output:
|
|
718
|
+
const prompt = createFaithfulnessExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
719
719
|
return prompt;
|
|
720
720
|
}
|
|
721
721
|
}).analyze({
|
|
@@ -739,14 +739,14 @@ function createFaithfulnessScorer({
|
|
|
739
739
|
return 0;
|
|
740
740
|
}
|
|
741
741
|
const score = supportedClaims / totalClaims * (options?.scale || 1);
|
|
742
|
-
return
|
|
742
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
|
|
743
743
|
}).generateReason({
|
|
744
744
|
description: "Reason about the results",
|
|
745
745
|
createPrompt: ({ run, results, score }) => {
|
|
746
746
|
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
747
747
|
const prompt = createFaithfulnessReasonPrompt({
|
|
748
|
-
input:
|
|
749
|
-
output:
|
|
748
|
+
input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
749
|
+
output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
750
750
|
context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
|
|
751
751
|
score,
|
|
752
752
|
scale: options?.scale || 1,
|
|
@@ -879,13 +879,13 @@ function createBiasScorer({ model, options }) {
|
|
|
879
879
|
outputSchema: zod.z.object({
|
|
880
880
|
opinions: zod.z.array(zod.z.string())
|
|
881
881
|
}),
|
|
882
|
-
createPrompt: ({ run }) => createBiasExtractPrompt({ output:
|
|
882
|
+
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
883
883
|
}).analyze({
|
|
884
884
|
description: "Score the relevance of the statements to the input",
|
|
885
885
|
outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
|
|
886
886
|
createPrompt: ({ run, results }) => {
|
|
887
887
|
const prompt = createBiasAnalyzePrompt({
|
|
888
|
-
output:
|
|
888
|
+
output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
889
889
|
opinions: results.preprocessStepResult?.opinions || []
|
|
890
890
|
});
|
|
891
891
|
return prompt;
|
|
@@ -896,7 +896,7 @@ function createBiasScorer({ model, options }) {
|
|
|
896
896
|
}
|
|
897
897
|
const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
|
|
898
898
|
const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
|
|
899
|
-
return
|
|
899
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
900
900
|
}).generateReason({
|
|
901
901
|
description: "Reason about the results",
|
|
902
902
|
createPrompt: ({ score, results }) => {
|
|
@@ -1115,7 +1115,7 @@ function createHallucinationScorer({
|
|
|
1115
1115
|
claims: zod.z.array(zod.z.string())
|
|
1116
1116
|
}),
|
|
1117
1117
|
createPrompt: ({ run }) => {
|
|
1118
|
-
const prompt = createHallucinationExtractPrompt({ output:
|
|
1118
|
+
const prompt = createHallucinationExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1119
1119
|
return prompt;
|
|
1120
1120
|
}
|
|
1121
1121
|
}).analyze({
|
|
@@ -1123,10 +1123,16 @@ function createHallucinationScorer({
|
|
|
1123
1123
|
outputSchema: zod.z.object({
|
|
1124
1124
|
verdicts: zod.z.array(zod.z.object({ statement: zod.z.string(), verdict: zod.z.string(), reason: zod.z.string() }))
|
|
1125
1125
|
}),
|
|
1126
|
-
createPrompt: ({ results }) => {
|
|
1126
|
+
createPrompt: async ({ run, results }) => {
|
|
1127
|
+
let context;
|
|
1128
|
+
if (options?.getContext) {
|
|
1129
|
+
context = await options.getContext({ run, results, step: "analyze" });
|
|
1130
|
+
} else {
|
|
1131
|
+
context = options?.context ?? [];
|
|
1132
|
+
}
|
|
1127
1133
|
const prompt = createHallucinationAnalyzePrompt({
|
|
1128
1134
|
claims: results.preprocessStepResult.claims,
|
|
1129
|
-
context
|
|
1135
|
+
context
|
|
1130
1136
|
});
|
|
1131
1137
|
return prompt;
|
|
1132
1138
|
}
|
|
@@ -1137,14 +1143,20 @@ function createHallucinationScorer({
|
|
|
1137
1143
|
return 0;
|
|
1138
1144
|
}
|
|
1139
1145
|
const score = contradictedStatements / totalStatements * (options?.scale || 1);
|
|
1140
|
-
return
|
|
1146
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
|
|
1141
1147
|
}).generateReason({
|
|
1142
1148
|
description: "Reason about the results",
|
|
1143
|
-
createPrompt: ({ run, results, score }) => {
|
|
1149
|
+
createPrompt: async ({ run, results, score }) => {
|
|
1150
|
+
let context;
|
|
1151
|
+
if (options?.getContext) {
|
|
1152
|
+
context = await options.getContext({ run, results, score, step: "generateReason" });
|
|
1153
|
+
} else {
|
|
1154
|
+
context = options?.context ?? [];
|
|
1155
|
+
}
|
|
1144
1156
|
const prompt = createHallucinationReasonPrompt({
|
|
1145
|
-
input:
|
|
1146
|
-
output:
|
|
1147
|
-
context
|
|
1157
|
+
input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1158
|
+
output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
1159
|
+
context,
|
|
1148
1160
|
score,
|
|
1149
1161
|
scale: options?.scale || 1,
|
|
1150
1162
|
verdicts: results.analyzeStepResult?.verdicts || []
|
|
@@ -1257,8 +1269,8 @@ function createToxicityScorer({
|
|
|
1257
1269
|
outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
|
|
1258
1270
|
createPrompt: ({ run }) => {
|
|
1259
1271
|
const prompt = createToxicityAnalyzePrompt({
|
|
1260
|
-
input:
|
|
1261
|
-
output:
|
|
1272
|
+
input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
1273
|
+
output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
|
|
1262
1274
|
});
|
|
1263
1275
|
return prompt;
|
|
1264
1276
|
}
|
|
@@ -1274,7 +1286,7 @@ function createToxicityScorer({
|
|
|
1274
1286
|
}
|
|
1275
1287
|
}
|
|
1276
1288
|
const score = toxicityCount / numberOfVerdicts;
|
|
1277
|
-
return
|
|
1289
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
|
|
1278
1290
|
}).generateReason({
|
|
1279
1291
|
description: "Reason about the results",
|
|
1280
1292
|
createPrompt: ({ results, score }) => {
|
|
@@ -1408,7 +1420,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1408
1420
|
if (isInputInvalid || isOutputInvalid) {
|
|
1409
1421
|
throw new Error("Input and output messages cannot be null or empty");
|
|
1410
1422
|
}
|
|
1411
|
-
const { tools: actualTools, toolCallInfos } =
|
|
1423
|
+
const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
|
|
1412
1424
|
return {
|
|
1413
1425
|
actualTools,
|
|
1414
1426
|
hasToolCalls: actualTools.length > 0,
|
|
@@ -1418,8 +1430,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1418
1430
|
description: "Analyze the appropriateness of tool selections",
|
|
1419
1431
|
outputSchema: analyzeOutputSchema2,
|
|
1420
1432
|
createPrompt: ({ run, results }) => {
|
|
1421
|
-
const userInput =
|
|
1422
|
-
const agentResponse =
|
|
1433
|
+
const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1434
|
+
const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1423
1435
|
const toolsCalled = results.preprocessStepResult?.actualTools || [];
|
|
1424
1436
|
return createAnalyzePrompt2({
|
|
1425
1437
|
userInput,
|
|
@@ -1436,11 +1448,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1436
1448
|
}
|
|
1437
1449
|
const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
|
|
1438
1450
|
const totalToolCalls = evaluations.length;
|
|
1439
|
-
return
|
|
1451
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
|
|
1440
1452
|
}).generateReason({
|
|
1441
1453
|
description: "Generate human-readable explanation of tool selection evaluation",
|
|
1442
1454
|
createPrompt: ({ run, results, score }) => {
|
|
1443
|
-
const userInput =
|
|
1455
|
+
const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1444
1456
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1445
1457
|
const missingTools = results.analyzeStepResult?.missingTools || [];
|
|
1446
1458
|
return createReasonPrompt3({
|
|
@@ -1645,8 +1657,8 @@ function createContextRelevanceScorerLLM({
|
|
|
1645
1657
|
description: "Analyze the relevance and utility of provided context",
|
|
1646
1658
|
outputSchema: analyzeOutputSchema3,
|
|
1647
1659
|
createPrompt: ({ run }) => {
|
|
1648
|
-
const userQuery =
|
|
1649
|
-
const agentResponse =
|
|
1660
|
+
const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1661
|
+
const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1650
1662
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1651
1663
|
if (context.length === 0) {
|
|
1652
1664
|
return createAnalyzePrompt3({
|
|
@@ -1694,11 +1706,11 @@ function createContextRelevanceScorerLLM({
|
|
|
1694
1706
|
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1695
1707
|
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1696
1708
|
const scaledScore = finalScore * (options.scale || 1);
|
|
1697
|
-
return
|
|
1709
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(scaledScore);
|
|
1698
1710
|
}).generateReason({
|
|
1699
1711
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1700
1712
|
createPrompt: ({ run, results, score }) => {
|
|
1701
|
-
const userQuery =
|
|
1713
|
+
const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1702
1714
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1703
1715
|
if (context.length === 0) {
|
|
1704
1716
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
@@ -1869,8 +1881,8 @@ function createContextPrecisionScorer({
|
|
|
1869
1881
|
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1870
1882
|
outputSchema: contextRelevanceOutputSchema,
|
|
1871
1883
|
createPrompt: ({ run }) => {
|
|
1872
|
-
const input =
|
|
1873
|
-
const output =
|
|
1884
|
+
const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1885
|
+
const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1874
1886
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1875
1887
|
if (context.length === 0) {
|
|
1876
1888
|
throw new Error("No context available for evaluation");
|
|
@@ -1903,12 +1915,12 @@ function createContextPrecisionScorer({
|
|
|
1903
1915
|
}
|
|
1904
1916
|
const map = sumPrecision / relevantCount;
|
|
1905
1917
|
const score = map * (options.scale || 1);
|
|
1906
|
-
return
|
|
1918
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
|
|
1907
1919
|
}).generateReason({
|
|
1908
1920
|
description: "Reason about the context precision results",
|
|
1909
1921
|
createPrompt: ({ run, results, score }) => {
|
|
1910
|
-
const input =
|
|
1911
|
-
const output =
|
|
1922
|
+
const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1923
|
+
const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1912
1924
|
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1913
1925
|
return createContextPrecisionReasonPrompt({
|
|
1914
1926
|
input,
|
|
@@ -2162,8 +2174,8 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2162
2174
|
description: "Analyze the impact of noise on agent response quality",
|
|
2163
2175
|
outputSchema: analyzeOutputSchema4,
|
|
2164
2176
|
createPrompt: ({ run }) => {
|
|
2165
|
-
const originalQuery =
|
|
2166
|
-
const noisyResponse =
|
|
2177
|
+
const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2178
|
+
const noisyResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2167
2179
|
if (!originalQuery || !noisyResponse) {
|
|
2168
2180
|
throw new Error("Both original query and noisy response are required for evaluation");
|
|
2169
2181
|
}
|
|
@@ -2206,11 +2218,11 @@ function createNoiseSensitivityScorerLLM({
|
|
|
2206
2218
|
const majorIssues = analysisResult.majorIssues || [];
|
|
2207
2219
|
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
2208
2220
|
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
2209
|
-
return
|
|
2221
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
|
|
2210
2222
|
}).generateReason({
|
|
2211
2223
|
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
2212
2224
|
createPrompt: ({ run, results, score }) => {
|
|
2213
|
-
const originalQuery =
|
|
2225
|
+
const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2214
2226
|
const analysisResult = results.analyzeStepResult;
|
|
2215
2227
|
if (!analysisResult) {
|
|
2216
2228
|
throw new Error("Analysis step failed to produce results for reason generation");
|
|
@@ -2534,9 +2546,9 @@ function createPromptAlignmentScorerLLM({
|
|
|
2534
2546
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
2535
2547
|
outputSchema: analyzeOutputSchema5,
|
|
2536
2548
|
createPrompt: ({ run }) => {
|
|
2537
|
-
const userPrompt =
|
|
2538
|
-
const systemPrompt =
|
|
2539
|
-
const agentResponse =
|
|
2549
|
+
const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2550
|
+
const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2551
|
+
const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2540
2552
|
if (evaluationMode === "user" && !userPrompt) {
|
|
2541
2553
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
2542
2554
|
}
|
|
@@ -2572,12 +2584,12 @@ function createPromptAlignmentScorerLLM({
|
|
|
2572
2584
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
2573
2585
|
}
|
|
2574
2586
|
const finalScore = weightedScore * scale;
|
|
2575
|
-
return
|
|
2587
|
+
return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
|
|
2576
2588
|
}).generateReason({
|
|
2577
2589
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
2578
2590
|
createPrompt: ({ run, results, score }) => {
|
|
2579
|
-
const userPrompt =
|
|
2580
|
-
const systemPrompt =
|
|
2591
|
+
const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2592
|
+
const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2581
2593
|
const analysis = results.analyzeStepResult;
|
|
2582
2594
|
if (!analysis) {
|
|
2583
2595
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
@@ -2642,18 +2654,18 @@ function createCompletenessScorer() {
|
|
|
2642
2654
|
type: "agent"
|
|
2643
2655
|
}).preprocess(async ({ run }) => {
|
|
2644
2656
|
const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
|
|
2645
|
-
const content =
|
|
2657
|
+
const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
|
|
2646
2658
|
return content === null || content === void 0;
|
|
2647
2659
|
});
|
|
2648
2660
|
const isOutputInvalid = !run.output || run.output.some((i) => {
|
|
2649
|
-
const content =
|
|
2661
|
+
const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
|
|
2650
2662
|
return content === null || content === void 0;
|
|
2651
2663
|
});
|
|
2652
2664
|
if (isInputInvalid || isOutputInvalid) {
|
|
2653
2665
|
throw new Error("Inputs cannot be null or undefined");
|
|
2654
2666
|
}
|
|
2655
|
-
const input = run.input?.inputMessages.map((i) =>
|
|
2656
|
-
const output = run.output?.map((i) =>
|
|
2667
|
+
const input = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2668
|
+
const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2657
2669
|
const inputToProcess = input;
|
|
2658
2670
|
const outputToProcess = output;
|
|
2659
2671
|
const inputDoc = nlp__default.default(inputToProcess.trim());
|
|
@@ -2758,8 +2770,8 @@ function createTextualDifferenceScorer() {
|
|
|
2758
2770
|
description: "Calculate textual difference between input and output using sequence matching algorithms.",
|
|
2759
2771
|
type: "agent"
|
|
2760
2772
|
}).preprocess(async ({ run }) => {
|
|
2761
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
2762
|
-
const output = run.output?.map((i) =>
|
|
2773
|
+
const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2774
|
+
const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2763
2775
|
const ratio = calculateRatio(input, output);
|
|
2764
2776
|
const changes = countChanges(input, output);
|
|
2765
2777
|
const maxLength = Math.max(input.length, output.length);
|
|
@@ -2782,8 +2794,8 @@ function createKeywordCoverageScorer() {
|
|
|
2782
2794
|
description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
|
|
2783
2795
|
type: "agent"
|
|
2784
2796
|
}).preprocess(async ({ run }) => {
|
|
2785
|
-
const input = run.input?.inputMessages?.map((i) =>
|
|
2786
|
-
const output = run.output?.map((i) =>
|
|
2797
|
+
const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2798
|
+
const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2787
2799
|
if (!input && !output) {
|
|
2788
2800
|
return {
|
|
2789
2801
|
result: {
|
|
@@ -2836,8 +2848,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
|
|
|
2836
2848
|
description: "Calculates content similarity between input and output messages using string comparison algorithms.",
|
|
2837
2849
|
type: "agent"
|
|
2838
2850
|
}).preprocess(async ({ run }) => {
|
|
2839
|
-
let processedInput = run.input?.inputMessages.map((i) =>
|
|
2840
|
-
let processedOutput = run.output.map((i) =>
|
|
2851
|
+
let processedInput = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2852
|
+
let processedOutput = run.output.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2841
2853
|
if (ignoreCase) {
|
|
2842
2854
|
processedInput = processedInput.toLowerCase();
|
|
2843
2855
|
processedOutput = processedOutput.toLowerCase();
|
|
@@ -2867,7 +2879,7 @@ function createToneScorer(config = {}) {
|
|
|
2867
2879
|
type: "agent"
|
|
2868
2880
|
}).preprocess(async ({ run }) => {
|
|
2869
2881
|
const sentiment = new Sentiment__default.default();
|
|
2870
|
-
const agentMessage = run.output?.map((i) =>
|
|
2882
|
+
const agentMessage = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
|
|
2871
2883
|
const responseSentiment = sentiment.analyze(agentMessage);
|
|
2872
2884
|
if (referenceTone) {
|
|
2873
2885
|
const referenceSentiment = sentiment.analyze(referenceTone);
|
|
@@ -2954,7 +2966,7 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
2954
2966
|
if (isInputInvalid || isOutputInvalid) {
|
|
2955
2967
|
throw new Error("Input and output messages cannot be null or empty");
|
|
2956
2968
|
}
|
|
2957
|
-
const { tools: actualTools, toolCallInfos } =
|
|
2969
|
+
const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
|
|
2958
2970
|
const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
|
|
2959
2971
|
return {
|
|
2960
2972
|
expectedTool,
|