@mastra/evals 1.0.0-beta.2 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/dist/{chunk-CKKVCGRB.js → chunk-6EA6D7JG.js} +2 -2
- package/dist/chunk-6EA6D7JG.js.map +1 -0
- package/dist/{chunk-AT7HXT3U.cjs → chunk-DSXZHUHI.cjs} +2 -2
- package/dist/chunk-DSXZHUHI.cjs.map +1 -0
- package/dist/docs/README.md +31 -0
- package/dist/docs/SKILL.md +32 -0
- package/dist/docs/SOURCE_MAP.json +6 -0
- package/dist/docs/evals/01-overview.md +130 -0
- package/dist/docs/evals/02-built-in-scorers.md +49 -0
- package/dist/docs/evals/03-reference.md +4018 -0
- package/dist/index.cjs +4 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +59 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +1 -1
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +16 -16
- package/dist/scorers/utils.d.ts +1 -2
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +8 -10
- package/dist/chunk-AT7HXT3U.cjs.map +0 -1
- package/dist/chunk-CKKVCGRB.js.map +0 -1
package/dist/index.cjs
ADDED

package/dist/index.cjs.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"sources":[],"names":[],"mappings":"","file":"index.cjs"}
package/dist/index.d.ts
ADDED
@@ -0,0 +1,12 @@
+/**
+ * @mastra/evals - Evaluation framework for AI agents
+ *
+ * This package uses subpath exports. Import from specific paths:
+ *
+ * @example
+ * ```ts
+ * import { createToolCallAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt';
+ * import { getUserMessageFromRunInput } from '@mastra/evals/scorers/utils';
+ * */
+export {};
+//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;KASK"}
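The new index.d.ts above is documentation-only: the root entry exports nothing at runtime and points consumers at the subpath entry points. A minimal usage sketch of that layout follows; the import paths are copied from the @example comment, while the options passed to createToolCallAccuracyScorerCode (expectedTool, strictMode) are only inferred from the hunks later in this diff, so the exact options shape and the tool id are assumptions.

```ts
// Minimal sketch of the subpath-export layout documented in the new index.d.ts.
// Import paths come from the @example comment above; the options object is an
// assumption based on the option names (expectedTool, strictMode) visible in
// the createToolCallAccuracyScorerCode hunks further down in this diff.
import { createToolCallAccuracyScorerCode } from '@mastra/evals/scorers/prebuilt';
// Imported here only to mirror the d.ts example.
import { getUserMessageFromRunInput } from '@mastra/evals/scorers/utils';

const scorer = createToolCallAccuracyScorerCode({
  expectedTool: 'weather-tool', // hypothetical tool id
  strictMode: false, // when true, the expected tool must be the only tool called
});
```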
package/dist/index.js
ADDED

package/dist/index.js.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"sources":[],"names":[],"mappings":"","file":"index.js"}
package/dist/scorers/prebuilt/index.cjs

@@ -1,6 +1,6 @@
 'use strict';

-var chunkAT7HXT3U_cjs = require('../../chunk-AT7HXT3U.cjs');
+var chunkDSXZHUHI_cjs = require('../../chunk-DSXZHUHI.cjs');
 var evals = require('@mastra/core/evals');
 var zod = require('zod');
 var nlp = require('compromise');
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
 description: "Extract relevant statements from the LLM output",
 outputSchema: extractOutputSchema,
 createPrompt: ({ run }) => {
-const assistantMessage = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const assistantMessage = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 return createExtractPrompt(assistantMessage);
 }
 }).analyze({
 description: "Score the relevance of the statements to the input",
 outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run, results }) => {
-const input = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const input = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
 return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
 }
 }).generateScore(({ results }) => {
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
 }
 }
 const score = relevancyCount / numberOfResults;
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score * options.scale);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score * options.scale);
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 return createReasonPrompt({
-input: chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "",
-output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+input: chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 score,
 results: results.analyzeStepResult.results,
 scale: options.scale
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
 groundTruth: ""
 });
 }
-const output = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const output = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
 return createExtractPrompt2({
 output,
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
 );
 score -= extraInfoPenalty;
 score = Math.max(0, Math.min(1, score));
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score * mergedOptions.scale);
 }).generateReason({
 description: "Generate explanation of similarity score",
 createPrompt: ({ run, results, score }) => {
 if (!run.groundTruth) {
 return "No ground truth was provided for comparison. Score is 0 by default.";
 }
-const output = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const output = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
 return createReasonPrompt2({
 output,
@@ -715,7 +715,7 @@ function createFaithfulnessScorer({
 description: "Extract relevant statements from the LLM output",
 outputSchema: zod.z.array(zod.z.string()),
 createPrompt: ({ run }) => {
-const prompt = createFaithfulnessExtractPrompt({ output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+const prompt = createFaithfulnessExtractPrompt({ output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
 return prompt;
 }
 }).analyze({
@@ -739,14 +739,14 @@ function createFaithfulnessScorer({
 return 0;
 }
 const score = supportedClaims / totalClaims * (options?.scale || 1);
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score);
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 const assistantMessage = run.output.find(({ role }) => role === "assistant");
 const prompt = createFaithfulnessReasonPrompt({
-input: chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "",
-output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+input: chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
 score,
 scale: options?.scale || 1,
@@ -879,13 +879,13 @@ function createBiasScorer({ model, options }) {
 outputSchema: zod.z.object({
 opinions: zod.z.array(zod.z.string())
 }),
-createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
+createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
 }).analyze({
 description: "Score the relevance of the statements to the input",
 outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run, results }) => {
 const prompt = createBiasAnalyzePrompt({
-output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 opinions: results.preprocessStepResult?.opinions || []
 });
 return prompt;
@@ -896,7 +896,7 @@ function createBiasScorer({ model, options }) {
 }
 const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
 const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score * (options?.scale || 1));
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ score, results }) => {
@@ -1115,7 +1115,7 @@ function createHallucinationScorer({
 claims: zod.z.array(zod.z.string())
 }),
 createPrompt: ({ run }) => {
-const prompt = createHallucinationExtractPrompt({ output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+const prompt = createHallucinationExtractPrompt({ output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
 return prompt;
 }
 }).analyze({
@@ -1137,13 +1137,13 @@ function createHallucinationScorer({
 return 0;
 }
 const score = contradictedStatements / totalStatements * (options?.scale || 1);
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score);
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 const prompt = createHallucinationReasonPrompt({
-input: chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "",
-output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+input: chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 context: options?.context || [],
 score,
 scale: options?.scale || 1,
@@ -1257,8 +1257,8 @@ function createToxicityScorer({
 outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run }) => {
 const prompt = createToxicityAnalyzePrompt({
-input: chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "",
-output: chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
+input: chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
 });
 return prompt;
 }
@@ -1274,7 +1274,7 @@ function createToxicityScorer({
 }
 }
 const score = toxicityCount / numberOfVerdicts;
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score * (options?.scale || 1));
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ results, score }) => {
@@ -1408,7 +1408,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 if (isInputInvalid || isOutputInvalid) {
 throw new Error("Input and output messages cannot be null or empty");
 }
-const { tools: actualTools, toolCallInfos } = chunkAT7HXT3U_cjs.extractToolCalls(run.output);
+const { tools: actualTools, toolCallInfos } = chunkDSXZHUHI_cjs.extractToolCalls(run.output);
 return {
 actualTools,
 hasToolCalls: actualTools.length > 0,
@@ -1418,8 +1418,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 description: "Analyze the appropriateness of tool selections",
 outputSchema: analyzeOutputSchema2,
 createPrompt: ({ run, results }) => {
-const userInput = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const agentResponse = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const userInput = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const agentResponse = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const toolsCalled = results.preprocessStepResult?.actualTools || [];
 return createAnalyzePrompt2({
 userInput,
@@ -1436,11 +1436,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 }
 const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
 const totalToolCalls = evaluations.length;
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
 }).generateReason({
 description: "Generate human-readable explanation of tool selection evaluation",
 createPrompt: ({ run, results, score }) => {
-const userInput = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const userInput = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingTools = results.analyzeStepResult?.missingTools || [];
 return createReasonPrompt3({
@@ -1645,8 +1645,8 @@ function createContextRelevanceScorerLLM({
 description: "Analyze the relevance and utility of provided context",
 outputSchema: analyzeOutputSchema3,
 createPrompt: ({ run }) => {
-const userQuery = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const agentResponse = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const userQuery = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const agentResponse = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 if (context.length === 0) {
 return createAnalyzePrompt3({
@@ -1694,11 +1694,11 @@ function createContextRelevanceScorerLLM({
 const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
 const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
 const scaledScore = finalScore * (options.scale || 1);
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(scaledScore);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(scaledScore);
 }).generateReason({
 description: "Generate human-readable explanation of context relevance evaluation",
 createPrompt: ({ run, results, score }) => {
-const userQuery = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const userQuery = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 if (context.length === 0) {
 return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -1869,8 +1869,8 @@ function createContextPrecisionScorer({
 description: "Evaluate the relevance of each context piece for generating the expected output",
 outputSchema: contextRelevanceOutputSchema,
 createPrompt: ({ run }) => {
-const input = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const output = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const input = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const output = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 if (context.length === 0) {
 throw new Error("No context available for evaluation");
@@ -1903,12 +1903,12 @@ function createContextPrecisionScorer({
 }
 const map = sumPrecision / relevantCount;
 const score = map * (options.scale || 1);
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(score);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(score);
 }).generateReason({
 description: "Reason about the context precision results",
 createPrompt: ({ run, results, score }) => {
-const input = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const output = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const input = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const output = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 return createContextPrecisionReasonPrompt({
 input,
@@ -2162,8 +2162,8 @@ function createNoiseSensitivityScorerLLM({
 description: "Analyze the impact of noise on agent response quality",
 outputSchema: analyzeOutputSchema4,
 createPrompt: ({ run }) => {
-const originalQuery = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const noisyResponse = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const originalQuery = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const noisyResponse = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 if (!originalQuery || !noisyResponse) {
 throw new Error("Both original query and noisy response are required for evaluation");
 }
@@ -2206,11 +2206,11 @@ function createNoiseSensitivityScorerLLM({
 const majorIssues = analysisResult.majorIssues || [];
 const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
 finalScore = Math.max(0, finalScore - issuesPenalty);
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(finalScore);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(finalScore);
 }).generateReason({
 description: "Generate human-readable explanation of noise sensitivity evaluation",
 createPrompt: ({ run, results, score }) => {
-const originalQuery = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const originalQuery = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
 const analysisResult = results.analyzeStepResult;
 if (!analysisResult) {
 throw new Error("Analysis step failed to produce results for reason generation");
@@ -2534,9 +2534,9 @@ function createPromptAlignmentScorerLLM({
 description: "Analyze prompt-response alignment across multiple dimensions",
 outputSchema: analyzeOutputSchema5,
 createPrompt: ({ run }) => {
-const userPrompt = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const systemPrompt = chunkAT7HXT3U_cjs.getCombinedSystemPrompt(run.input) ?? "";
-const agentResponse = chunkAT7HXT3U_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+const userPrompt = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const systemPrompt = chunkDSXZHUHI_cjs.getCombinedSystemPrompt(run.input) ?? "";
+const agentResponse = chunkDSXZHUHI_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 if (evaluationMode === "user" && !userPrompt) {
 throw new Error("User prompt is required for user prompt alignment scoring");
 }
@@ -2572,12 +2572,12 @@ function createPromptAlignmentScorerLLM({
 weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
 }
 const finalScore = weightedScore * scale;
-return chunkAT7HXT3U_cjs.roundToTwoDecimals(finalScore);
+return chunkDSXZHUHI_cjs.roundToTwoDecimals(finalScore);
 }).generateReason({
 description: "Generate human-readable explanation of prompt alignment evaluation",
 createPrompt: ({ run, results, score }) => {
-const userPrompt = chunkAT7HXT3U_cjs.getUserMessageFromRunInput(run.input) ?? "";
-const systemPrompt = chunkAT7HXT3U_cjs.getCombinedSystemPrompt(run.input) ?? "";
+const userPrompt = chunkDSXZHUHI_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const systemPrompt = chunkDSXZHUHI_cjs.getCombinedSystemPrompt(run.input) ?? "";
 const analysis = results.analyzeStepResult;
 if (!analysis) {
 return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2642,18 +2642,18 @@ function createCompletenessScorer() {
 type: "agent"
 }).preprocess(async ({ run }) => {
 const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
-const content = chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i);
+const content = chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i);
 return content === null || content === void 0;
 });
 const isOutputInvalid = !run.output || run.output.some((i) => {
-const content = chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i);
+const content = chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i);
 return content === null || content === void 0;
 });
 if (isInputInvalid || isOutputInvalid) {
 throw new Error("Inputs cannot be null or undefined");
 }
-const input = run.input?.inputMessages.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-const output = run.output?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const input = run.input?.inputMessages.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const output = run.output?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
 const inputToProcess = input;
 const outputToProcess = output;
 const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -2758,8 +2758,8 @@ function createTextualDifferenceScorer() {
 description: "Calculate textual difference between input and output using sequence matching algorithms.",
 type: "agent"
 }).preprocess(async ({ run }) => {
-const input = run.input?.inputMessages?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-const output = run.output?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const input = run.input?.inputMessages?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const output = run.output?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
 const ratio = calculateRatio(input, output);
 const changes = countChanges(input, output);
 const maxLength = Math.max(input.length, output.length);
@@ -2782,8 +2782,8 @@ function createKeywordCoverageScorer() {
 description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
 type: "agent"
 }).preprocess(async ({ run }) => {
-const input = run.input?.inputMessages?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-const output = run.output?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const input = run.input?.inputMessages?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const output = run.output?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
 if (!input && !output) {
 return {
 result: {
@@ -2836,8 +2836,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
 description: "Calculates content similarity between input and output messages using string comparison algorithms.",
 type: "agent"
 }).preprocess(async ({ run }) => {
-let processedInput = run.input?.inputMessages.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-let processedOutput = run.output.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+let processedInput = run.input?.inputMessages.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+let processedOutput = run.output.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
 if (ignoreCase) {
 processedInput = processedInput.toLowerCase();
 processedOutput = processedOutput.toLowerCase();
@@ -2867,7 +2867,7 @@ function createToneScorer(config = {}) {
 type: "agent"
 }).preprocess(async ({ run }) => {
 const sentiment = new Sentiment__default.default();
-const agentMessage = run.output?.map((i) => chunkAT7HXT3U_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+const agentMessage = run.output?.map((i) => chunkDSXZHUHI_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
 const responseSentiment = sentiment.analyze(agentMessage);
 if (referenceTone) {
 const referenceSentiment = sentiment.analyze(referenceTone);
@@ -2954,7 +2954,7 @@ function createToolCallAccuracyScorerCode(options) {
 if (isInputInvalid || isOutputInvalid) {
 throw new Error("Input and output messages cannot be null or empty");
 }
-const { tools: actualTools, toolCallInfos } = chunkAT7HXT3U_cjs.extractToolCalls(run.output);
+const { tools: actualTools, toolCallInfos } = chunkDSXZHUHI_cjs.extractToolCalls(run.output);
 const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
 return {
 expectedTool,