@mastra/evals 0.13.2 → 0.13.3-alpha.0
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/dist/{chunk-5CVZXIFW.js → chunk-4LRZVFXR.js} +32 -3
- package/dist/chunk-4LRZVFXR.js.map +1 -0
- package/dist/{chunk-QVZBKGOE.cjs → chunk-EKSPLMYP.cjs} +32 -2
- package/dist/chunk-EKSPLMYP.cjs.map +1 -0
- package/dist/{dist-JVIEAZJ6.js → dist-CI72CYZJ.js} +10 -10
- package/dist/{dist-JVIEAZJ6.js.map → dist-CI72CYZJ.js.map} +1 -1
- package/dist/{dist-JQCAD3AD.cjs → dist-IKJJ2AX4.cjs} +10 -10
- package/dist/{dist-JQCAD3AD.cjs.map → dist-IKJJ2AX4.cjs.map} +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/{magic-string.es-NBXOXRCK.cjs → magic-string.es-VZN2EYER.cjs} +3 -3
- package/dist/{magic-string.es-NBXOXRCK.cjs.map → magic-string.es-VZN2EYER.cjs.map} +1 -1
- package/dist/{magic-string.es-6JSI7KY4.js → magic-string.es-WQRLTQPQ.js} +3 -3
- package/dist/{magic-string.es-6JSI7KY4.js.map → magic-string.es-WQRLTQPQ.js.map} +1 -1
- package/dist/scorers/code/index.cjs +2 -2
- package/dist/scorers/code/index.js +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +18 -0
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -0
- package/dist/scorers/llm/context-precision/prompts.d.ts +19 -0
- package/dist/scorers/llm/context-precision/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/context-relevance/index.d.ts +27 -0
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -0
- package/dist/scorers/llm/context-relevance/prompts.d.ts +20 -0
- package/dist/scorers/llm/context-relevance/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +1163 -25
- package/dist/scorers/llm/index.cjs.map +1 -1
- package/dist/scorers/llm/index.d.ts +4 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/index.js +1137 -3
- package/dist/scorers/llm/index.js.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +36 -0
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -0
- package/dist/scorers/llm/noise-sensitivity/prompts.d.ts +21 -0
- package/dist/scorers/llm/noise-sensitivity/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/prompt-alignment/index.d.ts +38 -0
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -0
- package/dist/scorers/llm/prompt-alignment/prompts.d.ts +44 -0
- package/dist/scorers/llm/prompt-alignment/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -4
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/utils.d.ts +2 -0
- package/dist/scorers/utils.d.ts.map +1 -1
- package/package.json +3 -3
- package/dist/chunk-5CVZXIFW.js.map +0 -1
- package/dist/chunk-QVZBKGOE.cjs.map +0 -1
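This alpha release adds four new LLM judge scorers to dist/scorers/llm (context precision, context relevance, noise sensitivity, and prompt alignment) alongside the existing scorers, which now read messages through shared chunk helpers. Below is a minimal usage sketch for one of the new scorers; the import path and the AI SDK judge model are assumptions based on how the existing scorers in this package are typically used, not something stated in this diff.

```ts
// Hypothetical usage sketch: assumes @mastra/evals exposes the new scorer from its
// scorers/llm entry point and accepts an AI SDK model as the judge, like existing scorers.
import { openai } from "@ai-sdk/openai";
import { createContextPrecisionScorer } from "@mastra/evals/scorers/llm";

const contextPrecision = createContextPrecisionScorer({
  model: openai("gpt-4o-mini"), // judge model (assumption)
  options: {
    // Per the added code below, either `context` or a `contextExtractor(input, output)`
    // callback is required, and an empty `context` array throws.
    context: [
      "Exercise strengthens the heart and improves blood circulation.",
      "Regular physical activity reduces stress and anxiety.",
    ],
    scale: 1, // optional; scores are rounded to two decimals
  },
});
```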
--- 0.13.2/package/dist/scorers/llm/index.cjs
+++ 0.13.3-alpha.0/package/dist/scorers/llm/index.cjs
@@ -1,7 +1,7 @@
 'use strict';
 
 var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
-var
+var chunkEKSPLMYP_cjs = require('../../chunk-EKSPLMYP.cjs');
 var scores = require('@mastra/core/scores');
 var zod = require('zod');
 
@@ -227,14 +227,14 @@ function createAnswerRelevancyScorer({
 description: "Extract relevant statements from the LLM output",
 outputSchema: extractOutputSchema,
 createPrompt: ({ run }) => {
-const assistantMessage =
+const assistantMessage = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 return createExtractPrompt(assistantMessage);
 }
 }).analyze({
 description: "Score the relevance of the statements to the input",
 outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run, results }) => {
-const input =
+const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
 return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
 }
 }).generateScore(({ results }) => {
@@ -256,8 +256,8 @@ function createAnswerRelevancyScorer({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 return createReasonPrompt({
-input:
-output:
+input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 score,
 results: results.analyzeStepResult.results,
 scale: options.scale
@@ -435,7 +435,7 @@ function createFaithfulnessScorer({
 description: "Extract relevant statements from the LLM output",
 outputSchema: zod.z.array(zod.z.string()),
 createPrompt: ({ run }) => {
-const prompt = createFaithfulnessExtractPrompt({ output:
+const prompt = createFaithfulnessExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
 return prompt;
 }
 }).analyze({
@@ -456,13 +456,13 @@ function createFaithfulnessScorer({
 return 0;
 }
 const score = supportedClaims / totalClaims * (options?.scale || 1);
-return
+return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 const prompt = createFaithfulnessReasonPrompt({
-input:
-output:
+input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
 score,
 scale: options?.scale || 1,
@@ -593,13 +593,13 @@ function createBiasScorer({ model, options }) {
 outputSchema: zod.z.object({
 opinions: zod.z.array(zod.z.string())
 }),
-createPrompt: ({ run }) => createBiasExtractPrompt({ output:
+createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
 }).analyze({
 description: "Score the relevance of the statements to the input",
 outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run, results }) => {
 const prompt = createBiasAnalyzePrompt({
-output:
+output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 opinions: results.preprocessStepResult?.opinions || []
 });
 return prompt;
@@ -610,7 +610,7 @@ function createBiasScorer({ model, options }) {
 }
 const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
 const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-return
+return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ score, results }) => {
@@ -827,7 +827,7 @@ function createHallucinationScorer({
 claims: zod.z.array(zod.z.string())
 }),
 createPrompt: ({ run }) => {
-const prompt = createHallucinationExtractPrompt({ output:
+const prompt = createHallucinationExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
 return prompt;
 }
 }).analyze({
@@ -849,13 +849,13 @@ function createHallucinationScorer({
 return 0;
 }
 const score = contradictedStatements / totalStatements * (options?.scale || 1);
-return
+return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ run, results, score }) => {
 const prompt = createHallucinationReasonPrompt({
-input:
-output:
+input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
 context: options?.context || [],
 score,
 scale: options?.scale || 1,
@@ -964,8 +964,8 @@ function createToxicityScorer({ model, options }) {
 outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
 createPrompt: ({ run }) => {
 const prompt = createToxicityAnalyzePrompt({
-input:
-output:
+input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
+output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
 });
 return prompt;
 }
@@ -981,7 +981,7 @@ function createToxicityScorer({ model, options }) {
 }
 }
 const score = toxicityCount / numberOfVerdicts;
-return
+return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
 }).generateReason({
 description: "Reason about the results",
 createPrompt: ({ results, score }) => {
@@ -1099,7 +1099,7 @@ var analyzeOutputSchema = zod.z.object({
 missingTools: zod.z.array(zod.z.string()).optional()
 });
 function createToolCallAccuracyScorerLLM({ model, availableTools }) {
-const toolDefinitions = availableTools.map((tool) => `${tool.
+const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
 return scores.createScorer({
 name: "Tool Call Accuracy (LLM)",
 description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
@@ -1113,7 +1113,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 if (isInputInvalid || isOutputInvalid) {
 throw new Error("Input and output messages cannot be null or empty");
 }
-const { tools: actualTools, toolCallInfos } =
+const { tools: actualTools, toolCallInfos } = chunkEKSPLMYP_cjs.extractToolCalls(run.output);
 return {
 actualTools,
 hasToolCalls: actualTools.length > 0,
@@ -1123,8 +1123,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 description: "Analyze the appropriateness of tool selections",
 outputSchema: analyzeOutputSchema,
 createPrompt: ({ run, results }) => {
-const userInput =
-const agentResponse =
+const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
 const toolsCalled = results.preprocessStepResult?.actualTools || [];
 return createAnalyzePrompt({
 userInput,
@@ -1141,11 +1141,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 }
 const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
 const totalToolCalls = evaluations.length;
-return
+return chunkEKSPLMYP_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
 }).generateReason({
 description: "Generate human-readable explanation of tool selection evaluation",
 createPrompt: ({ run, results, score }) => {
-const userInput =
+const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingTools = results.analyzeStepResult?.missingTools || [];
 return createReasonPrompt2({
@@ -1158,12 +1158,1150 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1158
1158
|
});
|
|
1159
1159
|
}
|
|
1160
1160
|
|
|
1161
|
+
// src/scorers/llm/context-relevance/prompts.ts
|
|
1162
|
+
var CONTEXT_RELEVANCE_INSTRUCTIONS = `You are an expert context relevance evaluator. Your job is to analyze whether the provided context information was appropriate and useful for generating the agent's response to the user's query.
|
|
1163
|
+
|
|
1164
|
+
Key Evaluation Criteria:
|
|
1165
|
+
1. **Relevance**: Does the context directly relate to the user's query?
|
|
1166
|
+
2. **Utility**: Did the context help produce a better response?
|
|
1167
|
+
3. **Completeness**: Was the context sufficient for the task?
|
|
1168
|
+
4. **Quality**: Is the context accurate and trustworthy?
|
|
1169
|
+
|
|
1170
|
+
Evaluation Guidelines:
|
|
1171
|
+
- Context that directly answers or supports the user's query should be marked as highly relevant
|
|
1172
|
+
- Context that provides background information relevant to the query should be considered moderately relevant
|
|
1173
|
+
- Context that is tangentially related but doesn't directly help should be marked as low relevance
|
|
1174
|
+
- Context that is completely unrelated should be marked as irrelevant
|
|
1175
|
+
- Consider whether missing context might have led to a better response
|
|
1176
|
+
|
|
1177
|
+
Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
|
|
1178
|
+
function createAnalyzePrompt2({
|
|
1179
|
+
userQuery,
|
|
1180
|
+
agentResponse,
|
|
1181
|
+
providedContext
|
|
1182
|
+
}) {
|
|
1183
|
+
const contextList = providedContext.map((ctx, index) => `[${index}] ${ctx}`).join("\n");
|
|
1184
|
+
return `Analyze the relevance of the provided context for answering the user's query and generating the agent's response.
|
|
1185
|
+
|
|
1186
|
+
User Query:
|
|
1187
|
+
${userQuery}
|
|
1188
|
+
|
|
1189
|
+
Agent Response:
|
|
1190
|
+
${agentResponse}
|
|
1191
|
+
|
|
1192
|
+
Context pieces to evaluate:
|
|
1193
|
+
${contextList}
|
|
1194
|
+
|
|
1195
|
+
For each context piece, evaluate:
|
|
1196
|
+
1. **Relevance Level**: How relevant is it to the user's query?
|
|
1197
|
+
- "high": Directly addresses the query or provides essential information
|
|
1198
|
+
- "medium": Provides supporting or background information that's helpful
|
|
1199
|
+
- "low": Tangentially related but not very helpful
|
|
1200
|
+
- "none": Completely irrelevant or unrelated
|
|
1201
|
+
|
|
1202
|
+
2. **Usage**: Was this context actually used in generating the agent's response?
|
|
1203
|
+
- true: The response clearly incorporates or reflects this information
|
|
1204
|
+
- false: This information doesn't appear to be used in the response
|
|
1205
|
+
|
|
1206
|
+
3. **Reasoning**: Explain your assessment in detail
|
|
1207
|
+
|
|
1208
|
+
Also identify any missing context that should have been provided to better answer the query.
|
|
1209
|
+
|
|
1210
|
+
Format your response as:
|
|
1211
|
+
{
|
|
1212
|
+
"evaluations": [
|
|
1213
|
+
{
|
|
1214
|
+
"context_index": 0,
|
|
1215
|
+
"contextPiece": "the actual text of the context piece",
|
|
1216
|
+
"relevanceLevel": "high/medium/low/none",
|
|
1217
|
+
"wasUsed": true/false,
|
|
1218
|
+
"reasoning": "detailed explanation of the evaluation"
|
|
1219
|
+
}
|
|
1220
|
+
],
|
|
1221
|
+
"missingContext": ["list of missing information that would have been helpful"],
|
|
1222
|
+
"overallAssessment": "summary of the context quality and usage"
|
|
1223
|
+
}
|
|
1224
|
+
|
|
1225
|
+
The number of evaluations MUST match the number of context pieces exactly.
|
|
1226
|
+
|
|
1227
|
+
Example:
|
|
1228
|
+
User Query: "What are the benefits of exercise?"
|
|
1229
|
+
Agent Response: "Regular exercise improves cardiovascular health and mental wellbeing."
|
|
1230
|
+
Context:
|
|
1231
|
+
[0] "Exercise strengthens the heart and improves blood circulation."
|
|
1232
|
+
[1] "A balanced diet is important for overall health."
|
|
1233
|
+
[2] "Regular physical activity reduces stress and anxiety levels."
|
|
1234
|
+
|
|
1235
|
+
{
|
|
1236
|
+
"evaluations": [
|
|
1237
|
+
{
|
|
1238
|
+
"context_index": 0,
|
|
1239
|
+
"contextPiece": "Exercise strengthens the heart and improves blood circulation.",
|
|
1240
|
+
"relevanceLevel": "high",
|
|
1241
|
+
"wasUsed": true,
|
|
1242
|
+
"reasoning": "This context directly supports the cardiovascular health benefit mentioned in the response"
|
|
1243
|
+
},
|
|
1244
|
+
{
|
|
1245
|
+
"context_index": 1,
|
|
1246
|
+
"contextPiece": "A balanced diet is important for overall health.",
|
|
1247
|
+
"relevanceLevel": "none",
|
|
1248
|
+
"wasUsed": false,
|
|
1249
|
+
"reasoning": "This context is about diet, not exercise benefits, and doesn't contribute to answering the query"
|
|
1250
|
+
},
|
|
1251
|
+
{
|
|
1252
|
+
"context_index": 2,
|
|
1253
|
+
"contextPiece": "Regular physical activity reduces stress and anxiety levels.",
|
|
1254
|
+
"relevanceLevel": "high",
|
|
1255
|
+
"wasUsed": true,
|
|
1256
|
+
"reasoning": "This context directly supports the mental wellbeing benefit mentioned in the response"
|
|
1257
|
+
}
|
|
1258
|
+
],
|
|
1259
|
+
"missingContext": [],
|
|
1260
|
+
"overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
|
|
1261
|
+
}`;
|
|
1262
|
+
}
|
|
1263
|
+
function createReasonPrompt3({
|
|
1264
|
+
userQuery,
|
|
1265
|
+
score,
|
|
1266
|
+
evaluations,
|
|
1267
|
+
missingContext,
|
|
1268
|
+
scale
|
|
1269
|
+
}) {
|
|
1270
|
+
return `Explain the context relevance score for the provided context based on its relevance and usage in generating the agent's response.
|
|
1271
|
+
|
|
1272
|
+
User Query:
|
|
1273
|
+
${userQuery}
|
|
1274
|
+
|
|
1275
|
+
Score: ${score} out of ${scale}
|
|
1276
|
+
|
|
1277
|
+
Context Evaluations:
|
|
1278
|
+
${evaluations.map(
|
|
1279
|
+
(evaluation) => `[${evaluation.context_index}] Relevance: ${evaluation.relevanceLevel}, Used: ${evaluation.wasUsed ? "Yes" : "No"}
|
|
1280
|
+
Context: "${evaluation.contextPiece}"
|
|
1281
|
+
Reasoning: ${evaluation.reasoning}`
|
|
1282
|
+
).join("\n\n")}
|
|
1283
|
+
|
|
1284
|
+
${missingContext.length > 0 ? `
|
|
1285
|
+
Missing Context Issues:
|
|
1286
|
+
${missingContext.map((item) => `- ${item}`).join("\n")}` : ""}
|
|
1287
|
+
|
|
1288
|
+
Context Relevance measures how well the provided context supports answering the user's query and generating the expected response. The score considers:
|
|
1289
|
+
- Relevance levels (high=1.0, medium=0.7, low=0.3, none=0.0)
|
|
1290
|
+
- Usage penalties (10% penalty per unused high-relevance context)
|
|
1291
|
+
- Missing context penalties (up to 50% penalty for identified gaps)
|
|
1292
|
+
|
|
1293
|
+
Rules for explanation:
|
|
1294
|
+
- Explain the score based on context relevance levels and usage
|
|
1295
|
+
- Mention any penalties applied for unused relevant context or missing information
|
|
1296
|
+
- Keep explanation concise and actionable for improving context selection
|
|
1297
|
+
- Use the given score, don't recalculate
|
|
1298
|
+
|
|
1299
|
+
Format:
|
|
1300
|
+
"The score is ${score} because {explanation of context relevance, usage, and any penalties}"
|
|
1301
|
+
|
|
1302
|
+
Example responses:
|
|
1303
|
+
"The score is 0.85 because 2 out of 3 context pieces are highly relevant and used in the response, with only minor penalty for one unused medium-relevance context piece."
|
|
1304
|
+
"The score is 1.0 because all context pieces are highly relevant to the query about exercise benefits and were effectively used in generating the comprehensive response."
|
|
1305
|
+
"The score is 0.40 because while some context is relevant, key information about the topic was missing and one highly relevant context piece was not utilized in the response."`;
|
|
1306
|
+
}
|
|
1307
|
+
|
|
1308
|
+
// src/scorers/llm/context-relevance/index.ts
|
|
1309
|
+
var analyzeOutputSchema2 = zod.z.object({
|
|
1310
|
+
evaluations: zod.z.array(
|
|
1311
|
+
zod.z.object({
|
|
1312
|
+
context_index: zod.z.number(),
|
|
1313
|
+
contextPiece: zod.z.string(),
|
|
1314
|
+
relevanceLevel: zod.z.enum(["high", "medium", "low", "none"]),
|
|
1315
|
+
wasUsed: zod.z.boolean(),
|
|
1316
|
+
reasoning: zod.z.string()
|
|
1317
|
+
})
|
|
1318
|
+
),
|
|
1319
|
+
missingContext: zod.z.array(zod.z.string()).optional().default([]),
|
|
1320
|
+
overallAssessment: zod.z.string()
|
|
1321
|
+
});
|
|
1322
|
+
var DEFAULT_PENALTIES = {
|
|
1323
|
+
UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
|
|
1324
|
+
// 10% penalty per unused high-relevance context
|
|
1325
|
+
MISSING_CONTEXT_PER_ITEM: 0.15,
|
|
1326
|
+
// 15% penalty per missing context item
|
|
1327
|
+
MAX_MISSING_CONTEXT_PENALTY: 0.5
|
|
1328
|
+
// Maximum 50% penalty for missing context
|
|
1329
|
+
};
|
|
1330
|
+
function createContextRelevanceScorerLLM({
|
|
1331
|
+
model,
|
|
1332
|
+
options
|
|
1333
|
+
}) {
|
|
1334
|
+
if (!options.context && !options.contextExtractor) {
|
|
1335
|
+
throw new Error("Either context or contextExtractor is required for Context Relevance scoring");
|
|
1336
|
+
}
|
|
1337
|
+
if (options.context && options.context.length === 0) {
|
|
1338
|
+
throw new Error("Context array cannot be empty if provided");
|
|
1339
|
+
}
|
|
1340
|
+
return scores.createScorer({
|
|
1341
|
+
name: "Context Relevance (LLM)",
|
|
1342
|
+
description: "Evaluates how relevant and useful the provided context was for generating the agent response",
|
|
1343
|
+
judge: {
|
|
1344
|
+
model,
|
|
1345
|
+
instructions: CONTEXT_RELEVANCE_INSTRUCTIONS
|
|
1346
|
+
}
|
|
1347
|
+
}).analyze({
|
|
1348
|
+
description: "Analyze the relevance and utility of provided context",
|
|
1349
|
+
outputSchema: analyzeOutputSchema2,
|
|
1350
|
+
createPrompt: ({ run }) => {
|
|
1351
|
+
const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1352
|
+
const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1353
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1354
|
+
if (context.length === 0) {
|
|
1355
|
+
return createAnalyzePrompt2({
|
|
1356
|
+
userQuery,
|
|
1357
|
+
agentResponse,
|
|
1358
|
+
providedContext: ["[No context was provided for evaluation]"]
|
|
1359
|
+
});
|
|
1360
|
+
}
|
|
1361
|
+
return createAnalyzePrompt2({
|
|
1362
|
+
userQuery,
|
|
1363
|
+
agentResponse,
|
|
1364
|
+
providedContext: context
|
|
1365
|
+
});
|
|
1366
|
+
}
|
|
1367
|
+
}).generateScore(({ results, run }) => {
|
|
1368
|
+
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1369
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1370
|
+
if (context.length === 0) {
|
|
1371
|
+
return 1 * (options.scale || 1);
|
|
1372
|
+
}
|
|
1373
|
+
if (evaluations.length === 0) {
|
|
1374
|
+
const missingContext2 = results.analyzeStepResult?.missingContext || [];
|
|
1375
|
+
return missingContext2.length > 0 ? 0 : 1;
|
|
1376
|
+
}
|
|
1377
|
+
const relevanceWeights = {
|
|
1378
|
+
high: 1,
|
|
1379
|
+
medium: 0.7,
|
|
1380
|
+
low: 0.3,
|
|
1381
|
+
none: 0
|
|
1382
|
+
};
|
|
1383
|
+
const totalWeight = evaluations.reduce((sum, evaluation) => {
|
|
1384
|
+
return sum + relevanceWeights[evaluation.relevanceLevel];
|
|
1385
|
+
}, 0);
|
|
1386
|
+
const maxPossibleWeight = evaluations.length * relevanceWeights.high;
|
|
1387
|
+
const relevanceScore = maxPossibleWeight > 0 ? totalWeight / maxPossibleWeight : 0;
|
|
1388
|
+
const highRelevanceUnused = evaluations.filter(
|
|
1389
|
+
(evaluation) => evaluation.relevanceLevel === "high" && !evaluation.wasUsed
|
|
1390
|
+
).length;
|
|
1391
|
+
const penalties = options.penalties || {};
|
|
1392
|
+
const unusedPenaltyRate = penalties.unusedHighRelevanceContext ?? DEFAULT_PENALTIES.UNUSED_HIGH_RELEVANCE_CONTEXT;
|
|
1393
|
+
const missingPenaltyRate = penalties.missingContextPerItem ?? DEFAULT_PENALTIES.MISSING_CONTEXT_PER_ITEM;
|
|
1394
|
+
const maxMissingPenalty = penalties.maxMissingContextPenalty ?? DEFAULT_PENALTIES.MAX_MISSING_CONTEXT_PENALTY;
|
|
1395
|
+
const usagePenalty = highRelevanceUnused * unusedPenaltyRate;
|
|
1396
|
+
const missingContext = results.analyzeStepResult?.missingContext || [];
|
|
1397
|
+
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1398
|
+
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1399
|
+
const scaledScore = finalScore * (options.scale || 1);
|
|
1400
|
+
return chunkEKSPLMYP_cjs.roundToTwoDecimals(scaledScore);
|
|
1401
|
+
}).generateReason({
|
|
1402
|
+
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1403
|
+
createPrompt: ({ run, results, score }) => {
|
|
1404
|
+
const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1405
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1406
|
+
if (context.length === 0) {
|
|
1407
|
+
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
1408
|
+
}
|
|
1409
|
+
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1410
|
+
const missingContext = results.analyzeStepResult?.missingContext || [];
|
|
1411
|
+
return createReasonPrompt3({
|
|
1412
|
+
userQuery,
|
|
1413
|
+
score,
|
|
1414
|
+
evaluations,
|
|
1415
|
+
missingContext,
|
|
1416
|
+
scale: options.scale || 1
|
|
1417
|
+
});
|
|
1418
|
+
}
|
|
1419
|
+
});
|
|
1420
|
+
}
|
|
1421
|
+
|
|
1422
|
+
// src/scorers/llm/context-precision/prompts.ts
|
|
1423
|
+
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = `You are a precise context precision evaluator. Your job is to determine if context nodes are relevant for generating the expected output based on the input query.
|
|
1424
|
+
|
|
1425
|
+
Key Principles:
|
|
1426
|
+
1. Evaluate each context piece independently for relevance to the input-output pair
|
|
1427
|
+
2. Consider relevance as the ability of the context to contribute to generating the expected output
|
|
1428
|
+
3. Mark context as relevant only if it directly supports or informs the expected output
|
|
1429
|
+
4. Consider the input query when determining relevance
|
|
1430
|
+
5. Focus on practical utility for output generation, not just topical similarity
|
|
1431
|
+
6. Be strict in your evaluation - context must be clearly useful for generating the output
|
|
1432
|
+
7. Context that provides background but doesn't directly contribute should be marked as not relevant`;
|
|
1433
|
+
function createContextRelevancePrompt({
|
|
1434
|
+
input,
|
|
1435
|
+
output,
|
|
1436
|
+
context
|
|
1437
|
+
}) {
|
|
1438
|
+
return `Evaluate the relevance of each context piece for generating the expected output given the input query.
|
|
1439
|
+
|
|
1440
|
+
Input Query:
|
|
1441
|
+
${input}
|
|
1442
|
+
|
|
1443
|
+
Expected Output:
|
|
1444
|
+
${output}
|
|
1445
|
+
|
|
1446
|
+
Context pieces to evaluate:
|
|
1447
|
+
${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
|
|
1448
|
+
|
|
1449
|
+
For each context piece, determine if it is relevant for generating the expected output. A context piece is relevant if:
|
|
1450
|
+
- It provides information that directly supports or informs the expected output
|
|
1451
|
+
- It contains facts, data, or details that are needed to answer the input query
|
|
1452
|
+
- It contributes to the accuracy or completeness of the expected output
|
|
1453
|
+
|
|
1454
|
+
Mark as "yes" only if the context piece is clearly useful for generating the output.
|
|
1455
|
+
Mark as "no" if the context piece does not contribute to generating the expected output.
|
|
1456
|
+
|
|
1457
|
+
Format your response as:
|
|
1458
|
+
{
|
|
1459
|
+
"verdicts": [
|
|
1460
|
+
{
|
|
1461
|
+
"context_index": 0,
|
|
1462
|
+
"verdict": "yes/no",
|
|
1463
|
+
"reason": "explanation of why this context is or isn't relevant"
|
|
1464
|
+
}
|
|
1465
|
+
]
|
|
1466
|
+
}
|
|
1467
|
+
|
|
1468
|
+
The number of verdicts MUST match the number of context pieces exactly.
|
|
1469
|
+
|
|
1470
|
+
Example:
|
|
1471
|
+
Input: "What are the benefits of exercise?"
|
|
1472
|
+
Output: "Regular exercise improves cardiovascular health and mental wellbeing."
|
|
1473
|
+
Context:
|
|
1474
|
+
[0] "Exercise strengthens the heart and improves blood circulation."
|
|
1475
|
+
[1] "A balanced diet is important for health."
|
|
1476
|
+
[2] "Regular physical activity reduces stress and anxiety."
|
|
1477
|
+
|
|
1478
|
+
{
|
|
1479
|
+
"verdicts": [
|
|
1480
|
+
{
|
|
1481
|
+
"context_index": 0,
|
|
1482
|
+
"verdict": "yes",
|
|
1483
|
+
"reason": "This context directly supports the cardiovascular health benefit mentioned in the output"
|
|
1484
|
+
},
|
|
1485
|
+
{
|
|
1486
|
+
"context_index": 1,
|
|
1487
|
+
"verdict": "no",
|
|
1488
|
+
"reason": "This context is about diet, not exercise benefits, and doesn't contribute to the expected output"
|
|
1489
|
+
},
|
|
1490
|
+
{
|
|
1491
|
+
"context_index": 2,
|
|
1492
|
+
"verdict": "yes",
|
|
1493
|
+
"reason": "This context directly supports the mental wellbeing benefit mentioned in the output"
|
|
1494
|
+
}
|
|
1495
|
+
]
|
|
1496
|
+
}`;
|
|
1497
|
+
}
|
|
1498
|
+
function createContextPrecisionReasonPrompt({
|
|
1499
|
+
input,
|
|
1500
|
+
output,
|
|
1501
|
+
context,
|
|
1502
|
+
score,
|
|
1503
|
+
scale,
|
|
1504
|
+
verdicts
|
|
1505
|
+
}) {
|
|
1506
|
+
return `Explain the context precision score for the retrieved context based on its relevance to generating the expected output.
|
|
1507
|
+
|
|
1508
|
+
Input Query:
|
|
1509
|
+
${input}
|
|
1510
|
+
|
|
1511
|
+
Expected Output:
|
|
1512
|
+
${output}
|
|
1513
|
+
|
|
1514
|
+
Context pieces:
|
|
1515
|
+
${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
|
|
1516
|
+
|
|
1517
|
+
Score: ${score} out of ${scale}
|
|
1518
|
+
Verdicts:
|
|
1519
|
+
${JSON.stringify(verdicts, null, 2)}
|
|
1520
|
+
|
|
1521
|
+
Context Precision measures how relevant and precise the retrieved context nodes are for generating the expected output. The score is calculated using Mean Average Precision (MAP) which:
|
|
1522
|
+
- Gives binary relevance scores (1 for relevant, 0 for irrelevant)
|
|
1523
|
+
- Weights earlier positions more heavily in the scoring
|
|
1524
|
+
- Rewards having relevant context early in the sequence
|
|
1525
|
+
|
|
1526
|
+
Rules for explanation:
|
|
1527
|
+
- Explain the score based on which context pieces were relevant and their positions
|
|
1528
|
+
- Mention how the positioning affects the MAP score
|
|
1529
|
+
- Keep explanation concise and focused on context quality
|
|
1530
|
+
- Use the given score, don't recalculate
|
|
1531
|
+
- Focus on how well the context supports generating the expected output
|
|
1532
|
+
|
|
1533
|
+
Format:
|
|
1534
|
+
"The score is ${score} because {explanation of context precision and positioning}"
|
|
1535
|
+
|
|
1536
|
+
Example responses:
|
|
1537
|
+
"The score is 0.75 because the first and third contexts are highly relevant to the benefits mentioned in the output, while the second and fourth contexts are not directly related to exercise benefits. The relevant contexts are well-positioned at the beginning and middle of the sequence."
|
|
1538
|
+
"The score is 1.0 because all context pieces are relevant for generating the expected output and are optimally ordered."
|
|
1539
|
+
"The score is 0.33 because only the first context piece is relevant to the query, and the remaining contexts don't contribute to generating the expected output about exercise benefits."`;
|
|
1540
|
+
}
|
|
1541
|
+
|
|
1542
|
+
// src/scorers/llm/context-precision/index.ts
|
|
1543
|
+
var contextRelevanceOutputSchema = zod.z.object({
|
|
1544
|
+
verdicts: zod.z.array(
|
|
1545
|
+
zod.z.object({
|
|
1546
|
+
context_index: zod.z.number(),
|
|
1547
|
+
verdict: zod.z.string(),
|
|
1548
|
+
reason: zod.z.string()
|
|
1549
|
+
})
|
|
1550
|
+
)
|
|
1551
|
+
});
|
|
1552
|
+
function createContextPrecisionScorer({
|
|
1553
|
+
model,
|
|
1554
|
+
options
|
|
1555
|
+
}) {
|
|
1556
|
+
if (!options.context && !options.contextExtractor) {
|
|
1557
|
+
throw new Error("Either context or contextExtractor is required for Context Precision scoring");
|
|
1558
|
+
}
|
|
1559
|
+
if (options.context && options.context.length === 0) {
|
|
1560
|
+
throw new Error("Context array cannot be empty if provided");
|
|
1561
|
+
}
|
|
1562
|
+
return scores.createScorer({
|
|
1563
|
+
name: "Context Precision Scorer",
|
|
1564
|
+
description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
|
|
1565
|
+
judge: {
|
|
1566
|
+
model,
|
|
1567
|
+
instructions: CONTEXT_PRECISION_AGENT_INSTRUCTIONS
|
|
1568
|
+
}
|
|
1569
|
+
}).analyze({
|
|
1570
|
+
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1571
|
+
outputSchema: contextRelevanceOutputSchema,
|
|
1572
|
+
createPrompt: ({ run }) => {
|
|
1573
|
+
const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1574
|
+
const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1575
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1576
|
+
if (context.length === 0) {
|
|
1577
|
+
throw new Error("No context available for evaluation");
|
|
1578
|
+
}
|
|
1579
|
+
return createContextRelevancePrompt({
|
|
1580
|
+
input,
|
|
1581
|
+
output,
|
|
1582
|
+
context
|
|
1583
|
+
});
|
|
1584
|
+
}
|
|
1585
|
+
}).generateScore(({ results }) => {
|
|
1586
|
+
if (!results.analyzeStepResult || results.analyzeStepResult.verdicts.length === 0) {
|
|
1587
|
+
return 0;
|
|
1588
|
+
}
|
|
1589
|
+
const verdicts = results.analyzeStepResult.verdicts;
|
|
1590
|
+
const sortedVerdicts = verdicts.sort((a, b) => a.context_index - b.context_index);
|
|
1591
|
+
let sumPrecision = 0;
|
|
1592
|
+
let relevantCount = 0;
|
|
1593
|
+
for (let i = 0; i < sortedVerdicts.length; i++) {
|
|
1594
|
+
const targetVerdict = sortedVerdicts[i];
|
|
1595
|
+
const isRelevant = targetVerdict?.verdict?.toLowerCase().trim() === "yes";
|
|
1596
|
+
if (isRelevant) {
|
|
1597
|
+
relevantCount++;
|
|
1598
|
+
const precisionAtI = relevantCount / (i + 1);
|
|
1599
|
+
sumPrecision += precisionAtI;
|
|
1600
|
+
}
|
|
1601
|
+
}
|
|
1602
|
+
if (relevantCount === 0) {
|
|
1603
|
+
return 0;
|
|
1604
|
+
}
|
|
1605
|
+
const map = sumPrecision / relevantCount;
|
|
1606
|
+
const score = map * (options.scale || 1);
|
|
1607
|
+
return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
|
|
1608
|
+
}).generateReason({
|
|
1609
|
+
description: "Reason about the context precision results",
|
|
1610
|
+
createPrompt: ({ run, results, score }) => {
|
|
1611
|
+
const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1612
|
+
const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1613
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1614
|
+
return createContextPrecisionReasonPrompt({
|
|
1615
|
+
input,
|
|
1616
|
+
output,
|
|
1617
|
+
context,
|
|
1618
|
+
score,
|
|
1619
|
+
scale: options.scale || 1,
|
|
1620
|
+
verdicts: results.analyzeStepResult?.verdicts || []
|
|
1621
|
+
});
|
|
1622
|
+
}
|
|
1623
|
+
});
|
|
1624
|
+
}
|
|
1625
|
+
|
|
1626
|
+
// src/scorers/llm/noise-sensitivity/prompts.ts
|
|
1627
|
+
var NOISE_SENSITIVITY_INSTRUCTIONS = `You are an expert noise sensitivity evaluator. Your job is to analyze how much irrelevant, distracting, or misleading information (noise) affected the agent's response quality and accuracy.
|
|
1628
|
+
|
|
1629
|
+
Key Evaluation Criteria:
|
|
1630
|
+
1. **Response Consistency**: How similar are the baseline and noisy responses in content and correctness?
|
|
1631
|
+
2. **Information Integrity**: Did the agent maintain accuracy despite noise, or was it misled?
|
|
1632
|
+
3. **Focus Preservation**: Did the agent stay on topic or get distracted by irrelevant information?
|
|
1633
|
+
4. **Hallucination Resistance**: Did noise cause the agent to generate false or fabricated information?
|
|
1634
|
+
5. **Completeness**: Did noise cause the agent to miss important parts of the original query?
|
|
1635
|
+
|
|
1636
|
+
Noise Impact Assessment:
|
|
1637
|
+
- **No Impact (1.0)**: Response is virtually identical in quality, accuracy, and completeness
|
|
1638
|
+
- **Minimal Impact (0.8-0.9)**: Slight changes in phrasing but maintains correctness and completeness
|
|
1639
|
+
- **Moderate Impact (0.5-0.7)**: Noticeable changes that affect quality but core information remains correct
|
|
1640
|
+
- **Significant Impact (0.2-0.4)**: Major degradation in quality, accuracy, or completeness
|
|
1641
|
+
- **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed
|
|
1642
|
+
|
|
1643
|
+
Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
|
|
1644
|
+
function createAnalyzePrompt3({
|
|
1645
|
+
userQuery,
|
|
1646
|
+
baselineResponse,
|
|
1647
|
+
noisyQuery,
|
|
1648
|
+
noisyResponse,
|
|
1649
|
+
noiseType
|
|
1650
|
+
}) {
|
|
1651
|
+
return `Analyze how the added noise affected the agent's response quality and accuracy.
|
|
1652
|
+
|
|
1653
|
+
Original User Query:
|
|
1654
|
+
${userQuery}
|
|
1655
|
+
|
|
1656
|
+
Baseline Agent Response (clean input):
|
|
1657
|
+
${baselineResponse}
|
|
1658
|
+
|
|
1659
|
+
Noisy User Query (with added distractions):
|
|
1660
|
+
${noisyQuery}
|
|
1661
|
+
|
|
1662
|
+
Noisy Agent Response:
|
|
1663
|
+
${noisyResponse}
|
|
1664
|
+
|
|
1665
|
+
${noiseType ? `Type of noise added: ${noiseType}` : ""}
|
|
1666
|
+
|
|
1667
|
+
Compare the baseline and noisy responses across these dimensions:
|
|
1668
|
+
|
|
1669
|
+
1. **Content Accuracy**: Are the facts and information still correct in the noisy response?
|
|
1670
|
+
2. **Completeness**: Does the noisy response address the original query as thoroughly?
|
|
1671
|
+
3. **Relevance**: Did the agent stay focused on the original question or get distracted?
|
|
1672
|
+
4. **Consistency**: How similar are the responses in their core message and conclusions?
|
|
1673
|
+
5. **Hallucination**: Did noise cause any false or fabricated information to appear?
|
|
1674
|
+
|
|
1675
|
+
For each dimension, evaluate:
|
|
1676
|
+
- **Impact Level**: none, minimal, moderate, significant, severe
|
|
1677
|
+
- **Specific Changes**: What exactly changed between responses?
|
|
1678
|
+
- **Noise Influence**: How did the noise specifically affect this aspect?
|
|
1679
|
+
|
|
1680
|
+
Format your response as:
|
|
1681
|
+
{
|
|
1682
|
+
"dimensions": [
|
|
1683
|
+
{
|
|
1684
|
+
"dimension": "content_accuracy",
|
|
1685
|
+
"impactLevel": "none/minimal/moderate/significant/severe",
|
|
1686
|
+
"specificChanges": "detailed description of what changed",
|
|
1687
|
+
"noiseInfluence": "how the noise specifically affected this dimension"
|
|
1688
|
+
},
|
|
1689
|
+
{
|
|
1690
|
+
"dimension": "completeness",
|
|
1691
|
+
"impactLevel": "none/minimal/moderate/significant/severe",
|
|
1692
|
+
"specificChanges": "detailed description of what changed",
|
|
1693
|
+
"noiseInfluence": "how the noise specifically affected this dimension"
|
|
1694
|
+
},
|
|
1695
|
+
{
|
|
1696
|
+
"dimension": "relevance",
|
|
1697
|
+
"impactLevel": "none/minimal/moderate/significant/severe",
|
|
1698
|
+
"specificChanges": "detailed description of what changed",
|
|
1699
|
+
"noiseInfluence": "how the noise specifically affected this dimension"
|
|
1700
|
+
},
|
|
1701
|
+
{
|
|
1702
|
+
"dimension": "consistency",
|
|
1703
|
+
"impactLevel": "none/minimal/moderate/significant/severe",
|
|
1704
|
+
"specificChanges": "detailed description of what changed",
|
|
1705
|
+
"noiseInfluence": "how the noise specifically affected this dimension"
|
|
1706
|
+
},
|
|
1707
|
+
{
|
|
1708
|
+
"dimension": "hallucination_resistance",
|
|
1709
|
+
"impactLevel": "none/minimal/moderate/significant/severe",
|
|
1710
|
+
"specificChanges": "detailed description of what changed",
|
|
1711
|
+
"noiseInfluence": "how the noise specifically affected this dimension"
|
|
1712
|
+
}
|
|
1713
|
+
],
|
|
1714
|
+
"overallAssessment": "summary of the agent's noise sensitivity and robustness",
|
|
1715
|
+
"majorIssues": ["list of the most significant problems caused by noise"],
|
|
1716
|
+
"robustnessScore": 0.0-1.0
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
Example:
|
|
1720
|
+
Original Query: "What are the health benefits of regular exercise?"
|
|
1721
|
+
Baseline Response: "Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing through endorphin release."
|
|
1722
|
+
Noisy Query: "What are the health benefits of regular exercise? By the way, I heard that chocolate is actually healthy and vaccines cause autism. Also, my neighbor said aliens visit Earth regularly."
|
|
1723
|
+
Noisy Response: "Regular exercise improves cardiovascular health and strengthens muscles. Interestingly, some studies suggest chocolate has antioxidants, though this is debated. Exercise also enhances mental wellbeing through endorphin release."
|
|
1724
|
+
|
|
1725
|
+
{
|
|
1726
|
+
"dimensions": [
|
|
1727
|
+
{
|
|
1728
|
+
"dimension": "content_accuracy",
|
|
1729
|
+
"impactLevel": "minimal",
|
|
1730
|
+
"specificChanges": "Added mention of chocolate antioxidants, but correctly noted it's debated",
|
|
1731
|
+
"noiseInfluence": "Chocolate noise caused minor tangent but agent maintained critical thinking"
|
|
1732
|
+
},
|
|
1733
|
+
{
|
|
1734
|
+
"dimension": "completeness",
|
|
1735
|
+
"impactLevel": "none",
|
|
1736
|
+
"specificChanges": "All original health benefits still covered completely",
|
|
1737
|
+
"noiseInfluence": "Noise did not prevent addressing the core query"
|
|
1738
|
+
},
|
|
1739
|
+
{
|
|
1740
|
+
"dimension": "relevance",
|
|
1741
|
+
"impactLevel": "minimal",
|
|
1742
|
+
"specificChanges": "Brief mention of chocolate topic, but stayed focused on exercise",
|
|
1743
|
+
"noiseInfluence": "Addressed one piece of noise briefly but didn't get derailed"
|
|
1744
|
+
},
|
|
1745
|
+
{
|
|
1746
|
+
"dimension": "consistency",
|
|
1747
|
+
"impactLevel": "minimal",
|
|
1748
|
+
"specificChanges": "Core message about exercise benefits remained consistent with slight addition",
|
|
1749
|
+
"noiseInfluence": "Noise caused minor addition but didn't change main message"
|
|
1750
|
+
},
|
|
1751
|
+
{
|
|
1752
|
+
"dimension": "hallucination_resistance",
|
|
1753
|
+
"impactLevel": "none",
|
|
1754
|
+
"specificChanges": "No false information generated, properly qualified chocolate statement",
|
|
1755
|
+
"noiseInfluence": "Successfully resisted misinformation about vaccines and aliens"
|
|
1756
|
+
}
|
|
1757
|
+
],
|
|
1758
|
+
"overallAssessment": "Agent showed good robustness, addressing original query completely while minimally engaging with one benign noise element and completely ignoring harmful misinformation",
|
|
1759
|
+
"majorIssues": [],
|
|
1760
|
+
"robustnessScore": 0.85
|
|
1761
|
+
}`;
|
|
1762
|
+
}
|
|
1763
|
+
function createReasonPrompt4({
|
|
1764
|
+
userQuery,
|
|
1765
|
+
score,
|
|
1766
|
+
dimensions,
|
|
1767
|
+
majorIssues,
|
|
1768
|
+
overallAssessment
|
|
1769
|
+
}) {
|
|
1770
|
+
const impactSummary = dimensions.map((d) => `${d.dimension}: ${d.impactLevel} impact`).join(", ");
|
|
1771
|
+
return `Explain the noise sensitivity score based on how well the agent maintained response quality despite irrelevant or distracting information.
|
|
1772
|
+
|
|
1773
|
+
Original Query:
|
|
1774
|
+
${userQuery}
|
|
1775
|
+
|
|
1776
|
+
Score: ${score} out of 1.0
|
|
1777
|
+
|
|
1778
|
+
Impact Assessment:
|
|
1779
|
+
${impactSummary}
|
|
1780
|
+
|
|
1781
|
+
${majorIssues.length > 0 ? `
|
|
1782
|
+
Major Issues Identified:
|
|
1783
|
+
${majorIssues.map((issue) => `- ${issue}`).join("\n")}` : ""}
|
|
1784
|
+
|
|
1785
|
+
Overall Assessment:
|
|
1786
|
+
${overallAssessment}
|
|
1787
|
+
|
|
1788
|
+
Noise Sensitivity measures how robust an agent is when irrelevant, misleading, or distracting information is added to the input. The score considers:
|
|
1789
|
+
- Content accuracy preservation (maintaining factual correctness)
|
|
1790
|
+
- Completeness retention (addressing the full original query)
|
|
1791
|
+
- Focus maintenance (not getting distracted by irrelevant information)
|
|
1792
|
+
- Consistency preservation (keeping core message intact)
|
|
1793
|
+
- Hallucination resistance (not generating false information due to noise)
|
|
1794
|
+
|
|
1795
|
+
Scoring Guide:
|
|
1796
|
+
- 0.9-1.0: Highly robust, virtually no impact from noise
|
|
1797
|
+
- 0.7-0.8: Good robustness, minimal impact that doesn't affect correctness
|
|
1798
|
+
- 0.5-0.6: Moderate sensitivity, noticeable quality degradation
|
|
1799
|
+
- 0.3-0.4: High sensitivity, significant impact on accuracy or completeness
|
|
1800
|
+
- 0.0-0.2: Very sensitive, severe degradation or derailment
|
|
1801
|
+
|
|
1802
|
+
Rules for explanation:
|
|
1803
|
+
- Explain the score based on specific impacts observed across all dimensions
|
|
1804
|
+
- Highlight the agent's strengths and weaknesses in handling noise
|
|
1805
|
+
- Keep explanation actionable for improving noise robustness
|
|
1806
|
+
- Use the given score, don't recalculate
|
|
1807
|
+
|
|
1808
|
+
Format:
|
|
1809
|
+
"The score is ${score} because {explanation of robustness performance and specific noise impacts}"
|
|
1810
|
+
|
|
1811
|
+
Example responses:
|
|
1812
|
+
"The score is 0.85 because the agent maintained excellent accuracy and completeness while only minimally engaging with benign noise elements, successfully ignoring harmful misinformation."
|
|
1813
|
+
"The score is 1.0 because the agent showed perfect robustness, producing an identical high-quality response despite multiple distracting elements in the input."
|
|
1814
|
+
"The score is 0.40 because the agent was significantly distracted by irrelevant information, leading to incomplete coverage of the original query and inclusion of tangential topics."`;
|
|
1815
|
+
}
|
|
1816
|
+
|
|
1817
|
+
// src/scorers/llm/noise-sensitivity/index.ts
|
|
1818
|
+
var analyzeOutputSchema3 = zod.z.object({
|
|
1819
|
+
dimensions: zod.z.array(
|
|
1820
|
+
zod.z.object({
|
|
1821
|
+
dimension: zod.z.string(),
|
|
1822
|
+
impactLevel: zod.z.enum(["none", "minimal", "moderate", "significant", "severe"]),
|
|
1823
|
+
specificChanges: zod.z.string(),
|
|
1824
|
+
noiseInfluence: zod.z.string()
|
|
1825
|
+
})
|
|
1826
|
+
),
|
|
1827
|
+
overallAssessment: zod.z.string(),
|
|
1828
|
+
majorIssues: zod.z.array(zod.z.string()).optional().default([]),
|
|
1829
|
+
robustnessScore: zod.z.number().min(0).max(1)
|
|
1830
|
+
});
|
|
1831
|
+
var DEFAULT_IMPACT_WEIGHTS = {
|
|
1832
|
+
none: 1,
|
|
1833
|
+
minimal: 0.85,
|
|
1834
|
+
moderate: 0.6,
|
|
1835
|
+
significant: 0.3,
|
|
1836
|
+
severe: 0.1
|
|
1837
|
+
};
|
|
1838
|
+
var DEFAULT_SCORING = {
|
|
1839
|
+
MAJOR_ISSUE_PENALTY_PER_ITEM: 0.1,
|
|
1840
|
+
// 10% penalty per major issue
|
|
1841
|
+
MAX_MAJOR_ISSUE_PENALTY: 0.3,
|
|
1842
|
+
// Maximum 30% penalty for major issues
|
|
1843
|
+
DISCREPANCY_THRESHOLD: 0.2
|
|
1844
|
+
// Threshold for choosing conservative score
|
|
1845
|
+
};
|
|
1846
|
+
function createNoiseSensitivityScorerLLM({
|
|
1847
|
+
model,
|
|
1848
|
+
options
|
|
1849
|
+
}) {
|
|
1850
|
+
if (!options.baselineResponse || !options.noisyQuery) {
|
|
1851
|
+
throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
|
|
1852
|
+
}
|
|
1853
|
+
return scores.createScorer({
|
|
1854
|
+
name: "Noise Sensitivity (LLM)",
|
|
1855
|
+
description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
|
|
1856
|
+
judge: {
|
|
1857
|
+
model,
|
|
1858
|
+
instructions: NOISE_SENSITIVITY_INSTRUCTIONS
|
|
1859
|
+
}
|
|
1860
|
+
}).analyze({
|
|
1861
|
+
description: "Analyze the impact of noise on agent response quality",
|
|
1862
|
+
outputSchema: analyzeOutputSchema3,
|
|
1863
|
+
createPrompt: ({ run }) => {
|
|
1864
|
+
const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1865
|
+
const noisyResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1866
|
+
if (!originalQuery || !noisyResponse) {
|
|
1867
|
+
throw new Error("Both original query and noisy response are required for evaluation");
|
|
1868
|
+
}
|
|
1869
|
+
return createAnalyzePrompt3({
|
|
1870
|
+
userQuery: originalQuery,
|
|
1871
|
+
baselineResponse: options.baselineResponse,
|
|
1872
|
+
noisyQuery: options.noisyQuery,
|
|
1873
|
+
noisyResponse,
|
|
1874
|
+
noiseType: options.noiseType
|
|
1875
|
+
});
|
|
1876
|
+
}
|
|
1877
|
+
}).generateScore(({ results }) => {
|
|
1878
|
+
const analysisResult = results.analyzeStepResult;
|
|
1879
|
+
if (!analysisResult) {
|
|
1880
|
+
throw new Error("Analysis step failed to produce results");
|
|
1881
|
+
}
|
|
1882
|
+
let finalScore = analysisResult.robustnessScore;
|
|
1883
|
+
finalScore = Math.max(0, Math.min(1, finalScore));
|
|
1884
|
+
const scoring = options.scoring || {};
|
|
1885
|
+
const impactWeights = {
|
|
1886
|
+
none: scoring.impactWeights?.none ?? DEFAULT_IMPACT_WEIGHTS.none,
|
|
1887
|
+
minimal: scoring.impactWeights?.minimal ?? DEFAULT_IMPACT_WEIGHTS.minimal,
|
|
1888
|
+
moderate: scoring.impactWeights?.moderate ?? DEFAULT_IMPACT_WEIGHTS.moderate,
|
|
1889
|
+
significant: scoring.impactWeights?.significant ?? DEFAULT_IMPACT_WEIGHTS.significant,
|
|
1890
|
+
severe: scoring.impactWeights?.severe ?? DEFAULT_IMPACT_WEIGHTS.severe
|
|
1891
|
+
};
|
|
1892
|
+
const discrepancyThreshold = scoring.discrepancyThreshold ?? DEFAULT_SCORING.DISCREPANCY_THRESHOLD;
|
|
1893
|
+
const majorIssuePenaltyRate = scoring.penalties?.majorIssuePerItem ?? DEFAULT_SCORING.MAJOR_ISSUE_PENALTY_PER_ITEM;
|
|
1894
|
+
const maxMajorIssuePenalty = scoring.penalties?.maxMajorIssuePenalty ?? DEFAULT_SCORING.MAX_MAJOR_ISSUE_PENALTY;
|
|
1895
|
+
const dimensions = analysisResult.dimensions || [];
|
|
1896
|
+
if (dimensions.length > 0) {
|
|
1897
|
+
const averageImpact = dimensions.reduce((sum, dim) => {
|
|
1898
|
+
return sum + impactWeights[dim.impactLevel];
|
|
1899
|
+
}, 0) / dimensions.length;
|
|
1900
|
+
const calculatedScore = averageImpact;
|
|
1901
|
+
if (Math.abs(finalScore - calculatedScore) > discrepancyThreshold) {
|
|
1902
|
+
finalScore = Math.min(finalScore, calculatedScore);
|
|
1903
|
+
}
|
|
1904
|
+
}
|
|
1905
|
+
const majorIssues = analysisResult.majorIssues || [];
|
|
1906
|
+
const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
|
|
1907
|
+
finalScore = Math.max(0, finalScore - issuesPenalty);
|
|
1908
|
+
return chunkEKSPLMYP_cjs.roundToTwoDecimals(finalScore);
|
|
1909
|
+
}).generateReason({
|
|
1910
|
+
description: "Generate human-readable explanation of noise sensitivity evaluation",
|
|
1911
|
+
createPrompt: ({ run, results, score }) => {
|
|
1912
|
+
const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
1913
|
+
const analysisResult = results.analyzeStepResult;
|
|
1914
|
+
if (!analysisResult) {
|
|
1915
|
+
throw new Error("Analysis step failed to produce results for reason generation");
|
|
1916
|
+
}
|
|
1917
|
+
return createReasonPrompt4({
|
|
1918
|
+
userQuery: originalQuery,
|
|
1919
|
+
score,
|
|
1920
|
+
dimensions: analysisResult.dimensions || [],
|
|
1921
|
+
majorIssues: analysisResult.majorIssues || [],
|
|
1922
|
+
overallAssessment: analysisResult.overallAssessment
|
|
1923
|
+
});
|
|
1924
|
+
}
|
|
1925
|
+
});
|
|
1926
|
+
}
|
|
1927
|
+
|
|
1928
|
+
// src/scorers/llm/prompt-alignment/prompts.ts
|
|
1929
|
+
var PROMPT_ALIGNMENT_INSTRUCTIONS = `You are an expert prompt-response alignment evaluator. Your job is to analyze how well an agent's response aligns with the user's prompt in terms of intent, requirements, completeness, and appropriateness.
|
|
1930
|
+
|
|
1931
|
+
Key Evaluation Dimensions:
|
|
1932
|
+
1. **Intent Alignment**: Does the response address the core purpose of the prompt?
|
|
1933
|
+
2. **Requirements Fulfillment**: Are all explicit and implicit requirements met?
|
|
1934
|
+
3. **Completeness**: Is the response comprehensive and thorough?
|
|
1935
|
+
4. **Response Appropriateness**: Does the format, tone, and style match expectations?
|
|
1936
|
+
|
|
1937
|
+
Evaluation Guidelines:
|
|
1938
|
+
- Identify the primary intent and any secondary intents in the prompt
|
|
1939
|
+
- Extract all explicit requirements (specific tasks, constraints, formats)
|
|
1940
|
+
- Consider implicit requirements based on context and standard expectations
|
|
1941
|
+
- Assess whether the response fully addresses the prompt or leaves gaps
|
|
1942
|
+
- Evaluate if the response format and tone are appropriate for the request
|
|
1943
|
+
- Be objective and focus on alignment rather than response quality
|
|
1944
|
+
|
|
1945
|
+
Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
+function createAnalyzePrompt4({
+  userPrompt,
+  systemPrompt,
+  agentResponse,
+  evaluationMode
+}) {
+  let promptContext = "";
+  let evaluationTarget = "";
+  if (evaluationMode === "user") {
+    promptContext = `User Prompt:
+${userPrompt}`;
+    evaluationTarget = "the user's prompt";
+  } else if (evaluationMode === "system") {
+    promptContext = `System Prompt:
+${systemPrompt}`;
+    evaluationTarget = "the system's behavioral guidelines and constraints";
+  } else {
+    promptContext = `User Prompt:
+${userPrompt}
+
+System Prompt:
+${systemPrompt}`;
+    evaluationTarget = "both the user's prompt and the system's behavioral guidelines";
+  }
+  return `Analyze how well the agent's response aligns with ${evaluationTarget} across multiple dimensions.
+
+${promptContext}
+
+Agent Response:
+${agentResponse}
+
+Evaluate the following aspects:
+
+1. **Intent Alignment**:
+${evaluationMode === "system" ? `- Identify the primary behavioral guidelines and constraints from the system prompt
+- Assess whether the response follows these guidelines
+- Score from 0.0 (violates system constraints) to 1.0 (perfectly follows system guidelines)` : evaluationMode === "user" ? `- Identify the primary intent of the user's prompt
+- Assess whether the response addresses this intent
+- Score from 0.0 (completely misses intent) to 1.0 (perfectly addresses intent)` : `- Identify both the user's intent AND system behavioral guidelines
+- Assess whether the response addresses user intent while following system constraints
+- Score from 0.0 (misses both) to 1.0 (perfectly addresses both)`}
+- Provide reasoning for your assessment
+
+2. **Requirements Fulfillment**:
+${evaluationMode === "system" ? `- List all system constraints and rules from the system prompt
+- Check if each constraint is respected
+- Calculate an overall score based on respected vs. total constraints` : evaluationMode === "user" ? `- List all explicit requirements from the user prompt
+- Check if each requirement is fulfilled
+- Calculate an overall score based on fulfilled vs. total requirements` : `- List requirements from BOTH user prompt and system constraints
+- Check fulfillment of each requirement
+- Calculate separate scores for user requirements and system constraints, then combine`}
+- Provide reasoning for each requirement assessment
+
+3. **Completeness**:
+${evaluationMode === "system" ? `- Evaluate if the response fully adheres to all system guidelines
+- Identify any system rules that were not followed` : evaluationMode === "user" ? `- Evaluate if the response is comprehensive for the user's request
+- Identify any missing elements that should have been included` : `- Evaluate completeness for both user request AND system compliance
+- Identify missing elements from either perspective`}
+- Score from 0.0 (severely incomplete) to 1.0 (fully complete)
+- Provide reasoning for your assessment
+
+4. **Response Appropriateness**:
+${evaluationMode === "system" ? `- Check if the format/tone matches system specifications
+- Evaluate consistency with defined agent behavior` : evaluationMode === "user" ? `- Check if the format matches what was requested (e.g., list, paragraph, code)
+- Evaluate if the tone is appropriate (e.g., formal, casual, technical)` : `- Check format/tone for both user expectations AND system requirements
+- Evaluate if response satisfies both perspectives`}
+- Score from 0.0 (completely inappropriate) to 1.0 (perfectly appropriate)
+- Provide reasoning for your assessment
+
+Format your response as:
+{
+  "intentAlignment": {
+    "score": 0.0-1.0,
+    "primaryIntent": "the main purpose of the prompt",
+    "isAddressed": true/false,
+    "reasoning": "explanation of intent alignment"
+  },
+  "requirementsFulfillment": {
+    "requirements": [
+      {
+        "requirement": "specific requirement from prompt",
+        "isFulfilled": true/false,
+        "reasoning": "explanation of fulfillment status"
+      }
+    ],
+    "overallScore": 0.0-1.0
+  },
+  "completeness": {
+    "score": 0.0-1.0,
+    "missingElements": ["list of missing elements if any"],
+    "reasoning": "explanation of completeness assessment"
+  },
+  "responseAppropriateness": {
+    "score": 0.0-1.0,
+    "formatAlignment": true/false,
+    "toneAlignment": true/false,
+    "reasoning": "explanation of appropriateness"
+  },
+  "overallAssessment": "summary of the prompt-response alignment"
+}
+
+Example:
+User Prompt: "Write a Python function to calculate factorial with error handling for negative numbers."
+
+Agent Response: "def factorial(n):
+    if n < 0:
+        raise ValueError('Factorial not defined for negative numbers')
+    if n == 0:
+        return 1
+    return n * factorial(n-1)"
+
+{
+  "intentAlignment": {
+    "score": 1.0,
+    "primaryIntent": "Create a Python function to calculate factorial",
+    "isAddressed": true,
+    "reasoning": "The response provides exactly what was requested - a Python function that calculates factorial"
+  },
+  "requirementsFulfillment": {
+    "requirements": [
+      {
+        "requirement": "Write a Python function",
+        "isFulfilled": true,
+        "reasoning": "A proper Python function is provided with correct syntax"
+      },
+      {
+        "requirement": "Calculate factorial",
+        "isFulfilled": true,
+        "reasoning": "The function correctly implements factorial calculation using recursion"
+      },
+      {
+        "requirement": "Include error handling for negative numbers",
+        "isFulfilled": true,
+        "reasoning": "The function raises a ValueError for negative inputs with an appropriate message"
+      }
+    ],
+    "overallScore": 1.0
+  },
+  "completeness": {
+    "score": 0.9,
+    "missingElements": ["No docstring or comments"],
+    "reasoning": "The function is complete and functional but could benefit from documentation"
+  },
+  "responseAppropriateness": {
+    "score": 1.0,
+    "formatAlignment": true,
+    "toneAlignment": true,
+    "reasoning": "The response is in the exact format requested (Python code) with appropriate technical implementation"
+  },
+  "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
+}`;
+}
+function createReasonPrompt5({
+  userPrompt,
+  systemPrompt,
+  score,
+  scale,
+  analysis,
+  evaluationMode
+}) {
+  const fulfilledCount = analysis.requirementsFulfillment.requirements.filter((r) => r.isFulfilled).length;
+  const totalRequirements = analysis.requirementsFulfillment.requirements.length;
+  const promptContext = evaluationMode === "system" ? `System Prompt:
+${systemPrompt}` : evaluationMode === "user" ? `User Prompt:
+${userPrompt}` : `User Prompt:
+${userPrompt}
+
+System Prompt:
+${systemPrompt}`;
+  const alignmentDescription = evaluationMode === "system" ? "system behavioral guidelines and constraints" : evaluationMode === "user" ? "user's prompt" : "both user's prompt and system guidelines";
+  return `Explain the prompt alignment score based on how well the agent's response addresses the ${alignmentDescription}.
+
+${promptContext}
+
+Score: ${score} out of ${scale}
+
+Evaluation Breakdown:
+- Intent Alignment (40% weight): ${analysis.intentAlignment.score}
+Primary Intent: "${analysis.intentAlignment.primaryIntent}"
+Addressed: ${analysis.intentAlignment.isAddressed ? "Yes" : "No"}
+${analysis.intentAlignment.reasoning}
+
+- Requirements Fulfillment (30% weight): ${analysis.requirementsFulfillment.overallScore}
+${fulfilledCount} out of ${totalRequirements} requirements met
+${analysis.requirementsFulfillment.requirements.map((r) => `\u2022 ${r.requirement}: ${r.isFulfilled ? "\u2713" : "\u2717"}`).join("\n ")}
+
+- Completeness (20% weight): ${analysis.completeness.score}
+${analysis.completeness.missingElements.length > 0 ? `Missing elements: ${analysis.completeness.missingElements.join(", ")}` : "Response is complete"}
+${analysis.completeness.reasoning}
+
+- Response Appropriateness (10% weight): ${analysis.responseAppropriateness.score}
+Format: ${analysis.responseAppropriateness.formatAlignment ? "Aligned" : "Misaligned"}
+Tone: ${analysis.responseAppropriateness.toneAlignment ? "Aligned" : "Misaligned"}
+${analysis.responseAppropriateness.reasoning}
+
+Overall Assessment: ${analysis.overallAssessment}
+
+Prompt Alignment measures how well the response addresses the user's request across intent, requirements, completeness, and appropriateness. The weighted scoring ensures primary focus on understanding and addressing the core intent while meeting specific requirements.
+
+Rules for explanation:
+- Summarize the key strengths and weaknesses of alignment
+- Highlight any major misalignments that significantly impacted the score
+- Be concise but comprehensive in the explanation
+- Use the given score, don't recalculate
+
+Format:
+"The score is ${score} because {explanation of alignment strengths and weaknesses based on the weighted dimensions}"
+
+Example responses:
+"The score is 0.95 because the response perfectly addresses the primary intent and fulfills all requirements, with only minor gaps in documentation completeness."
+"The score is 0.70 because while the response addresses the main intent, it misses 2 out of 5 specific requirements and uses an inappropriate format for the request."
+"The score is 0.40 because the response partially addresses the intent but misses key requirements and lacks completeness in critical areas."`;
+}
+
+// src/scorers/llm/prompt-alignment/index.ts
+var analyzeOutputSchema4 = zod.z.object({
+  intentAlignment: zod.z.object({
+    score: zod.z.number().min(0).max(1),
+    primaryIntent: zod.z.string(),
+    isAddressed: zod.z.boolean(),
+    reasoning: zod.z.string()
+  }),
+  requirementsFulfillment: zod.z.object({
+    requirements: zod.z.array(
+      zod.z.object({
+        requirement: zod.z.string(),
+        isFulfilled: zod.z.boolean(),
+        reasoning: zod.z.string()
+      })
+    ),
+    overallScore: zod.z.number().min(0).max(1)
+  }),
+  completeness: zod.z.object({
+    score: zod.z.number().min(0).max(1),
+    missingElements: zod.z.array(zod.z.string()),
+    reasoning: zod.z.string()
+  }),
+  responseAppropriateness: zod.z.object({
+    score: zod.z.number().min(0).max(1),
+    formatAlignment: zod.z.boolean(),
+    toneAlignment: zod.z.boolean(),
+    reasoning: zod.z.string()
+  }),
+  overallAssessment: zod.z.string()
+});
+var SCORING_WEIGHTS = {
+  USER: {
+    INTENT_ALIGNMENT: 0.4,
+    // 40% - Core intent is most important
+    REQUIREMENTS_FULFILLMENT: 0.3,
+    // 30% - Meeting specific requirements
+    COMPLETENESS: 0.2,
+    // 20% - Comprehensive response
+    RESPONSE_APPROPRIATENESS: 0.1
+    // 10% - Format and tone matching
+  },
+  SYSTEM: {
+    INTENT_ALIGNMENT: 0.35,
+    // 35% - Following system behavioral guidelines
+    REQUIREMENTS_FULFILLMENT: 0.35,
+    // 35% - Meeting system constraints
+    COMPLETENESS: 0.15,
+    // 15% - Adherence to all system rules
+    RESPONSE_APPROPRIATENESS: 0.15
+    // 15% - Consistency with system tone/format
+  },
+  BOTH: {
+    // When evaluating both, we weight user alignment at 70% and system at 30%
+    USER_WEIGHT: 0.7,
+    SYSTEM_WEIGHT: 0.3
+  }
+};
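
Editorial note (not part of the published file): the snippet below is a minimal sketch of how these weights combine into a final score in "user" mode, mirroring the generateScore step that follows. The exampleAnalysis values are hypothetical and simply reuse the dimension scores from the factorial example in the prompt above; the object shape follows analyzeOutputSchema4.

// Hypothetical worked example of the "user" mode weighting.
const exampleAnalysis = {
  intentAlignment: { score: 1.0 },
  requirementsFulfillment: { overallScore: 1.0 },
  completeness: { score: 0.9 },
  responseAppropriateness: { score: 1.0 }
};
const userWeighted =
  exampleAnalysis.intentAlignment.score * 0.4 +                 // INTENT_ALIGNMENT
  exampleAnalysis.requirementsFulfillment.overallScore * 0.3 +  // REQUIREMENTS_FULFILLMENT
  exampleAnalysis.completeness.score * 0.2 +                    // COMPLETENESS
  exampleAnalysis.responseAppropriateness.score * 0.1;          // RESPONSE_APPROPRIATENESS
console.log(userWeighted); // 0.98

With scale left at its default of 1, the scorer would report this as 0.98 after roundToTwoDecimals.
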
+function createPromptAlignmentScorerLLM({
+  model,
+  options
+}) {
+  const scale = options?.scale || 1;
+  const evaluationMode = options?.evaluationMode || "both";
+  return scores.createScorer({
+    name: "Prompt Alignment (LLM)",
+    description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
+    judge: {
+      model,
+      instructions: PROMPT_ALIGNMENT_INSTRUCTIONS
+    }
+  }).analyze({
+    description: "Analyze prompt-response alignment across multiple dimensions",
+    outputSchema: analyzeOutputSchema4,
+    createPrompt: ({ run }) => {
+      const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      if (evaluationMode === "user" && !userPrompt) {
+        throw new Error("User prompt is required for user prompt alignment scoring");
+      }
+      if (evaluationMode === "system" && !systemPrompt) {
+        throw new Error("System prompt is required for system prompt alignment scoring");
+      }
+      if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
+        throw new Error("Both user and system prompts are required for combined alignment scoring");
+      }
+      if (!agentResponse) {
+        throw new Error("Agent response is required for prompt alignment scoring");
+      }
+      return createAnalyzePrompt4({
+        userPrompt,
+        systemPrompt,
+        agentResponse,
+        evaluationMode
+      });
+    }
+  }).generateScore(({ results }) => {
+    const analysis = results.analyzeStepResult;
+    if (!analysis) {
+      return 0;
+    }
+    let weightedScore = 0;
+    if (evaluationMode === "user") {
+      weightedScore = analysis.intentAlignment.score * SCORING_WEIGHTS.USER.INTENT_ALIGNMENT + analysis.requirementsFulfillment.overallScore * SCORING_WEIGHTS.USER.REQUIREMENTS_FULFILLMENT + analysis.completeness.score * SCORING_WEIGHTS.USER.COMPLETENESS + analysis.responseAppropriateness.score * SCORING_WEIGHTS.USER.RESPONSE_APPROPRIATENESS;
+    } else if (evaluationMode === "system") {
+      weightedScore = analysis.intentAlignment.score * SCORING_WEIGHTS.SYSTEM.INTENT_ALIGNMENT + analysis.requirementsFulfillment.overallScore * SCORING_WEIGHTS.SYSTEM.REQUIREMENTS_FULFILLMENT + analysis.completeness.score * SCORING_WEIGHTS.SYSTEM.COMPLETENESS + analysis.responseAppropriateness.score * SCORING_WEIGHTS.SYSTEM.RESPONSE_APPROPRIATENESS;
+    } else {
+      const userScore = analysis.intentAlignment.score * SCORING_WEIGHTS.USER.INTENT_ALIGNMENT + analysis.requirementsFulfillment.overallScore * SCORING_WEIGHTS.USER.REQUIREMENTS_FULFILLMENT + analysis.completeness.score * SCORING_WEIGHTS.USER.COMPLETENESS + analysis.responseAppropriateness.score * SCORING_WEIGHTS.USER.RESPONSE_APPROPRIATENESS;
+      const systemScore = userScore;
+      weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
+    }
+    const finalScore = weightedScore * scale;
+    return chunkEKSPLMYP_cjs.roundToTwoDecimals(finalScore);
+  }).generateReason({
+    description: "Generate human-readable explanation of prompt alignment evaluation",
+    createPrompt: ({ run, results, score }) => {
+      const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const analysis = results.analyzeStepResult;
+      if (!analysis) {
+        return `Unable to analyze prompt alignment. Score: ${score}`;
+      }
+      return createReasonPrompt5({
+        userPrompt,
+        systemPrompt,
+        score,
+        scale,
+        analysis,
+        evaluationMode
+      });
+    }
+  });
+}
+
exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
exports.createBiasScorer = createBiasScorer;
+exports.createContextPrecisionScorer = createContextPrecisionScorer;
+exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
exports.createFaithfulnessScorer = createFaithfulnessScorer;
exports.createHallucinationScorer = createHallucinationScorer;
+exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
+exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
exports.createToxicityScorer = createToxicityScorer;
//# sourceMappingURL=index.cjs.map
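
Usage note (editorial, not part of the diff): the newly exported factory could be constructed roughly as sketched below. Only the factory name and its { model, options } shape come from the code above; the subpath import and the @ai-sdk/openai model helper are assumptions and may differ in practice.

// Hypothetical usage sketch for the new prompt-alignment scorer.
const { openai } = require("@ai-sdk/openai"); // assumed judge-model provider
const { createPromptAlignmentScorerLLM } = require("@mastra/evals/scorers/llm"); // assumed subpath export

const promptAlignmentScorer = createPromptAlignmentScorerLLM({
  model: openai("gpt-4o"), // any language model accepted as the judge
  options: { evaluationMode: "user", scale: 1 } // per the code above, defaults are mode "both" and scale 1
});

The same { model, options } shape applies to the other scorers added in this release (context precision, context relevance, noise sensitivity), per their new exports above.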