@mastra/evals 0.13.2 → 0.13.3-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +2333 -0
- package/dist/{chunk-5CVZXIFW.js → chunk-4LRZVFXR.js} +32 -3
- package/dist/chunk-4LRZVFXR.js.map +1 -0
- package/dist/{chunk-QVZBKGOE.cjs → chunk-EKSPLMYP.cjs} +32 -2
- package/dist/chunk-EKSPLMYP.cjs.map +1 -0
- package/dist/{dist-JVIEAZJ6.js → dist-CI72CYZJ.js} +10 -10
- package/dist/{dist-JVIEAZJ6.js.map → dist-CI72CYZJ.js.map} +1 -1
- package/dist/{dist-JQCAD3AD.cjs → dist-IKJJ2AX4.cjs} +10 -10
- package/dist/{dist-JQCAD3AD.cjs.map → dist-IKJJ2AX4.cjs.map} +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/{magic-string.es-NBXOXRCK.cjs → magic-string.es-VZN2EYER.cjs} +3 -3
- package/dist/{magic-string.es-NBXOXRCK.cjs.map → magic-string.es-VZN2EYER.cjs.map} +1 -1
- package/dist/{magic-string.es-6JSI7KY4.js → magic-string.es-WQRLTQPQ.js} +3 -3
- package/dist/{magic-string.es-6JSI7KY4.js.map → magic-string.es-WQRLTQPQ.js.map} +1 -1
- package/dist/scorers/code/index.cjs +2 -2
- package/dist/scorers/code/index.js +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +18 -0
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -0
- package/dist/scorers/llm/context-precision/prompts.d.ts +19 -0
- package/dist/scorers/llm/context-precision/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/context-relevance/index.d.ts +27 -0
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -0
- package/dist/scorers/llm/context-relevance/prompts.d.ts +20 -0
- package/dist/scorers/llm/context-relevance/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +1163 -25
- package/dist/scorers/llm/index.cjs.map +1 -1
- package/dist/scorers/llm/index.d.ts +4 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/index.js +1137 -3
- package/dist/scorers/llm/index.js.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +36 -0
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -0
- package/dist/scorers/llm/noise-sensitivity/prompts.d.ts +21 -0
- package/dist/scorers/llm/noise-sensitivity/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/prompt-alignment/index.d.ts +38 -0
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -0
- package/dist/scorers/llm/prompt-alignment/prompts.d.ts +44 -0
- package/dist/scorers/llm/prompt-alignment/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -4
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/utils.d.ts +2 -0
- package/dist/scorers/utils.d.ts.map +1 -1
- package/package.json +15 -5
- package/dist/chunk-5CVZXIFW.js.map +0 -1
- package/dist/chunk-QVZBKGOE.cjs.map +0 -1
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
|
|
2
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls } from '../../chunk-
|
|
2
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-4LRZVFXR.js';
|
|
3
3
|
import { createScorer } from '@mastra/core/scores';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
|
|
@@ -1097,7 +1097,7 @@ var analyzeOutputSchema = z.object({
|
|
|
1097
1097
|
missingTools: z.array(z.string()).optional()
|
|
1098
1098
|
});
|
|
1099
1099
|
function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
1100
|
-
const toolDefinitions = availableTools.map((tool) => `${tool.
|
|
1100
|
+
const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
|
|
1101
1101
|
return createScorer({
|
|
1102
1102
|
name: "Tool Call Accuracy (LLM)",
|
|
1103
1103
|
description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
|
|
@@ -1156,6 +1156,1140 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
|
1156
1156
|
});
|
|
1157
1157
|
}
|
|
1158
1158
|
|
|
1159
|
-
|
|
1159
|
+
// src/scorers/llm/context-relevance/prompts.ts
|
|
1160
|
+
var CONTEXT_RELEVANCE_INSTRUCTIONS = `You are an expert context relevance evaluator. Your job is to analyze whether the provided context information was appropriate and useful for generating the agent's response to the user's query.
|
|
1161
|
+
|
|
1162
|
+
Key Evaluation Criteria:
|
|
1163
|
+
1. **Relevance**: Does the context directly relate to the user's query?
|
|
1164
|
+
2. **Utility**: Did the context help produce a better response?
|
|
1165
|
+
3. **Completeness**: Was the context sufficient for the task?
|
|
1166
|
+
4. **Quality**: Is the context accurate and trustworthy?
|
|
1167
|
+
|
|
1168
|
+
Evaluation Guidelines:
|
|
1169
|
+
- Context that directly answers or supports the user's query should be marked as highly relevant
|
|
1170
|
+
- Context that provides background information relevant to the query should be considered moderately relevant
|
|
1171
|
+
- Context that is tangentially related but doesn't directly help should be marked as low relevance
|
|
1172
|
+
- Context that is completely unrelated should be marked as irrelevant
|
|
1173
|
+
- Consider whether missing context might have led to a better response
|
|
1174
|
+
|
|
1175
|
+
Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
|
|
1176
|
+
function createAnalyzePrompt2({
|
|
1177
|
+
userQuery,
|
|
1178
|
+
agentResponse,
|
|
1179
|
+
providedContext
|
|
1180
|
+
}) {
|
|
1181
|
+
const contextList = providedContext.map((ctx, index) => `[${index}] ${ctx}`).join("\n");
|
|
1182
|
+
return `Analyze the relevance of the provided context for answering the user's query and generating the agent's response.
|
|
1183
|
+
|
|
1184
|
+
User Query:
|
|
1185
|
+
${userQuery}
|
|
1186
|
+
|
|
1187
|
+
Agent Response:
|
|
1188
|
+
${agentResponse}
|
|
1189
|
+
|
|
1190
|
+
Context pieces to evaluate:
|
|
1191
|
+
${contextList}
|
|
1192
|
+
|
|
1193
|
+
For each context piece, evaluate:
|
|
1194
|
+
1. **Relevance Level**: How relevant is it to the user's query?
|
|
1195
|
+
- "high": Directly addresses the query or provides essential information
|
|
1196
|
+
- "medium": Provides supporting or background information that's helpful
|
|
1197
|
+
- "low": Tangentially related but not very helpful
|
|
1198
|
+
- "none": Completely irrelevant or unrelated
|
|
1199
|
+
|
|
1200
|
+
2. **Usage**: Was this context actually used in generating the agent's response?
|
|
1201
|
+
- true: The response clearly incorporates or reflects this information
|
|
1202
|
+
- false: This information doesn't appear to be used in the response
|
|
1203
|
+
|
|
1204
|
+
3. **Reasoning**: Explain your assessment in detail
|
|
1205
|
+
|
|
1206
|
+
Also identify any missing context that should have been provided to better answer the query.
|
|
1207
|
+
|
|
1208
|
+
Format your response as:
|
|
1209
|
+
{
|
|
1210
|
+
"evaluations": [
|
|
1211
|
+
{
|
|
1212
|
+
"context_index": 0,
|
|
1213
|
+
"contextPiece": "the actual text of the context piece",
|
|
1214
|
+
"relevanceLevel": "high/medium/low/none",
|
|
1215
|
+
"wasUsed": true/false,
|
|
1216
|
+
"reasoning": "detailed explanation of the evaluation"
|
|
1217
|
+
}
|
|
1218
|
+
],
|
|
1219
|
+
"missingContext": ["list of missing information that would have been helpful"],
|
|
1220
|
+
"overallAssessment": "summary of the context quality and usage"
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
The number of evaluations MUST match the number of context pieces exactly.
|
|
1224
|
+
|
|
1225
|
+
Example:
|
|
1226
|
+
User Query: "What are the benefits of exercise?"
|
|
1227
|
+
Agent Response: "Regular exercise improves cardiovascular health and mental wellbeing."
|
|
1228
|
+
Context:
|
|
1229
|
+
[0] "Exercise strengthens the heart and improves blood circulation."
|
|
1230
|
+
[1] "A balanced diet is important for overall health."
|
|
1231
|
+
[2] "Regular physical activity reduces stress and anxiety levels."
|
|
1232
|
+
|
|
1233
|
+
{
|
|
1234
|
+
"evaluations": [
|
|
1235
|
+
{
|
|
1236
|
+
"context_index": 0,
|
|
1237
|
+
"contextPiece": "Exercise strengthens the heart and improves blood circulation.",
|
|
1238
|
+
"relevanceLevel": "high",
|
|
1239
|
+
"wasUsed": true,
|
|
1240
|
+
"reasoning": "This context directly supports the cardiovascular health benefit mentioned in the response"
|
|
1241
|
+
},
|
|
1242
|
+
{
|
|
1243
|
+
"context_index": 1,
|
|
1244
|
+
"contextPiece": "A balanced diet is important for overall health.",
|
|
1245
|
+
"relevanceLevel": "none",
|
|
1246
|
+
"wasUsed": false,
|
|
1247
|
+
"reasoning": "This context is about diet, not exercise benefits, and doesn't contribute to answering the query"
|
|
1248
|
+
},
|
|
1249
|
+
{
|
|
1250
|
+
"context_index": 2,
|
|
1251
|
+
"contextPiece": "Regular physical activity reduces stress and anxiety levels.",
|
|
1252
|
+
"relevanceLevel": "high",
|
|
1253
|
+
"wasUsed": true,
|
|
1254
|
+
"reasoning": "This context directly supports the mental wellbeing benefit mentioned in the response"
|
|
1255
|
+
}
|
|
1256
|
+
],
|
|
1257
|
+
"missingContext": [],
|
|
1258
|
+
"overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
|
|
1259
|
+
}`;
|
|
1260
|
+
}
|
|
1261
|
+
function createReasonPrompt3({
|
|
1262
|
+
userQuery,
|
|
1263
|
+
score,
|
|
1264
|
+
evaluations,
|
|
1265
|
+
missingContext,
|
|
1266
|
+
scale
|
|
1267
|
+
}) {
|
|
1268
|
+
return `Explain the context relevance score for the provided context based on its relevance and usage in generating the agent's response.
|
|
1269
|
+
|
|
1270
|
+
User Query:
|
|
1271
|
+
${userQuery}
|
|
1272
|
+
|
|
1273
|
+
Score: ${score} out of ${scale}
|
|
1274
|
+
|
|
1275
|
+
Context Evaluations:
|
|
1276
|
+
${evaluations.map(
|
|
1277
|
+
(evaluation) => `[${evaluation.context_index}] Relevance: ${evaluation.relevanceLevel}, Used: ${evaluation.wasUsed ? "Yes" : "No"}
|
|
1278
|
+
Context: "${evaluation.contextPiece}"
|
|
1279
|
+
Reasoning: ${evaluation.reasoning}`
|
|
1280
|
+
).join("\n\n")}
|
|
1281
|
+
|
|
1282
|
+
${missingContext.length > 0 ? `
|
|
1283
|
+
Missing Context Issues:
|
|
1284
|
+
${missingContext.map((item) => `- ${item}`).join("\n")}` : ""}
|
|
1285
|
+
|
|
1286
|
+
Context Relevance measures how well the provided context supports answering the user's query and generating the expected response. The score considers:
|
|
1287
|
+
- Relevance levels (high=1.0, medium=0.7, low=0.3, none=0.0)
|
|
1288
|
+
- Usage penalties (10% penalty per unused high-relevance context)
|
|
1289
|
+
- Missing context penalties (up to 50% penalty for identified gaps)
|
|
1290
|
+
|
|
1291
|
+
Rules for explanation:
|
|
1292
|
+
- Explain the score based on context relevance levels and usage
|
|
1293
|
+
- Mention any penalties applied for unused relevant context or missing information
|
|
1294
|
+
- Keep explanation concise and actionable for improving context selection
|
|
1295
|
+
- Use the given score, don't recalculate
|
|
1296
|
+
|
|
1297
|
+
Format:
|
|
1298
|
+
"The score is ${score} because {explanation of context relevance, usage, and any penalties}"
|
|
1299
|
+
|
|
1300
|
+
Example responses:
|
|
1301
|
+
"The score is 0.85 because 2 out of 3 context pieces are highly relevant and used in the response, with only minor penalty for one unused medium-relevance context piece."
|
|
1302
|
+
"The score is 1.0 because all context pieces are highly relevant to the query about exercise benefits and were effectively used in generating the comprehensive response."
|
|
1303
|
+
"The score is 0.40 because while some context is relevant, key information about the topic was missing and one highly relevant context piece was not utilized in the response."`;
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1306
|
+
// src/scorers/llm/context-relevance/index.ts
|
|
1307
|
+
var analyzeOutputSchema2 = z.object({
|
|
1308
|
+
evaluations: z.array(
|
|
1309
|
+
z.object({
|
|
1310
|
+
context_index: z.number(),
|
|
1311
|
+
contextPiece: z.string(),
|
|
1312
|
+
relevanceLevel: z.enum(["high", "medium", "low", "none"]),
|
|
1313
|
+
wasUsed: z.boolean(),
|
|
1314
|
+
reasoning: z.string()
|
|
1315
|
+
})
|
|
1316
|
+
),
|
|
1317
|
+
missingContext: z.array(z.string()).optional().default([]),
|
|
1318
|
+
overallAssessment: z.string()
|
|
1319
|
+
});
|
|
1320
|
+
var DEFAULT_PENALTIES = {
|
|
1321
|
+
UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
|
|
1322
|
+
// 10% penalty per unused high-relevance context
|
|
1323
|
+
MISSING_CONTEXT_PER_ITEM: 0.15,
|
|
1324
|
+
// 15% penalty per missing context item
|
|
1325
|
+
MAX_MISSING_CONTEXT_PENALTY: 0.5
|
|
1326
|
+
// Maximum 50% penalty for missing context
|
|
1327
|
+
};
|
|
1328
|
+
function createContextRelevanceScorerLLM({
|
|
1329
|
+
model,
|
|
1330
|
+
options
|
|
1331
|
+
}) {
|
|
1332
|
+
if (!options.context && !options.contextExtractor) {
|
|
1333
|
+
throw new Error("Either context or contextExtractor is required for Context Relevance scoring");
|
|
1334
|
+
}
|
|
1335
|
+
if (options.context && options.context.length === 0) {
|
|
1336
|
+
throw new Error("Context array cannot be empty if provided");
|
|
1337
|
+
}
|
|
1338
|
+
return createScorer({
|
|
1339
|
+
name: "Context Relevance (LLM)",
|
|
1340
|
+
description: "Evaluates how relevant and useful the provided context was for generating the agent response",
|
|
1341
|
+
judge: {
|
|
1342
|
+
model,
|
|
1343
|
+
instructions: CONTEXT_RELEVANCE_INSTRUCTIONS
|
|
1344
|
+
}
|
|
1345
|
+
}).analyze({
|
|
1346
|
+
description: "Analyze the relevance and utility of provided context",
|
|
1347
|
+
outputSchema: analyzeOutputSchema2,
|
|
1348
|
+
createPrompt: ({ run }) => {
|
|
1349
|
+
const userQuery = getUserMessageFromRunInput(run.input) ?? "";
|
|
1350
|
+
const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1351
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1352
|
+
if (context.length === 0) {
|
|
1353
|
+
return createAnalyzePrompt2({
|
|
1354
|
+
userQuery,
|
|
1355
|
+
agentResponse,
|
|
1356
|
+
providedContext: ["[No context was provided for evaluation]"]
|
|
1357
|
+
});
|
|
1358
|
+
}
|
|
1359
|
+
return createAnalyzePrompt2({
|
|
1360
|
+
userQuery,
|
|
1361
|
+
agentResponse,
|
|
1362
|
+
providedContext: context
|
|
1363
|
+
});
|
|
1364
|
+
}
|
|
1365
|
+
}).generateScore(({ results, run }) => {
|
|
1366
|
+
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1367
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1368
|
+
if (context.length === 0) {
|
|
1369
|
+
return 1 * (options.scale || 1);
|
|
1370
|
+
}
|
|
1371
|
+
if (evaluations.length === 0) {
|
|
1372
|
+
const missingContext2 = results.analyzeStepResult?.missingContext || [];
|
|
1373
|
+
return missingContext2.length > 0 ? 0 : 1;
|
|
1374
|
+
}
|
|
1375
|
+
const relevanceWeights = {
|
|
1376
|
+
high: 1,
|
|
1377
|
+
medium: 0.7,
|
|
1378
|
+
low: 0.3,
|
|
1379
|
+
none: 0
|
|
1380
|
+
};
|
|
1381
|
+
const totalWeight = evaluations.reduce((sum, evaluation) => {
|
|
1382
|
+
return sum + relevanceWeights[evaluation.relevanceLevel];
|
|
1383
|
+
}, 0);
|
|
1384
|
+
const maxPossibleWeight = evaluations.length * relevanceWeights.high;
|
|
1385
|
+
const relevanceScore = maxPossibleWeight > 0 ? totalWeight / maxPossibleWeight : 0;
|
|
1386
|
+
const highRelevanceUnused = evaluations.filter(
|
|
1387
|
+
(evaluation) => evaluation.relevanceLevel === "high" && !evaluation.wasUsed
|
|
1388
|
+
).length;
|
|
1389
|
+
const penalties = options.penalties || {};
|
|
1390
|
+
const unusedPenaltyRate = penalties.unusedHighRelevanceContext ?? DEFAULT_PENALTIES.UNUSED_HIGH_RELEVANCE_CONTEXT;
|
|
1391
|
+
const missingPenaltyRate = penalties.missingContextPerItem ?? DEFAULT_PENALTIES.MISSING_CONTEXT_PER_ITEM;
|
|
1392
|
+
const maxMissingPenalty = penalties.maxMissingContextPenalty ?? DEFAULT_PENALTIES.MAX_MISSING_CONTEXT_PENALTY;
|
|
1393
|
+
const usagePenalty = highRelevanceUnused * unusedPenaltyRate;
|
|
1394
|
+
const missingContext = results.analyzeStepResult?.missingContext || [];
|
|
1395
|
+
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
|
|
1396
|
+
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
|
|
1397
|
+
const scaledScore = finalScore * (options.scale || 1);
|
|
1398
|
+
return roundToTwoDecimals$1(scaledScore);
|
|
1399
|
+
}).generateReason({
|
|
1400
|
+
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1401
|
+
createPrompt: ({ run, results, score }) => {
|
|
1402
|
+
const userQuery = getUserMessageFromRunInput(run.input) ?? "";
|
|
1403
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1404
|
+
if (context.length === 0) {
|
|
1405
|
+
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
1406
|
+
}
|
|
1407
|
+
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1408
|
+
const missingContext = results.analyzeStepResult?.missingContext || [];
|
|
1409
|
+
return createReasonPrompt3({
|
|
1410
|
+
userQuery,
|
|
1411
|
+
score,
|
|
1412
|
+
evaluations,
|
|
1413
|
+
missingContext,
|
|
1414
|
+
scale: options.scale || 1
|
|
1415
|
+
});
|
|
1416
|
+
}
|
|
1417
|
+
});
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
// src/scorers/llm/context-precision/prompts.ts
|
|
1421
|
+
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = `You are a precise context precision evaluator. Your job is to determine if context nodes are relevant for generating the expected output based on the input query.
|
|
1422
|
+
|
|
1423
|
+
Key Principles:
|
|
1424
|
+
1. Evaluate each context piece independently for relevance to the input-output pair
|
|
1425
|
+
2. Consider relevance as the ability of the context to contribute to generating the expected output
|
|
1426
|
+
3. Mark context as relevant only if it directly supports or informs the expected output
|
|
1427
|
+
4. Consider the input query when determining relevance
|
|
1428
|
+
5. Focus on practical utility for output generation, not just topical similarity
|
|
1429
|
+
6. Be strict in your evaluation - context must be clearly useful for generating the output
|
|
1430
|
+
7. Context that provides background but doesn't directly contribute should be marked as not relevant`;
|
|
1431
|
+
function createContextRelevancePrompt({
|
|
1432
|
+
input,
|
|
1433
|
+
output,
|
|
1434
|
+
context
|
|
1435
|
+
}) {
|
|
1436
|
+
return `Evaluate the relevance of each context piece for generating the expected output given the input query.
|
|
1437
|
+
|
|
1438
|
+
Input Query:
|
|
1439
|
+
${input}
|
|
1440
|
+
|
|
1441
|
+
Expected Output:
|
|
1442
|
+
${output}
|
|
1443
|
+
|
|
1444
|
+
Context pieces to evaluate:
|
|
1445
|
+
${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
|
|
1446
|
+
|
|
1447
|
+
For each context piece, determine if it is relevant for generating the expected output. A context piece is relevant if:
|
|
1448
|
+
- It provides information that directly supports or informs the expected output
|
|
1449
|
+
- It contains facts, data, or details that are needed to answer the input query
|
|
1450
|
+
- It contributes to the accuracy or completeness of the expected output
|
|
1451
|
+
|
|
1452
|
+
Mark as "yes" only if the context piece is clearly useful for generating the output.
|
|
1453
|
+
Mark as "no" if the context piece does not contribute to generating the expected output.
|
|
1454
|
+
|
|
1455
|
+
Format your response as:
|
|
1456
|
+
{
|
|
1457
|
+
"verdicts": [
|
|
1458
|
+
{
|
|
1459
|
+
"context_index": 0,
|
|
1460
|
+
"verdict": "yes/no",
|
|
1461
|
+
"reason": "explanation of why this context is or isn't relevant"
|
|
1462
|
+
}
|
|
1463
|
+
]
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1466
|
+
The number of verdicts MUST match the number of context pieces exactly.
|
|
1467
|
+
|
|
1468
|
+
Example:
|
|
1469
|
+
Input: "What are the benefits of exercise?"
|
|
1470
|
+
Output: "Regular exercise improves cardiovascular health and mental wellbeing."
|
|
1471
|
+
Context:
|
|
1472
|
+
[0] "Exercise strengthens the heart and improves blood circulation."
|
|
1473
|
+
[1] "A balanced diet is important for health."
|
|
1474
|
+
[2] "Regular physical activity reduces stress and anxiety."
|
|
1475
|
+
|
|
1476
|
+
{
|
|
1477
|
+
"verdicts": [
|
|
1478
|
+
{
|
|
1479
|
+
"context_index": 0,
|
|
1480
|
+
"verdict": "yes",
|
|
1481
|
+
"reason": "This context directly supports the cardiovascular health benefit mentioned in the output"
|
|
1482
|
+
},
|
|
1483
|
+
{
|
|
1484
|
+
"context_index": 1,
|
|
1485
|
+
"verdict": "no",
|
|
1486
|
+
"reason": "This context is about diet, not exercise benefits, and doesn't contribute to the expected output"
|
|
1487
|
+
},
|
|
1488
|
+
{
|
|
1489
|
+
"context_index": 2,
|
|
1490
|
+
"verdict": "yes",
|
|
1491
|
+
"reason": "This context directly supports the mental wellbeing benefit mentioned in the output"
|
|
1492
|
+
}
|
|
1493
|
+
]
|
|
1494
|
+
}`;
|
|
1495
|
+
}
|
|
1496
|
+
function createContextPrecisionReasonPrompt({
|
|
1497
|
+
input,
|
|
1498
|
+
output,
|
|
1499
|
+
context,
|
|
1500
|
+
score,
|
|
1501
|
+
scale,
|
|
1502
|
+
verdicts
|
|
1503
|
+
}) {
|
|
1504
|
+
return `Explain the context precision score for the retrieved context based on its relevance to generating the expected output.
|
|
1505
|
+
|
|
1506
|
+
Input Query:
|
|
1507
|
+
${input}
|
|
1508
|
+
|
|
1509
|
+
Expected Output:
|
|
1510
|
+
${output}
|
|
1511
|
+
|
|
1512
|
+
Context pieces:
|
|
1513
|
+
${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
|
|
1514
|
+
|
|
1515
|
+
Score: ${score} out of ${scale}
|
|
1516
|
+
Verdicts:
|
|
1517
|
+
${JSON.stringify(verdicts, null, 2)}
|
|
1518
|
+
|
|
1519
|
+
Context Precision measures how relevant and precise the retrieved context nodes are for generating the expected output. The score is calculated using Mean Average Precision (MAP) which:
|
|
1520
|
+
- Gives binary relevance scores (1 for relevant, 0 for irrelevant)
|
|
1521
|
+
- Weights earlier positions more heavily in the scoring
|
|
1522
|
+
- Rewards having relevant context early in the sequence
|
|
1523
|
+
|
|
1524
|
+
Rules for explanation:
|
|
1525
|
+
- Explain the score based on which context pieces were relevant and their positions
|
|
1526
|
+
- Mention how the positioning affects the MAP score
|
|
1527
|
+
- Keep explanation concise and focused on context quality
|
|
1528
|
+
- Use the given score, don't recalculate
|
|
1529
|
+
- Focus on how well the context supports generating the expected output
|
|
1530
|
+
|
|
1531
|
+
Format:
|
|
1532
|
+
"The score is ${score} because {explanation of context precision and positioning}"
|
|
1533
|
+
|
|
1534
|
+
Example responses:
|
|
1535
|
+
"The score is 0.75 because the first and third contexts are highly relevant to the benefits mentioned in the output, while the second and fourth contexts are not directly related to exercise benefits. The relevant contexts are well-positioned at the beginning and middle of the sequence."
|
|
1536
|
+
"The score is 1.0 because all context pieces are relevant for generating the expected output and are optimally ordered."
|
|
1537
|
+
"The score is 0.33 because only the first context piece is relevant to the query, and the remaining contexts don't contribute to generating the expected output about exercise benefits."`;
|
|
1538
|
+
}
|
|
1539
|
+
|
|
1540
|
+
// src/scorers/llm/context-precision/index.ts
|
|
1541
|
+
var contextRelevanceOutputSchema = z.object({
|
|
1542
|
+
verdicts: z.array(
|
|
1543
|
+
z.object({
|
|
1544
|
+
context_index: z.number(),
|
|
1545
|
+
verdict: z.string(),
|
|
1546
|
+
reason: z.string()
|
|
1547
|
+
})
|
|
1548
|
+
)
|
|
1549
|
+
});
|
|
1550
|
+
function createContextPrecisionScorer({
|
|
1551
|
+
model,
|
|
1552
|
+
options
|
|
1553
|
+
}) {
|
|
1554
|
+
if (!options.context && !options.contextExtractor) {
|
|
1555
|
+
throw new Error("Either context or contextExtractor is required for Context Precision scoring");
|
|
1556
|
+
}
|
|
1557
|
+
if (options.context && options.context.length === 0) {
|
|
1558
|
+
throw new Error("Context array cannot be empty if provided");
|
|
1559
|
+
}
|
|
1560
|
+
return createScorer({
|
|
1561
|
+
name: "Context Precision Scorer",
|
|
1562
|
+
description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
|
|
1563
|
+
judge: {
|
|
1564
|
+
model,
|
|
1565
|
+
instructions: CONTEXT_PRECISION_AGENT_INSTRUCTIONS
|
|
1566
|
+
}
|
|
1567
|
+
}).analyze({
|
|
1568
|
+
description: "Evaluate the relevance of each context piece for generating the expected output",
|
|
1569
|
+
outputSchema: contextRelevanceOutputSchema,
|
|
1570
|
+
createPrompt: ({ run }) => {
|
|
1571
|
+
const input = getUserMessageFromRunInput(run.input) ?? "";
|
|
1572
|
+
const output = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1573
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1574
|
+
if (context.length === 0) {
|
|
1575
|
+
throw new Error("No context available for evaluation");
|
|
1576
|
+
}
|
|
1577
|
+
return createContextRelevancePrompt({
|
|
1578
|
+
input,
|
|
1579
|
+
output,
|
|
1580
|
+
context
|
|
1581
|
+
});
|
|
1582
|
+
}
|
|
1583
|
+
}).generateScore(({ results }) => {
|
|
1584
|
+
if (!results.analyzeStepResult || results.analyzeStepResult.verdicts.length === 0) {
|
|
1585
|
+
return 0;
|
|
1586
|
+
}
|
|
1587
|
+
const verdicts = results.analyzeStepResult.verdicts;
|
|
1588
|
+
const sortedVerdicts = verdicts.sort((a, b) => a.context_index - b.context_index);
|
|
1589
|
+
let sumPrecision = 0;
|
|
1590
|
+
let relevantCount = 0;
|
|
1591
|
+
for (let i = 0; i < sortedVerdicts.length; i++) {
|
|
1592
|
+
const targetVerdict = sortedVerdicts[i];
|
|
1593
|
+
const isRelevant = targetVerdict?.verdict?.toLowerCase().trim() === "yes";
|
|
1594
|
+
if (isRelevant) {
|
|
1595
|
+
relevantCount++;
|
|
1596
|
+
const precisionAtI = relevantCount / (i + 1);
|
|
1597
|
+
sumPrecision += precisionAtI;
|
|
1598
|
+
}
|
|
1599
|
+
}
|
|
1600
|
+
if (relevantCount === 0) {
|
|
1601
|
+
return 0;
|
|
1602
|
+
}
|
|
1603
|
+
const map = sumPrecision / relevantCount;
|
|
1604
|
+
const score = map * (options.scale || 1);
|
|
1605
|
+
return roundToTwoDecimals$1(score);
|
|
1606
|
+
}).generateReason({
|
|
1607
|
+
description: "Reason about the context precision results",
|
|
1608
|
+
createPrompt: ({ run, results, score }) => {
|
|
1609
|
+
const input = getUserMessageFromRunInput(run.input) ?? "";
|
|
1610
|
+
const output = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1611
|
+
const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
|
|
1612
|
+
return createContextPrecisionReasonPrompt({
|
|
1613
|
+
input,
|
|
1614
|
+
output,
|
|
1615
|
+
context,
|
|
1616
|
+
score,
|
|
1617
|
+
scale: options.scale || 1,
|
|
1618
|
+
verdicts: results.analyzeStepResult?.verdicts || []
|
|
1619
|
+
});
|
|
1620
|
+
}
|
|
1621
|
+
});
|
|
1622
|
+
}
|
|
1623
|
+
|
|
1624
|
+
// src/scorers/llm/noise-sensitivity/prompts.ts
|
|
1625
|
+
var NOISE_SENSITIVITY_INSTRUCTIONS = `You are an expert noise sensitivity evaluator. Your job is to analyze how much irrelevant, distracting, or misleading information (noise) affected the agent's response quality and accuracy.
|
|
1626
|
+
|
|
1627
|
+
Key Evaluation Criteria:
|
|
1628
|
+
1. **Response Consistency**: How similar are the baseline and noisy responses in content and correctness?
|
|
1629
|
+
2. **Information Integrity**: Did the agent maintain accuracy despite noise, or was it misled?
|
|
1630
|
+
3. **Focus Preservation**: Did the agent stay on topic or get distracted by irrelevant information?
|
|
1631
|
+
4. **Hallucination Resistance**: Did noise cause the agent to generate false or fabricated information?
|
|
1632
|
+
5. **Completeness**: Did noise cause the agent to miss important parts of the original query?
|
|
1633
|
+
|
|
1634
|
+
Noise Impact Assessment:
|
|
1635
|
+
- **No Impact (1.0)**: Response is virtually identical in quality, accuracy, and completeness
|
|
1636
|
+
- **Minimal Impact (0.8-0.9)**: Slight changes in phrasing but maintains correctness and completeness
|
|
1637
|
+
- **Moderate Impact (0.5-0.7)**: Noticeable changes that affect quality but core information remains correct
|
|
1638
|
+
- **Significant Impact (0.2-0.4)**: Major degradation in quality, accuracy, or completeness
|
|
1639
|
+
- **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed
|
|
1640
|
+
|
|
1641
|
+
Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
|
|
1642
|
+
/**
 * Builds the analysis prompt for the noise-sensitivity judge.
 *
 * The prompt shows the clean query/response pair next to the noisy pair, asks
 * for a per-dimension impact assessment, and spells out the JSON shape the
 * judge must return, followed by a worked example.
 *
 * @param {object} params
 * @param {string} params.userQuery - Original (clean) user query.
 * @param {string} params.baselineResponse - Agent response to the clean query.
 * @param {string} params.noisyQuery - Query with distractions added.
 * @param {string} params.noisyResponse - Agent response to the noisy query.
 * @param {string} [params.noiseType] - Optional label for the injected noise; the line is left empty when falsy.
 * @returns {string} The complete prompt text.
 */
function createAnalyzePrompt3({
  userQuery,
  baselineResponse,
  noisyQuery,
  noisyResponse,
  noiseType
}) {
  // Template interpolation is kept for every value so null/undefined render
  // exactly as they would inside a single template literal.
  const inputsSection = [
    "Analyze how the added noise affected the agent's response quality and accuracy.",
    "",
    "Original User Query:",
    `${userQuery}`,
    "",
    "Baseline Agent Response (clean input):",
    `${baselineResponse}`,
    "",
    "Noisy User Query (with added distractions):",
    `${noisyQuery}`,
    "",
    "Noisy Agent Response:",
    `${noisyResponse}`,
    "",
    noiseType ? `Type of noise added: ${noiseType}` : ""
  ].join("\n");

  // Every dimension uses the same placeholder entry in the response format.
  const dimensionNames = ["content_accuracy", "completeness", "relevance", "consistency", "hallucination_resistance"];
  const formatEntries = dimensionNames.map((dimensionName) => `{
"dimension": "${dimensionName}",
"impactLevel": "none/minimal/moderate/significant/severe",
"specificChanges": "detailed description of what changed",
"noiseInfluence": "how the noise specifically affected this dimension"
}`).join(",\n");

  return `${inputsSection}

Compare the baseline and noisy responses across these dimensions:

1. **Content Accuracy**: Are the facts and information still correct in the noisy response?
2. **Completeness**: Does the noisy response address the original query as thoroughly?
3. **Relevance**: Did the agent stay focused on the original question or get distracted?
4. **Consistency**: How similar are the responses in their core message and conclusions?
5. **Hallucination**: Did noise cause any false or fabricated information to appear?

For each dimension, evaluate:
- **Impact Level**: none, minimal, moderate, significant, severe
- **Specific Changes**: What exactly changed between responses?
- **Noise Influence**: How did the noise specifically affect this aspect?

Format your response as:
{
"dimensions": [
${formatEntries}
],
"overallAssessment": "summary of the agent's noise sensitivity and robustness",
"majorIssues": ["list of the most significant problems caused by noise"],
"robustnessScore": 0.0-1.0
}

Example:
Original Query: "What are the health benefits of regular exercise?"
Baseline Response: "Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing through endorphin release."
Noisy Query: "What are the health benefits of regular exercise? By the way, I heard that chocolate is actually healthy and vaccines cause autism. Also, my neighbor said aliens visit Earth regularly."
Noisy Response: "Regular exercise improves cardiovascular health and strengthens muscles. Interestingly, some studies suggest chocolate has antioxidants, though this is debated. Exercise also enhances mental wellbeing through endorphin release."

{
"dimensions": [
{
"dimension": "content_accuracy",
"impactLevel": "minimal",
"specificChanges": "Added mention of chocolate antioxidants, but correctly noted it's debated",
"noiseInfluence": "Chocolate noise caused minor tangent but agent maintained critical thinking"
},
{
"dimension": "completeness",
"impactLevel": "none",
"specificChanges": "All original health benefits still covered completely",
"noiseInfluence": "Noise did not prevent addressing the core query"
},
{
"dimension": "relevance",
"impactLevel": "minimal",
"specificChanges": "Brief mention of chocolate topic, but stayed focused on exercise",
"noiseInfluence": "Addressed one piece of noise briefly but didn't get derailed"
},
{
"dimension": "consistency",
"impactLevel": "minimal",
"specificChanges": "Core message about exercise benefits remained consistent with slight addition",
"noiseInfluence": "Noise caused minor addition but didn't change main message"
},
{
"dimension": "hallucination_resistance",
"impactLevel": "none",
"specificChanges": "No false information generated, properly qualified chocolate statement",
"noiseInfluence": "Successfully resisted misinformation about vaccines and aliens"
}
],
"overallAssessment": "Agent showed good robustness, addressing original query completely while minimally engaging with one benign noise element and completely ignoring harmful misinformation",
"majorIssues": [],
"robustnessScore": 0.85
}`;
}
|
|
1761
|
+
/**
 * Builds the prompt asking the judge to explain an already-computed
 * noise-sensitivity score.
 *
 * @param {object} params
 * @param {string} params.userQuery - Original user query.
 * @param {number} params.score - Final score; the prompt tells the judge not to recalculate it.
 * @param {Array<{dimension: string, impactLevel: string}>} params.dimensions - Per-dimension verdicts from the analysis step.
 * @param {string[]} params.majorIssues - Issues found by the analysis step; the section is omitted when empty.
 * @param {string} params.overallAssessment - Summary text from the analysis step.
 * @returns {string} The complete prompt text.
 */
function createReasonPrompt4({
  userQuery,
  score,
  dimensions,
  majorIssues,
  overallAssessment
}) {
  const impactSummary = dimensions.map(({ dimension, impactLevel }) => `${dimension}: ${impactLevel} impact`).join(", ");

  // Optional section: begins with a newline so the heading sits on its own
  // line inside the surrounding blank lines of the template.
  let issuesSection = "";
  if (majorIssues.length > 0) {
    const bulletList = majorIssues.map((issue) => `- ${issue}`).join("\n");
    issuesSection = `
Major Issues Identified:
${bulletList}`;
  }

  return `Explain the noise sensitivity score based on how well the agent maintained response quality despite irrelevant or distracting information.

Original Query:
${userQuery}

Score: ${score} out of 1.0

Impact Assessment:
${impactSummary}

${issuesSection}

Overall Assessment:
${overallAssessment}

Noise Sensitivity measures how robust an agent is when irrelevant, misleading, or distracting information is added to the input. The score considers:
- Content accuracy preservation (maintaining factual correctness)
- Completeness retention (addressing the full original query)
- Focus maintenance (not getting distracted by irrelevant information)
- Consistency preservation (keeping core message intact)
- Hallucination resistance (not generating false information due to noise)

Scoring Guide:
- 0.9-1.0: Highly robust, virtually no impact from noise
- 0.7-0.8: Good robustness, minimal impact that doesn't affect correctness
- 0.5-0.6: Moderate sensitivity, noticeable quality degradation
- 0.3-0.4: High sensitivity, significant impact on accuracy or completeness
- 0.0-0.2: Very sensitive, severe degradation or derailment

Rules for explanation:
- Explain the score based on specific impacts observed across all dimensions
- Highlight the agent's strengths and weaknesses in handling noise
- Keep explanation actionable for improving noise robustness
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of robustness performance and specific noise impacts}"

Example responses:
"The score is 0.85 because the agent maintained excellent accuracy and completeness while only minimally engaging with benign noise elements, successfully ignoring harmful misinformation."
"The score is 1.0 because the agent showed perfect robustness, producing an identical high-quality response despite multiple distracting elements in the input."
"The score is 0.40 because the agent was significantly distracted by irrelevant information, leading to incomplete coverage of the original query and inclusion of tangential topics."`;
}
|
|
1814
|
+
|
|
1815
|
+
// src/scorers/llm/noise-sensitivity/index.ts
|
|
1816
|
+
/**
 * Zod schema for the structured output of the noise-sensitivity analysis step.
 * Built inside an IIFE so the sub-schemas can be named without adding
 * module-level identifiers.
 */
var analyzeOutputSchema3 = (() => {
  // The five impact labels; these are also the keys of DEFAULT_IMPACT_WEIGHTS.
  const impactLevel = z.enum(["none", "minimal", "moderate", "significant", "severe"]);

  // One verdict per evaluated dimension.
  const dimensionVerdict = z.object({
    dimension: z.string(),
    impactLevel,
    specificChanges: z.string(),
    noiseInfluence: z.string()
  });

  return z.object({
    dimensions: z.array(dimensionVerdict),
    overallAssessment: z.string(),
    // May be omitted by the judge; defaults to an empty list.
    majorIssues: z.array(z.string()).optional().default([]),
    // Judge's own overall score, constrained to [0, 1].
    robustnessScore: z.number().min(0).max(1)
  });
})();
|
|
1829
|
+
/**
 * Default numeric value assigned to each impact level when averaging the
 * per-dimension verdicts (higher means the noise had less impact).
 * Individual entries can be overridden through `options.scoring.impactWeights`.
 */
var DEFAULT_IMPACT_WEIGHTS = {
  none: 1,
  minimal: 0.85,
  moderate: 0.6,
  significant: 0.3,
  severe: 0.1
};
|
|
1836
|
+
/**
 * Default tuning constants for the noise-sensitivity score calculation.
 * Each can be overridden through `options.scoring`.
 */
var DEFAULT_SCORING = {
  // 10% penalty per major issue
  MAJOR_ISSUE_PENALTY_PER_ITEM: 0.1,
  // Maximum 30% penalty for major issues
  MAX_MAJOR_ISSUE_PENALTY: 0.3,
  // Threshold for choosing conservative score
  DISCREPANCY_THRESHOLD: 0.2
};
|
|
1844
|
+
/**
 * Creates the LLM-judged noise-sensitivity scorer.
 *
 * Pipeline: analyze (judge compares baseline and noisy responses) ->
 * generateScore (combine the judge's score with the per-dimension verdicts and
 * apply issue penalties) -> generateReason (judge explains the final score).
 *
 * @param {object} params
 * @param {*} params.model - Judge model, passed through to createScorer.
 * @param {object} params.options
 * @param {string} params.options.baselineResponse - Response to the clean query (required).
 * @param {string} params.options.noisyQuery - Query with noise added (required).
 * @param {string} [params.options.noiseType] - Optional label for the injected noise.
 * @param {object} [params.options.scoring] - Optional overrides: `impactWeights`,
 *   `discrepancyThreshold`, `penalties.majorIssuePerItem`, `penalties.maxMajorIssuePenalty`.
 * @returns The scorer produced by the createScorer(...).analyze(...).generateScore(...).generateReason(...) chain.
 * @throws {Error} When `baselineResponse` or `noisyQuery` is missing.
 */
function createNoiseSensitivityScorerLLM({
  model,
  options
}) {
  if (!(options.baselineResponse && options.noisyQuery)) {
    throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
  }

  const impactLevels = ["none", "minimal", "moderate", "significant", "severe"];

  // Step 1: prompt for the judge's structured analysis.
  const buildAnalyzePrompt = ({ run }) => {
    const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
    const noisyResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
    if (!(originalQuery && noisyResponse)) {
      throw new Error("Both original query and noisy response are required for evaluation");
    }
    return createAnalyzePrompt3({
      userQuery: originalQuery,
      baselineResponse: options.baselineResponse,
      noisyQuery: options.noisyQuery,
      noisyResponse,
      noiseType: options.noiseType
    });
  };

  // Step 2: turn the analysis into a score in [0, 1].
  const computeScore = ({ results }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis) {
      throw new Error("Analysis step failed to produce results");
    }

    // Resolve configuration, falling back to defaults for nullish overrides only.
    const scoring = options.scoring || {};
    const impactWeights = {};
    for (const level of impactLevels) {
      impactWeights[level] = scoring.impactWeights?.[level] ?? DEFAULT_IMPACT_WEIGHTS[level];
    }
    const discrepancyThreshold = scoring.discrepancyThreshold ?? DEFAULT_SCORING.DISCREPANCY_THRESHOLD;
    const penaltyPerIssue = scoring.penalties?.majorIssuePerItem ?? DEFAULT_SCORING.MAJOR_ISSUE_PENALTY_PER_ITEM;
    const penaltyCap = scoring.penalties?.maxMajorIssuePenalty ?? DEFAULT_SCORING.MAX_MAJOR_ISSUE_PENALTY;

    // Start from the judge's own score, clamped to [0, 1].
    let finalScore = Math.max(0, Math.min(1, analysis.robustnessScore));

    // Cross-check against the mean weight of the per-dimension verdicts; when
    // the two differ by more than the threshold, keep the lower value.
    const dimensions = analysis.dimensions || [];
    if (dimensions.length > 0) {
      let weightTotal = 0;
      dimensions.forEach((dim) => {
        weightTotal = weightTotal + impactWeights[dim.impactLevel];
      });
      const calculatedScore = weightTotal / dimensions.length;
      if (Math.abs(finalScore - calculatedScore) > discrepancyThreshold) {
        finalScore = Math.min(finalScore, calculatedScore);
      }
    }

    // Flat penalty per reported major issue, capped, never dropping below zero.
    const majorIssues = analysis.majorIssues || [];
    const issuesPenalty = Math.min(majorIssues.length * penaltyPerIssue, penaltyCap);
    return roundToTwoDecimals$1(Math.max(0, finalScore - issuesPenalty));
  };

  // Step 3: prompt asking the judge to explain the final score.
  const buildReasonPrompt = ({ run, results, score }) => {
    const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
    const analysis = results.analyzeStepResult;
    if (!analysis) {
      throw new Error("Analysis step failed to produce results for reason generation");
    }
    return createReasonPrompt4({
      userQuery: originalQuery,
      score,
      dimensions: analysis.dimensions || [],
      majorIssues: analysis.majorIssues || [],
      overallAssessment: analysis.overallAssessment
    });
  };

  const scorer = createScorer({
    name: "Noise Sensitivity (LLM)",
    description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
    judge: {
      model,
      instructions: NOISE_SENSITIVITY_INSTRUCTIONS
    }
  });

  return scorer.analyze({
    description: "Analyze the impact of noise on agent response quality",
    outputSchema: analyzeOutputSchema3,
    createPrompt: buildAnalyzePrompt
  }).generateScore(computeScore).generateReason({
    description: "Generate human-readable explanation of noise sensitivity evaluation",
    createPrompt: buildReasonPrompt
  });
}
|
|
1925
|
+
|
|
1926
|
+
// src/scorers/llm/prompt-alignment/prompts.ts
|
|
1927
|
+
/**
 * Instructions handed to the judge model of the prompt-alignment scorer
 * (used as `judge.instructions` by createPromptAlignmentScorerLLM).
 * Kept as one entry per output line.
 */
var PROMPT_ALIGNMENT_INSTRUCTIONS = [
  "You are an expert prompt-response alignment evaluator. Your job is to analyze how well an agent's response aligns with the user's prompt in terms of intent, requirements, completeness, and appropriateness.",
  "",
  "Key Evaluation Dimensions:",
  "1. **Intent Alignment**: Does the response address the core purpose of the prompt?",
  "2. **Requirements Fulfillment**: Are all explicit and implicit requirements met?",
  "3. **Completeness**: Is the response comprehensive and thorough?",
  "4. **Response Appropriateness**: Does the format, tone, and style match expectations?",
  "",
  "Evaluation Guidelines:",
  "- Identify the primary intent and any secondary intents in the prompt",
  "- Extract all explicit requirements (specific tasks, constraints, formats)",
  "- Consider implicit requirements based on context and standard expectations",
  "- Assess whether the response fully addresses the prompt or leaves gaps",
  "- Evaluate if the response format and tone are appropriate for the request",
  "- Be objective and focus on alignment rather than response quality",
  "",
  "Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned)."
].join("\n");
|
|
1944
|
+
/**
 * Builds the analysis prompt for the prompt-alignment judge.
 *
 * The wording of the prompt context and of each aspect's guidance depends on
 * `evaluationMode`: "user", "system", or anything else, which is treated as
 * the combined user + system mode.
 *
 * @param {object} params
 * @param {string} params.userPrompt - The user's message.
 * @param {string} params.systemPrompt - The system prompt text.
 * @param {string} params.agentResponse - The agent response being evaluated.
 * @param {string} params.evaluationMode - "user" | "system" | any other value (combined).
 * @returns {string} The complete prompt text.
 */
function createAnalyzePrompt4({
  userPrompt,
  systemPrompt,
  agentResponse,
  evaluationMode
}) {
  // Any mode other than "user"/"system" falls through to the combined wording.
  const modeKey = evaluationMode === "user" || evaluationMode === "system" ? evaluationMode : "both";

  let promptContext;
  let evaluationTarget;
  switch (modeKey) {
    case "user":
      promptContext = `User Prompt:
${userPrompt}`;
      evaluationTarget = "the user's prompt";
      break;
    case "system":
      promptContext = `System Prompt:
${systemPrompt}`;
      evaluationTarget = "the system's behavioral guidelines and constraints";
      break;
    default:
      promptContext = `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
      evaluationTarget = "both the user's prompt and the system's behavioral guidelines";
  }

  // Mode-specific bullet lines for each evaluated aspect.
  const guidance = {
    intent: {
      system: [
        "- Identify the primary behavioral guidelines and constraints from the system prompt",
        "- Assess whether the response follows these guidelines",
        "- Score from 0.0 (violates system constraints) to 1.0 (perfectly follows system guidelines)"
      ],
      user: [
        "- Identify the primary intent of the user's prompt",
        "- Assess whether the response addresses this intent",
        "- Score from 0.0 (completely misses intent) to 1.0 (perfectly addresses intent)"
      ],
      both: [
        "- Identify both the user's intent AND system behavioral guidelines",
        "- Assess whether the response addresses user intent while following system constraints",
        "- Score from 0.0 (misses both) to 1.0 (perfectly addresses both)"
      ]
    },
    requirements: {
      system: [
        "- List all system constraints and rules from the system prompt",
        "- Check if each constraint is respected",
        "- Calculate an overall score based on respected vs. total constraints"
      ],
      user: [
        "- List all explicit requirements from the user prompt",
        "- Check if each requirement is fulfilled",
        "- Calculate an overall score based on fulfilled vs. total requirements"
      ],
      both: [
        "- List requirements from BOTH user prompt and system constraints",
        "- Check fulfillment of each requirement",
        "- Calculate separate scores for user requirements and system constraints, then combine"
      ]
    },
    completeness: {
      system: [
        "- Evaluate if the response fully adheres to all system guidelines",
        "- Identify any system rules that were not followed"
      ],
      user: [
        "- Evaluate if the response is comprehensive for the user's request",
        "- Identify any missing elements that should have been included"
      ],
      both: [
        "- Evaluate completeness for both user request AND system compliance",
        "- Identify missing elements from either perspective"
      ]
    },
    appropriateness: {
      system: [
        "- Check if the format/tone matches system specifications",
        "- Evaluate consistency with defined agent behavior"
      ],
      user: [
        "- Check if the format matches what was requested (e.g., list, paragraph, code)",
        "- Evaluate if the tone is appropriate (e.g., formal, casual, technical)"
      ],
      both: [
        "- Check format/tone for both user expectations AND system requirements",
        "- Evaluate if response satisfies both perspectives"
      ]
    }
  };
  const guidanceFor = (aspect) => guidance[aspect][modeKey].join("\n");

  return `Analyze how well the agent's response aligns with ${evaluationTarget} across multiple dimensions.

${promptContext}

Agent Response:
${agentResponse}

Evaluate the following aspects:

1. **Intent Alignment**:
${guidanceFor("intent")}
- Provide reasoning for your assessment

2. **Requirements Fulfillment**:
${guidanceFor("requirements")}
- Provide reasoning for each requirement assessment

3. **Completeness**:
${guidanceFor("completeness")}
- Score from 0.0 (severely incomplete) to 1.0 (fully complete)
- Provide reasoning for your assessment

4. **Response Appropriateness**:
${guidanceFor("appropriateness")}
- Score from 0.0 (completely inappropriate) to 1.0 (perfectly appropriate)
- Provide reasoning for your assessment

Format your response as:
{
"intentAlignment": {
"score": 0.0-1.0,
"primaryIntent": "the main purpose of the prompt",
"isAddressed": true/false,
"reasoning": "explanation of intent alignment"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "specific requirement from prompt",
"isFulfilled": true/false,
"reasoning": "explanation of fulfillment status"
}
],
"overallScore": 0.0-1.0
},
"completeness": {
"score": 0.0-1.0,
"missingElements": ["list of missing elements if any"],
"reasoning": "explanation of completeness assessment"
},
"responseAppropriateness": {
"score": 0.0-1.0,
"formatAlignment": true/false,
"toneAlignment": true/false,
"reasoning": "explanation of appropriateness"
},
"overallAssessment": "summary of the prompt-response alignment"
}

Example:
User Prompt: "Write a Python function to calculate factorial with error handling for negative numbers."

Agent Response: "def factorial(n):
if n < 0:
raise ValueError('Factorial not defined for negative numbers')
if n == 0:
return 1
return n * factorial(n-1)"

{
"intentAlignment": {
"score": 1.0,
"primaryIntent": "Create a Python function to calculate factorial",
"isAddressed": true,
"reasoning": "The response provides exactly what was requested - a Python function that calculates factorial"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "Write a Python function",
"isFulfilled": true,
"reasoning": "A proper Python function is provided with correct syntax"
},
{
"requirement": "Calculate factorial",
"isFulfilled": true,
"reasoning": "The function correctly implements factorial calculation using recursion"
},
{
"requirement": "Include error handling for negative numbers",
"isFulfilled": true,
"reasoning": "The function raises a ValueError for negative inputs with an appropriate message"
}
],
"overallScore": 1.0
},
"completeness": {
"score": 0.9,
"missingElements": ["No docstring or comments"],
"reasoning": "The function is complete and functional but could benefit from documentation"
},
"responseAppropriateness": {
"score": 1.0,
"formatAlignment": true,
"toneAlignment": true,
"reasoning": "The response is in the exact format requested (Python code) with appropriate technical implementation"
},
"overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
}`;
}
|
|
2096
|
+
/**
 * Builds the prompt asking the judge to explain an already-computed
 * prompt-alignment score.
 *
 * The weight shown next to each dimension is taken from SCORING_WEIGHTS so it
 * matches the weights the score was computed with. Previously the labels were
 * hard-coded to 40/30/20/10%, which misreported the weighting in "system"
 * mode (scored with the SYSTEM weights).
 *
 * @param {object} params
 * @param {string} params.userPrompt - The user's message.
 * @param {string} params.systemPrompt - The system prompt text.
 * @param {number} params.score - Final score; the prompt tells the judge not to recalculate it.
 * @param {number} params.scale - Upper bound of the score, shown as "out of {scale}".
 * @param {object} params.analysis - Analysis step output (intentAlignment,
 *   requirementsFulfillment, completeness, responseAppropriateness, overallAssessment).
 * @param {string} params.evaluationMode - "user" | "system" | any other value (combined).
 * @returns {string} The complete prompt text.
 */
function createReasonPrompt5({
  userPrompt,
  systemPrompt,
  score,
  scale,
  analysis,
  evaluationMode
}) {
  const requirements = analysis.requirementsFulfillment.requirements;
  const fulfilledCount = requirements.filter((r) => r.isFulfilled).length;
  const totalRequirements = requirements.length;

  // Only "system" mode is scored with the SYSTEM weights. The combined mode
  // currently derives both of its components from the USER weights, so USER
  // is the accurate label there as well.
  const weights = evaluationMode === "system" ? SCORING_WEIGHTS.SYSTEM : SCORING_WEIGHTS.USER;
  // Math.round guards against floating-point noise such as 0.15 * 100.
  const asPercent = (weight) => Math.round(weight * 100);

  const promptContext = evaluationMode === "system" ? `System Prompt:
${systemPrompt}` : evaluationMode === "user" ? `User Prompt:
${userPrompt}` : `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
  const alignmentDescription = evaluationMode === "system" ? "system behavioral guidelines and constraints" : evaluationMode === "user" ? "user's prompt" : "both user's prompt and system guidelines";

  return `Explain the prompt alignment score based on how well the agent's response addresses the ${alignmentDescription}.

${promptContext}

Score: ${score} out of ${scale}

Evaluation Breakdown:
- Intent Alignment (${asPercent(weights.INTENT_ALIGNMENT)}% weight): ${analysis.intentAlignment.score}
Primary Intent: "${analysis.intentAlignment.primaryIntent}"
Addressed: ${analysis.intentAlignment.isAddressed ? "Yes" : "No"}
${analysis.intentAlignment.reasoning}

- Requirements Fulfillment (${asPercent(weights.REQUIREMENTS_FULFILLMENT)}% weight): ${analysis.requirementsFulfillment.overallScore}
${fulfilledCount} out of ${totalRequirements} requirements met
${requirements.map((r) => `\u2022 ${r.requirement}: ${r.isFulfilled ? "\u2713" : "\u2717"}`).join("\n ")}

- Completeness (${asPercent(weights.COMPLETENESS)}% weight): ${analysis.completeness.score}
${analysis.completeness.missingElements.length > 0 ? `Missing elements: ${analysis.completeness.missingElements.join(", ")}` : "Response is complete"}
${analysis.completeness.reasoning}

- Response Appropriateness (${asPercent(weights.RESPONSE_APPROPRIATENESS)}% weight): ${analysis.responseAppropriateness.score}
Format: ${analysis.responseAppropriateness.formatAlignment ? "Aligned" : "Misaligned"}
Tone: ${analysis.responseAppropriateness.toneAlignment ? "Aligned" : "Misaligned"}
${analysis.responseAppropriateness.reasoning}

Overall Assessment: ${analysis.overallAssessment}

Prompt Alignment measures how well the response addresses the user's request across intent, requirements, completeness, and appropriateness. The weighted scoring ensures primary focus on understanding and addressing the core intent while meeting specific requirements.

Rules for explanation:
- Summarize the key strengths and weaknesses of alignment
- Highlight any major misalignments that significantly impacted the score
- Be concise but comprehensive in the explanation
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of alignment strengths and weaknesses based on the weighted dimensions}"

Example responses:
"The score is 0.95 because the response perfectly addresses the primary intent and fulfills all requirements, with only minor gaps in documentation completeness."
"The score is 0.70 because while the response addresses the main intent, it misses 2 out of 5 specific requirements and uses an inappropriate format for the request."
"The score is 0.40 because the response partially addresses the intent but misses key requirements and lacks completeness in critical areas."`;
}
|
|
2157
|
+
|
|
2158
|
+
// src/scorers/llm/prompt-alignment/index.ts
|
|
2159
|
+
/**
 * Zod schema for the structured output of the prompt-alignment analysis step.
 * Built inside an IIFE so the repeated pieces can be named without adding
 * module-level identifiers.
 */
var analyzeOutputSchema4 = (() => {
  // Fresh number schema constrained to [0, 1] for every score field.
  const unitScore = () => z.number().min(0).max(1);

  // One entry per requirement the judge extracted from the prompt(s).
  const requirementVerdict = z.object({
    requirement: z.string(),
    isFulfilled: z.boolean(),
    reasoning: z.string()
  });

  return z.object({
    intentAlignment: z.object({
      score: unitScore(),
      primaryIntent: z.string(),
      isAddressed: z.boolean(),
      reasoning: z.string()
    }),
    requirementsFulfillment: z.object({
      requirements: z.array(requirementVerdict),
      overallScore: unitScore()
    }),
    completeness: z.object({
      score: unitScore(),
      missingElements: z.array(z.string()),
      reasoning: z.string()
    }),
    responseAppropriateness: z.object({
      score: unitScore(),
      formatAlignment: z.boolean(),
      toneAlignment: z.boolean(),
      reasoning: z.string()
    }),
    overallAssessment: z.string()
  });
})();
|
|
2189
|
+
/**
 * Dimension weights used to combine the four prompt-alignment sub-scores.
 * USER and SYSTEM each sum to 1; BOTH holds the blend factors for the
 * combined evaluation mode.
 */
var SCORING_WEIGHTS = {
  USER: {
    // 40% - Core intent is most important
    INTENT_ALIGNMENT: 0.4,
    // 30% - Meeting specific requirements
    REQUIREMENTS_FULFILLMENT: 0.3,
    // 20% - Comprehensive response
    COMPLETENESS: 0.2,
    // 10% - Format and tone matching
    RESPONSE_APPROPRIATENESS: 0.1
  },
  SYSTEM: {
    // 35% - Following system behavioral guidelines
    INTENT_ALIGNMENT: 0.35,
    // 35% - Meeting system constraints
    REQUIREMENTS_FULFILLMENT: 0.35,
    // 15% - Adherence to all system rules
    COMPLETENESS: 0.15,
    // 15% - Consistency with system tone/format
    RESPONSE_APPROPRIATENESS: 0.15
  },
  BOTH: {
    // When evaluating both, we weight user alignment at 70% and system at 30%
    USER_WEIGHT: 0.7,
    SYSTEM_WEIGHT: 0.3
  }
};
|
|
2216
|
+
/**
 * Builds an LLM-judged scorer that rates how well an agent response aligns
 * with the user prompt, the system prompt, or both.
 *
 * The scorer is assembled as a `createScorer(...)` pipeline with three steps:
 * an `analyze` step (judge prompt built from the run), a `generateScore` step
 * (weighted sum of the analysis dimensions, multiplied by `scale`), and a
 * `generateReason` step (prompt asking for a human-readable explanation).
 *
 * @param {object} params
 * @param {*} params.model - Judge model, forwarded untouched to `createScorer`.
 * @param {object} [params.options]
 * @param {number} [params.options.scale=1] - Multiplier applied to the weighted
 *   score. Any falsy value (including 0) falls back to 1.
 * @param {"user"|"system"|"both"} [params.options.evaluationMode="both"] -
 *   Which prompt(s) the response is evaluated against.
 * @returns {*} The scorer produced by the `createScorer` builder chain.
 * @throws {Error} From the analyze prompt builder when the prompt(s) required
 *   by `evaluationMode`, or the agent response, are missing or empty.
 */
function createPromptAlignmentScorerLLM({
  model,
  options
}) {
  // `||` (not `??`) is kept on purpose: a scale of 0 keeps falling back to 1.
  const scale = options?.scale || 1;
  const evaluationMode = options?.evaluationMode || "both";

  return createScorer({
    name: "Prompt Alignment (LLM)",
    description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
    judge: {
      model,
      instructions: PROMPT_ALIGNMENT_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze prompt-response alignment across multiple dimensions",
    outputSchema: analyzeOutputSchema4,
    createPrompt: ({ run }) => {
      const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
      const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";

      // Fail fast when the inputs needed by the selected mode are absent.
      if (evaluationMode === "user" && !userPrompt) {
        throw new Error("User prompt is required for user prompt alignment scoring");
      }
      if (evaluationMode === "system" && !systemPrompt) {
        throw new Error("System prompt is required for system prompt alignment scoring");
      }
      if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
        throw new Error("Both user and system prompts are required for combined alignment scoring");
      }
      if (!agentResponse) {
        throw new Error("Agent response is required for prompt alignment scoring");
      }

      return createAnalyzePrompt4({
        userPrompt,
        systemPrompt,
        agentResponse,
        evaluationMode
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis) {
      return 0;
    }

    // Weighted sum of the four analysis dimensions for one weight table.
    const scoreWith = (weights) =>
      analysis.intentAlignment.score * weights.INTENT_ALIGNMENT +
      analysis.requirementsFulfillment.overallScore * weights.REQUIREMENTS_FULFILLMENT +
      analysis.completeness.score * weights.COMPLETENESS +
      analysis.responseAppropriateness.score * weights.RESPONSE_APPROPRIATENESS;

    let weightedScore = 0;
    if (evaluationMode === "user") {
      weightedScore = scoreWith(SCORING_WEIGHTS.USER);
    } else if (evaluationMode === "system") {
      weightedScore = scoreWith(SCORING_WEIGHTS.SYSTEM);
    } else {
      // Combined mode: each side is scored with its own weight table and then
      // blended. Previously the system score was a copy of the user score,
      // which made SCORING_WEIGHTS.SYSTEM and SCORING_WEIGHTS.BOTH no-ops.
      const userScore = scoreWith(SCORING_WEIGHTS.USER);
      const systemScore = scoreWith(SCORING_WEIGHTS.SYSTEM);
      weightedScore =
        userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT +
        systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
    }

    return roundToTwoDecimals$1(weightedScore * scale);
  }).generateReason({
    description: "Generate human-readable explanation of prompt alignment evaluation",
    createPrompt: ({ run, results, score }) => {
      const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
      const analysis = results.analyzeStepResult;

      // Without an analysis there is nothing to explain; report the score only.
      if (!analysis) {
        return `Unable to analyze prompt alignment. Score: ${score}`;
      }

      return createReasonPrompt5({
        userPrompt,
        systemPrompt,
        score,
        scale,
        analysis,
        evaluationMode
      });
    }
  });
}
|
|
2292
|
+
|
|
2293
|
+
export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
|
|
1160
2294
|
//# sourceMappingURL=index.js.map
|
|
1161
2295
|
//# sourceMappingURL=index.js.map
|