@mastra/evals 0.13.1 → 0.13.3-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/{chunk-5CVZXIFW.js → chunk-4LRZVFXR.js} +32 -3
  2. package/dist/chunk-4LRZVFXR.js.map +1 -0
  3. package/dist/{chunk-QVZBKGOE.cjs → chunk-EKSPLMYP.cjs} +32 -2
  4. package/dist/chunk-EKSPLMYP.cjs.map +1 -0
  5. package/dist/{dist-JVIEAZJ6.js → dist-CI72CYZJ.js} +10 -10
  6. package/dist/{dist-JVIEAZJ6.js.map → dist-CI72CYZJ.js.map} +1 -1
  7. package/dist/{dist-JQCAD3AD.cjs → dist-IKJJ2AX4.cjs} +10 -10
  8. package/dist/{dist-JQCAD3AD.cjs.map → dist-IKJJ2AX4.cjs.map} +1 -1
  9. package/dist/index.cjs +1 -1
  10. package/dist/index.js +1 -1
  11. package/dist/{magic-string.es-NBXOXRCK.cjs → magic-string.es-VZN2EYER.cjs} +3 -3
  12. package/dist/{magic-string.es-NBXOXRCK.cjs.map → magic-string.es-VZN2EYER.cjs.map} +1 -1
  13. package/dist/{magic-string.es-6JSI7KY4.js → magic-string.es-WQRLTQPQ.js} +3 -3
  14. package/dist/{magic-string.es-6JSI7KY4.js.map → magic-string.es-WQRLTQPQ.js.map} +1 -1
  15. package/dist/scorers/code/index.cjs +2 -2
  16. package/dist/scorers/code/index.js +1 -1
  17. package/dist/scorers/llm/context-precision/index.d.ts +18 -0
  18. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -0
  19. package/dist/scorers/llm/context-precision/prompts.d.ts +19 -0
  20. package/dist/scorers/llm/context-precision/prompts.d.ts.map +1 -0
  21. package/dist/scorers/llm/context-relevance/index.d.ts +27 -0
  22. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -0
  23. package/dist/scorers/llm/context-relevance/prompts.d.ts +20 -0
  24. package/dist/scorers/llm/context-relevance/prompts.d.ts.map +1 -0
  25. package/dist/scorers/llm/index.cjs +1163 -25
  26. package/dist/scorers/llm/index.cjs.map +1 -1
  27. package/dist/scorers/llm/index.d.ts +4 -0
  28. package/dist/scorers/llm/index.d.ts.map +1 -1
  29. package/dist/scorers/llm/index.js +1137 -3
  30. package/dist/scorers/llm/index.js.map +1 -1
  31. package/dist/scorers/llm/noise-sensitivity/index.d.ts +36 -0
  32. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/noise-sensitivity/prompts.d.ts +21 -0
  34. package/dist/scorers/llm/noise-sensitivity/prompts.d.ts.map +1 -0
  35. package/dist/scorers/llm/prompt-alignment/index.d.ts +38 -0
  36. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -0
  37. package/dist/scorers/llm/prompt-alignment/prompts.d.ts +44 -0
  38. package/dist/scorers/llm/prompt-alignment/prompts.d.ts.map +1 -0
  39. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -4
  40. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
  41. package/dist/scorers/utils.d.ts +2 -0
  42. package/dist/scorers/utils.d.ts.map +1 -1
  43. package/package.json +6 -6
  44. package/dist/chunk-5CVZXIFW.js.map +0 -1
  45. package/dist/chunk-QVZBKGOE.cjs.map +0 -1
@@ -1,5 +1,5 @@
1
1
  import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
2
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls } from '../../chunk-5CVZXIFW.js';
2
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-4LRZVFXR.js';
3
3
  import { createScorer } from '@mastra/core/scores';
4
4
  import { z } from 'zod';
5
5
 
@@ -1097,7 +1097,7 @@ var analyzeOutputSchema = z.object({
1097
1097
  missingTools: z.array(z.string()).optional()
1098
1098
  });
1099
1099
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1100
- const toolDefinitions = availableTools.map((tool) => `${tool.name}: ${tool.description}`).join("\n");
1100
+ const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
1101
1101
  return createScorer({
1102
1102
  name: "Tool Call Accuracy (LLM)",
1103
1103
  description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
@@ -1156,6 +1156,1140 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1156
1156
  });
1157
1157
  }
1158
1158
 
1159
- export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createFaithfulnessScorer, createHallucinationScorer, createToolCallAccuracyScorerLLM, createToxicityScorer };
1159
+ // src/scorers/llm/context-relevance/prompts.ts
1160
/**
 * System instructions handed to the judge model of the Context Relevance scorer.
 * Kept as one string per prompt line; the lines are joined with "\n".
 */
var CONTEXT_RELEVANCE_INSTRUCTIONS = [
  "You are an expert context relevance evaluator. Your job is to analyze whether the provided context information was appropriate and useful for generating the agent's response to the user's query.",
  "",
  "Key Evaluation Criteria:",
  "1. **Relevance**: Does the context directly relate to the user's query?",
  "2. **Utility**: Did the context help produce a better response?",
  "3. **Completeness**: Was the context sufficient for the task?",
  "4. **Quality**: Is the context accurate and trustworthy?",
  "",
  "Evaluation Guidelines:",
  "- Context that directly answers or supports the user's query should be marked as highly relevant",
  "- Context that provides background information relevant to the query should be considered moderately relevant",
  "- Context that is tangentially related but doesn't directly help should be marked as low relevance",
  "- Context that is completely unrelated should be marked as irrelevant",
  "- Consider whether missing context might have led to a better response",
  "",
  "Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful."
].join("\n");
1176
/**
 * Builds the analyze-step prompt for the Context Relevance scorer.
 *
 * @param {object} params
 * @param {string} params.userQuery - Text interpolated under "User Query:".
 * @param {string} params.agentResponse - Text interpolated under "Agent Response:".
 * @param {Array} params.providedContext - Context pieces; each is rendered as `[index] piece`.
 * @returns {string} The prompt: run-specific section, then the static rubric,
 *   response format and worked example.
 */
function createAnalyzePrompt2({
  userQuery,
  agentResponse,
  providedContext
}) {
  // Number every piece so the judge can echo the position back as `context_index`.
  const numberedPieces = providedContext.map((piece, position) => `[${position}] ${piece}`);

  // Part of the prompt that depends on the run being scored.
  const runSection = `Analyze the relevance of the provided context for answering the user's query and generating the agent's response.

User Query:
${userQuery}

Agent Response:
${agentResponse}

Context pieces to evaluate:
${numberedPieces.join("\n")}`;

  // Static rubric, expected JSON shape and worked example.
  const rubricSection = `For each context piece, evaluate:
1. **Relevance Level**: How relevant is it to the user's query?
- "high": Directly addresses the query or provides essential information
- "medium": Provides supporting or background information that's helpful
- "low": Tangentially related but not very helpful
- "none": Completely irrelevant or unrelated

2. **Usage**: Was this context actually used in generating the agent's response?
- true: The response clearly incorporates or reflects this information
- false: This information doesn't appear to be used in the response

3. **Reasoning**: Explain your assessment in detail

Also identify any missing context that should have been provided to better answer the query.

Format your response as:
{
"evaluations": [
{
"context_index": 0,
"contextPiece": "the actual text of the context piece",
"relevanceLevel": "high/medium/low/none",
"wasUsed": true/false,
"reasoning": "detailed explanation of the evaluation"
}
],
"missingContext": ["list of missing information that would have been helpful"],
"overallAssessment": "summary of the context quality and usage"
}

The number of evaluations MUST match the number of context pieces exactly.

Example:
User Query: "What are the benefits of exercise?"
Agent Response: "Regular exercise improves cardiovascular health and mental wellbeing."
Context:
[0] "Exercise strengthens the heart and improves blood circulation."
[1] "A balanced diet is important for overall health."
[2] "Regular physical activity reduces stress and anxiety levels."

{
"evaluations": [
{
"context_index": 0,
"contextPiece": "Exercise strengthens the heart and improves blood circulation.",
"relevanceLevel": "high",
"wasUsed": true,
"reasoning": "This context directly supports the cardiovascular health benefit mentioned in the response"
},
{
"context_index": 1,
"contextPiece": "A balanced diet is important for overall health.",
"relevanceLevel": "none",
"wasUsed": false,
"reasoning": "This context is about diet, not exercise benefits, and doesn't contribute to answering the query"
},
{
"context_index": 2,
"contextPiece": "Regular physical activity reduces stress and anxiety levels.",
"relevanceLevel": "high",
"wasUsed": true,
"reasoning": "This context directly supports the mental wellbeing benefit mentioned in the response"
}
],
"missingContext": [],
"overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
}`;

  // The two sections are separated by exactly one blank line.
  return [runSection, rubricSection].join("\n\n");
}
1261
/**
 * Builds the reason-step prompt for the Context Relevance scorer.
 *
 * @param {object} params
 * @param {string} params.userQuery - Text interpolated under "User Query:".
 * @param {number} params.score - Score to be explained; interpolated verbatim.
 * @param {Array<object>} params.evaluations - Entries whose `context_index`,
 *   `relevanceLevel`, `wasUsed`, `contextPiece` and `reasoning` fields are rendered.
 * @param {Array} params.missingContext - Items rendered as a bullet list; the
 *   "Missing Context Issues" section is omitted when the array is empty.
 * @param {number} params.scale - Rendered as the "out of" value next to the score.
 * @returns {string} The prompt text.
 */
function createReasonPrompt3({
  userQuery,
  score,
  evaluations,
  missingContext,
  scale
}) {
  // Three lines per evaluation; evaluations are separated by a blank line.
  const describeEvaluation = (evaluation) => {
    const usedLabel = evaluation.wasUsed ? "Yes" : "No";
    return [
      `[${evaluation.context_index}] Relevance: ${evaluation.relevanceLevel}, Used: ${usedLabel}`,
      `Context: "${evaluation.contextPiece}"`,
      `Reasoning: ${evaluation.reasoning}`
    ].join("\n");
  };
  const evaluationSummary = evaluations.map((evaluation) => describeEvaluation(evaluation)).join("\n\n");

  // Optional section; it starts with a newline, and is the empty string when nothing is missing.
  let missingSection = "";
  if (missingContext.length > 0) {
    const bullets = missingContext.map((item) => `- ${item}`).join("\n");
    missingSection = `\nMissing Context Issues:\n${bullets}`;
  }

  return `Explain the context relevance score for the provided context based on its relevance and usage in generating the agent's response.

User Query:
${userQuery}

Score: ${score} out of ${scale}

Context Evaluations:
${evaluationSummary}

${missingSection}

Context Relevance measures how well the provided context supports answering the user's query and generating the expected response. The score considers:
- Relevance levels (high=1.0, medium=0.7, low=0.3, none=0.0)
- Usage penalties (10% penalty per unused high-relevance context)
- Missing context penalties (up to 50% penalty for identified gaps)

Rules for explanation:
- Explain the score based on context relevance levels and usage
- Mention any penalties applied for unused relevant context or missing information
- Keep explanation concise and actionable for improving context selection
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of context relevance, usage, and any penalties}"

Example responses:
"The score is 0.85 because 2 out of 3 context pieces are highly relevant and used in the response, with only minor penalty for one unused medium-relevance context piece."
"The score is 1.0 because all context pieces are highly relevant to the query about exercise benefits and were effectively used in generating the comprehensive response."
"The score is 0.40 because while some context is relevant, key information about the topic was missing and one highly relevant context piece was not utilized in the response."`;
}
1305
+
1306
+ // src/scorers/llm/context-relevance/index.ts
1307
/**
 * Zod schema for the structured output of the Context Relevance analyze step:
 * one evaluation per context piece, an optional list of missing context
 * (defaulting to an empty array) and an overall assessment string.
 */
var analyzeOutputSchema2 = z.object({
  evaluations: z
    .object({
      context_index: z.number(),
      contextPiece: z.string(),
      relevanceLevel: z.enum(["high", "medium", "low", "none"]),
      wasUsed: z.boolean(),
      reasoning: z.string()
    })
    .array(),
  missingContext: z.string().array().optional().default([]),
  overallAssessment: z.string()
});
1320
/**
 * Fallback penalty rates for Context Relevance scoring.
 *
 * - UNUSED_HIGH_RELEVANCE_CONTEXT: 10% penalty per unused high-relevance context.
 * - MISSING_CONTEXT_PER_ITEM: 15% penalty per missing context item.
 * - MAX_MISSING_CONTEXT_PENALTY: cap of 50% on the missing-context penalty.
 */
var DEFAULT_PENALTIES = {
  UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
  MISSING_CONTEXT_PER_ITEM: 0.15,
  MAX_MISSING_CONTEXT_PENALTY: 0.5
};
1328
/**
 * Creates the LLM-judged "Context Relevance" scorer.
 *
 * Pipeline: an analyze step asks the judge to rate every context piece, a
 * score step turns those ratings into a number, and a reason step asks the
 * judge to explain that number.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, forwarded to `createScorer` unchanged.
 * @param {object} params.options
 * @param {Array} [params.options.context] - Static context pieces (must be non-empty if given).
 * @param {Function} [params.options.contextExtractor] - Called as
 *   `contextExtractor(run.input, run.output)`; takes precedence over `context`.
 * @param {number} [params.options.scale] - Multiplier applied to the score; falsy values fall back to 1.
 * @param {object} [params.options.penalties] - Optional overrides:
 *   `unusedHighRelevanceContext`, `missingContextPerItem`, `maxMissingContextPenalty`.
 * @returns The scorer produced by the `createScorer(...).analyze().generateScore().generateReason()` chain.
 * @throws {Error} If neither `context` nor `contextExtractor` is supplied, or `context` is an empty array.
 */
function createContextRelevanceScorerLLM({
  model,
  options
}) {
  // `?.` so that a missing `options` object reports the intended validation error.
  if (!options?.context && !options?.contextExtractor) {
    throw new Error("Either context or contextExtractor is required for Context Relevance scoring");
  }
  if (options.context && options.context.length === 0) {
    throw new Error("Context array cannot be empty if provided");
  }

  // Single place where the context for a run is resolved. An extractor that
  // returns null/undefined is treated like one that returns an empty array,
  // so the explicit "no context" branches below are taken instead of crashing.
  const resolveContext = (run) => {
    const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
    return context ?? [];
  };

  const relevanceWeights = {
    high: 1,
    medium: 0.7,
    low: 0.3,
    none: 0
  };

  return createScorer({
    name: "Context Relevance (LLM)",
    description: "Evaluates how relevant and useful the provided context was for generating the agent response",
    judge: {
      model,
      instructions: CONTEXT_RELEVANCE_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze the relevance and utility of provided context",
    outputSchema: analyzeOutputSchema2,
    createPrompt: ({ run }) => {
      const userQuery = getUserMessageFromRunInput(run.input) ?? "";
      const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
      const context = resolveContext(run);
      // With no context, the judge still receives a single placeholder piece.
      const providedContext = context.length === 0 ? ["[No context was provided for evaluation]"] : context;
      return createAnalyzePrompt2({
        userQuery,
        agentResponse,
        providedContext
      });
    }
  }).generateScore(({ results, run }) => {
    // `||` is deliberate: a scale of 0 falls back to 1.
    const scale = options.scale || 1;
    const evaluations = results.analyzeStepResult?.evaluations || [];
    const missingContext = results.analyzeStepResult?.missingContext || [];
    const context = resolveContext(run);
    if (context.length === 0) {
      return 1 * scale;
    }
    if (evaluations.length === 0) {
      // Fix: the full-score fallback is scaled like every other return value
      // (it used to return a bare 1 regardless of `options.scale`).
      return missingContext.length > 0 ? 0 : 1 * scale;
    }

    // Average relevance weight, normalized against "every piece is high".
    const totalWeight = evaluations.reduce((sum, evaluation) => {
      return sum + relevanceWeights[evaluation.relevanceLevel];
    }, 0);
    const maxPossibleWeight = evaluations.length * relevanceWeights.high;
    const relevanceScore = maxPossibleWeight > 0 ? totalWeight / maxPossibleWeight : 0;

    const penalties = options.penalties || {};
    const unusedPenaltyRate = penalties.unusedHighRelevanceContext ?? DEFAULT_PENALTIES.UNUSED_HIGH_RELEVANCE_CONTEXT;
    const missingPenaltyRate = penalties.missingContextPerItem ?? DEFAULT_PENALTIES.MISSING_CONTEXT_PER_ITEM;
    const maxMissingPenalty = penalties.maxMissingContextPenalty ?? DEFAULT_PENALTIES.MAX_MISSING_CONTEXT_PENALTY;

    // Penalty for highly relevant pieces the response ignored.
    const highRelevanceUnused = evaluations.filter(
      (evaluation) => evaluation.relevanceLevel === "high" && !evaluation.wasUsed
    ).length;
    const usagePenalty = highRelevanceUnused * unusedPenaltyRate;
    // Penalty for gaps the judge identified, capped at `maxMissingPenalty`.
    const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);

    const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
    return roundToTwoDecimals$1(finalScore * scale);
  }).generateReason({
    description: "Generate human-readable explanation of context relevance evaluation",
    createPrompt: ({ run, results, score }) => {
      const userQuery = getUserMessageFromRunInput(run.input) ?? "";
      const context = resolveContext(run);
      if (context.length === 0) {
        return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
      }
      const evaluations = results.analyzeStepResult?.evaluations || [];
      const missingContext = results.analyzeStepResult?.missingContext || [];
      return createReasonPrompt3({
        userQuery,
        score,
        evaluations,
        missingContext,
        scale: options.scale || 1
      });
    }
  });
}
1419
+
1420
+ // src/scorers/llm/context-precision/prompts.ts
1421
/**
 * System instructions handed to the judge model of the Context Precision scorer.
 * Kept as one string per prompt line; the lines are joined with "\n".
 */
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = [
  "You are a precise context precision evaluator. Your job is to determine if context nodes are relevant for generating the expected output based on the input query.",
  "",
  "Key Principles:",
  "1. Evaluate each context piece independently for relevance to the input-output pair",
  "2. Consider relevance as the ability of the context to contribute to generating the expected output",
  "3. Mark context as relevant only if it directly supports or informs the expected output",
  "4. Consider the input query when determining relevance",
  "5. Focus on practical utility for output generation, not just topical similarity",
  "6. Be strict in your evaluation - context must be clearly useful for generating the output",
  "7. Context that provides background but doesn't directly contribute should be marked as not relevant"
].join("\n");
1431
/**
 * Builds the analyze-step prompt for the Context Precision scorer.
 *
 * @param {object} params
 * @param {string} params.input - Text interpolated under "Input Query:".
 * @param {string} params.output - Text interpolated under "Expected Output:".
 * @param {Array} params.context - Context pieces; each is rendered as `[index] piece`.
 * @returns {string} The prompt: run-specific section, then the static verdict
 *   rules, response format and worked example.
 */
function createContextRelevancePrompt({
  input,
  output,
  context
}) {
  // Number every piece so the judge can echo the position back as `context_index`.
  const numberedPieces = context.map((piece, position) => `[${position}] ${piece}`);

  // Part of the prompt that depends on the run being scored.
  const runSection = `Evaluate the relevance of each context piece for generating the expected output given the input query.

Input Query:
${input}

Expected Output:
${output}

Context pieces to evaluate:
${numberedPieces.join("\n")}`;

  // Static verdict rules, expected JSON shape and worked example.
  const rulesSection = `For each context piece, determine if it is relevant for generating the expected output. A context piece is relevant if:
- It provides information that directly supports or informs the expected output
- It contains facts, data, or details that are needed to answer the input query
- It contributes to the accuracy or completeness of the expected output

Mark as "yes" only if the context piece is clearly useful for generating the output.
Mark as "no" if the context piece does not contribute to generating the expected output.

Format your response as:
{
"verdicts": [
{
"context_index": 0,
"verdict": "yes/no",
"reason": "explanation of why this context is or isn't relevant"
}
]
}

The number of verdicts MUST match the number of context pieces exactly.

Example:
Input: "What are the benefits of exercise?"
Output: "Regular exercise improves cardiovascular health and mental wellbeing."
Context:
[0] "Exercise strengthens the heart and improves blood circulation."
[1] "A balanced diet is important for health."
[2] "Regular physical activity reduces stress and anxiety."

{
"verdicts": [
{
"context_index": 0,
"verdict": "yes",
"reason": "This context directly supports the cardiovascular health benefit mentioned in the output"
},
{
"context_index": 1,
"verdict": "no",
"reason": "This context is about diet, not exercise benefits, and doesn't contribute to the expected output"
},
{
"context_index": 2,
"verdict": "yes",
"reason": "This context directly supports the mental wellbeing benefit mentioned in the output"
}
]
}`;

  // The two sections are separated by exactly one blank line.
  return [runSection, rulesSection].join("\n\n");
}
1496
/**
 * Builds the reason-step prompt for the Context Precision scorer.
 *
 * @param {object} params
 * @param {string} params.input - Text interpolated under "Input Query:".
 * @param {string} params.output - Text interpolated under "Expected Output:".
 * @param {Array} params.context - Context pieces; each is rendered as `[index] piece`.
 * @param {number} params.score - Score to be explained; interpolated verbatim.
 * @param {number} params.scale - Rendered as the "out of" value next to the score.
 * @param {*} params.verdicts - Serialized with `JSON.stringify(verdicts, null, 2)`.
 * @returns {string} The prompt text.
 */
function createContextPrecisionReasonPrompt({
  input,
  output,
  context,
  score,
  scale,
  verdicts
}) {
  const numberedPieces = context.map((piece, position) => `[${position}] ${piece}`).join("\n");
  // Pretty-printed with a two-space indent so the judge sees one field per line.
  const verdictsJson = JSON.stringify(verdicts, null, 2);

  return `Explain the context precision score for the retrieved context based on its relevance to generating the expected output.

Input Query:
${input}

Expected Output:
${output}

Context pieces:
${numberedPieces}

Score: ${score} out of ${scale}
Verdicts:
${verdictsJson}

Context Precision measures how relevant and precise the retrieved context nodes are for generating the expected output. The score is calculated using Mean Average Precision (MAP) which:
- Gives binary relevance scores (1 for relevant, 0 for irrelevant)
- Weights earlier positions more heavily in the scoring
- Rewards having relevant context early in the sequence

Rules for explanation:
- Explain the score based on which context pieces were relevant and their positions
- Mention how the positioning affects the MAP score
- Keep explanation concise and focused on context quality
- Use the given score, don't recalculate
- Focus on how well the context supports generating the expected output

Format:
"The score is ${score} because {explanation of context precision and positioning}"

Example responses:
"The score is 0.75 because the first and third contexts are highly relevant to the benefits mentioned in the output, while the second and fourth contexts are not directly related to exercise benefits. The relevant contexts are well-positioned at the beginning and middle of the sequence."
"The score is 1.0 because all context pieces are relevant for generating the expected output and are optimally ordered."
"The score is 0.33 because only the first context piece is relevant to the query, and the remaining contexts don't contribute to generating the expected output about exercise benefits."`;
}
1539
+
1540
+ // src/scorers/llm/context-precision/index.ts
1541
/**
 * Zod schema for the structured output of the Context Precision analyze step:
 * a list of verdicts, each carrying the context index, the verdict string and
 * the judge's reason.
 */
var contextRelevanceOutputSchema = z.object({
  verdicts: z
    .object({
      context_index: z.number(),
      verdict: z.string(),
      reason: z.string()
    })
    .array()
});
1550
/**
 * Creates the LLM-judged "Context Precision" scorer.
 *
 * Pipeline: an analyze step asks the judge for a yes/no verdict per context
 * piece, a score step computes average precision over the verdicts ordered by
 * `context_index`, and a reason step asks the judge to explain the score.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, forwarded to `createScorer` unchanged.
 * @param {object} params.options
 * @param {Array} [params.options.context] - Static context pieces (must be non-empty if given).
 * @param {Function} [params.options.contextExtractor] - Called as
 *   `contextExtractor(run.input, run.output)`; takes precedence over `context`.
 * @param {number} [params.options.scale] - Multiplier applied to the score; falsy values fall back to 1.
 * @returns The scorer produced by the `createScorer(...).analyze().generateScore().generateReason()` chain.
 * @throws {Error} If neither `context` nor `contextExtractor` is supplied, or `context` is an empty array.
 *   The analyze step additionally throws when no context is available for a run.
 */
function createContextPrecisionScorer({
  model,
  options
}) {
  // `?.` so that a missing `options` object reports the intended validation error.
  if (!options?.context && !options?.contextExtractor) {
    throw new Error("Either context or contextExtractor is required for Context Precision scoring");
  }
  if (options.context && options.context.length === 0) {
    throw new Error("Context array cannot be empty if provided");
  }

  // Single place where the context for a run is resolved. A null/undefined
  // extractor result is normalized to an empty array so callers can rely on
  // `.length` / `.map` instead of failing with an opaque TypeError.
  const resolveContext = (run) => {
    const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
    return context ?? [];
  };

  // Returns a sorted COPY: `Array.prototype.sort` works in place, and sorting
  // the array held by `results.analyzeStepResult` would mutate the stored
  // analyze result.
  const sortByContextIndex = (verdicts) => [...verdicts].sort((a, b) => a.context_index - b.context_index);

  return createScorer({
    name: "Context Precision Scorer",
    description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
    judge: {
      model,
      instructions: CONTEXT_PRECISION_AGENT_INSTRUCTIONS
    }
  }).analyze({
    description: "Evaluate the relevance of each context piece for generating the expected output",
    outputSchema: contextRelevanceOutputSchema,
    createPrompt: ({ run }) => {
      const input = getUserMessageFromRunInput(run.input) ?? "";
      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
      const context = resolveContext(run);
      if (context.length === 0) {
        throw new Error("No context available for evaluation");
      }
      return createContextRelevancePrompt({
        input,
        output,
        context
      });
    }
  }).generateScore(({ results }) => {
    const verdicts = results.analyzeStepResult?.verdicts ?? [];
    if (verdicts.length === 0) {
      return 0;
    }
    const sortedVerdicts = sortByContextIndex(verdicts);
    let sumPrecision = 0;
    let relevantCount = 0;
    for (let i = 0; i < sortedVerdicts.length; i++) {
      const isRelevant = sortedVerdicts[i]?.verdict?.toLowerCase().trim() === "yes";
      if (isRelevant) {
        relevantCount++;
        // Precision at this position: relevant items so far / items seen so far.
        sumPrecision += relevantCount / (i + 1);
      }
    }
    if (relevantCount === 0) {
      return 0;
    }
    // Average of the precision values taken at each relevant position.
    const averagePrecision = sumPrecision / relevantCount;
    // `||` is deliberate: a scale of 0 falls back to 1.
    return roundToTwoDecimals$1(averagePrecision * (options.scale || 1));
  }).generateReason({
    description: "Reason about the context precision results",
    createPrompt: ({ run, results, score }) => {
      const input = getUserMessageFromRunInput(run.input) ?? "";
      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
      const context = resolveContext(run);
      return createContextPrecisionReasonPrompt({
        input,
        output,
        context,
        score,
        scale: options.scale || 1,
        // Sorted copy keeps the prompt in context order without touching the stored result.
        verdicts: sortByContextIndex(results.analyzeStepResult?.verdicts || [])
      });
    }
  });
}
1623
+
1624
+ // src/scorers/llm/noise-sensitivity/prompts.ts
1625
/**
 * System instructions for the noise sensitivity judge prompt.
 * Kept as one string per prompt line; the lines are joined with "\n".
 */
var NOISE_SENSITIVITY_INSTRUCTIONS = [
  "You are an expert noise sensitivity evaluator. Your job is to analyze how much irrelevant, distracting, or misleading information (noise) affected the agent's response quality and accuracy.",
  "",
  "Key Evaluation Criteria:",
  "1. **Response Consistency**: How similar are the baseline and noisy responses in content and correctness?",
  "2. **Information Integrity**: Did the agent maintain accuracy despite noise, or was it misled?",
  "3. **Focus Preservation**: Did the agent stay on topic or get distracted by irrelevant information?",
  "4. **Hallucination Resistance**: Did noise cause the agent to generate false or fabricated information?",
  "5. **Completeness**: Did noise cause the agent to miss important parts of the original query?",
  "",
  "Noise Impact Assessment:",
  "- **No Impact (1.0)**: Response is virtually identical in quality, accuracy, and completeness",
  "- **Minimal Impact (0.8-0.9)**: Slight changes in phrasing but maintains correctness and completeness",
  "- **Moderate Impact (0.5-0.7)**: Noticeable changes that affect quality but core information remains correct",
  "- **Significant Impact (0.2-0.4)**: Major degradation in quality, accuracy, or completeness",
  "- **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed",
  "",
  "Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance."
].join("\n");
1642
/**
 * Builds the analyze prompt that compares a baseline response with the
 * response produced from a noisy version of the same query.
 *
 * @param {object} params
 * @param {string} params.userQuery - Text interpolated under "Original User Query:".
 * @param {string} params.baselineResponse - Text interpolated under "Baseline Agent Response (clean input):".
 * @param {string} params.noisyQuery - Text interpolated under "Noisy User Query (with added distractions):".
 * @param {string} params.noisyResponse - Text interpolated under "Noisy Agent Response:".
 * @param {string} [params.noiseType] - When truthy, rendered as "Type of noise added: ...";
 *   otherwise that line is left empty.
 * @returns {string} The prompt: run-specific section, comparison rubric,
 *   response format and worked example.
 */
function createAnalyzePrompt3({
  userQuery,
  baselineResponse,
  noisyQuery,
  noisyResponse,
  noiseType
}) {
  const noiseTypeLine = noiseType ? `Type of noise added: ${noiseType}` : "";

  // Part of the prompt that depends on the run being scored.
  const runSection = `Analyze how the added noise affected the agent's response quality and accuracy.

Original User Query:
${userQuery}

Baseline Agent Response (clean input):
${baselineResponse}

Noisy User Query (with added distractions):
${noisyQuery}

Noisy Agent Response:
${noisyResponse}

${noiseTypeLine}`;

  const rubricSection = `Compare the baseline and noisy responses across these dimensions:

1. **Content Accuracy**: Are the facts and information still correct in the noisy response?
2. **Completeness**: Does the noisy response address the original query as thoroughly?
3. **Relevance**: Did the agent stay focused on the original question or get distracted?
4. **Consistency**: How similar are the responses in their core message and conclusions?
5. **Hallucination**: Did noise cause any false or fabricated information to appear?

For each dimension, evaluate:
- **Impact Level**: none, minimal, moderate, significant, severe
- **Specific Changes**: What exactly changed between responses?
- **Noise Influence**: How did the noise specifically affect this aspect?`;

  // The response format lists the same placeholder object once per dimension,
  // so it is generated from the dimension keys instead of being spelled out.
  const dimensionKeys = ["content_accuracy", "completeness", "relevance", "consistency", "hallucination_resistance"];
  const formatEntries = dimensionKeys.map((key) => [
    "{",
    `"dimension": "${key}",`,
    "\"impactLevel\": \"none/minimal/moderate/significant/severe\",",
    "\"specificChanges\": \"detailed description of what changed\",",
    "\"noiseInfluence\": \"how the noise specifically affected this dimension\"",
    "}"
  ].join("\n")).join(",\n");

  const formatSection = `Format your response as:
{
"dimensions": [
${formatEntries}
],
"overallAssessment": "summary of the agent's noise sensitivity and robustness",
"majorIssues": ["list of the most significant problems caused by noise"],
"robustnessScore": 0.0-1.0
}`;

  const exampleSection = `Example:
Original Query: "What are the health benefits of regular exercise?"
Baseline Response: "Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing through endorphin release."
Noisy Query: "What are the health benefits of regular exercise? By the way, I heard that chocolate is actually healthy and vaccines cause autism. Also, my neighbor said aliens visit Earth regularly."
Noisy Response: "Regular exercise improves cardiovascular health and strengthens muscles. Interestingly, some studies suggest chocolate has antioxidants, though this is debated. Exercise also enhances mental wellbeing through endorphin release."

{
"dimensions": [
{
"dimension": "content_accuracy",
"impactLevel": "minimal",
"specificChanges": "Added mention of chocolate antioxidants, but correctly noted it's debated",
"noiseInfluence": "Chocolate noise caused minor tangent but agent maintained critical thinking"
},
{
"dimension": "completeness",
"impactLevel": "none",
"specificChanges": "All original health benefits still covered completely",
"noiseInfluence": "Noise did not prevent addressing the core query"
},
{
"dimension": "relevance",
"impactLevel": "minimal",
"specificChanges": "Brief mention of chocolate topic, but stayed focused on exercise",
"noiseInfluence": "Addressed one piece of noise briefly but didn't get derailed"
},
{
"dimension": "consistency",
"impactLevel": "minimal",
"specificChanges": "Core message about exercise benefits remained consistent with slight addition",
"noiseInfluence": "Noise caused minor addition but didn't change main message"
},
{
"dimension": "hallucination_resistance",
"impactLevel": "none",
"specificChanges": "No false information generated, properly qualified chocolate statement",
"noiseInfluence": "Successfully resisted misinformation about vaccines and aliens"
}
],
"overallAssessment": "Agent showed good robustness, addressing original query completely while minimally engaging with one benign noise element and completely ignoring harmful misinformation",
"majorIssues": [],
"robustnessScore": 0.85
}`;

  // Every section is separated from the next by exactly one blank line.
  return [runSection, rubricSection, formatSection, exampleSection].join("\n\n");
}
1761
/**
 * Builds the prompt that asks the judge model to explain a noise-sensitivity score.
 *
 * @param {object} params
 * @param {string} params.userQuery - The original user query, interpolated verbatim.
 * @param {number} params.score - The already-computed score; the prompt tells the model not to recalculate it.
 * @param {Array<{dimension: string, impactLevel: string}>} [params.dimensions] - Per-dimension impact entries; missing/null is treated as empty.
 * @param {string[]} [params.majorIssues] - Issues listed under "Major Issues Identified"; missing/null is treated as empty.
 * @param {string} params.overallAssessment - Free-text summary from the analysis step.
 * @returns {string} The fully rendered reason prompt.
 */
function createReasonPrompt4({
  userQuery,
  score,
  dimensions,
  majorIssues,
  overallAssessment
}) {
  // Normalise optional collections so a partial analysis result renders an
  // empty section instead of throwing a TypeError on `.map` / `.length`.
  const dimensionList = dimensions ?? [];
  const issueList = majorIssues ?? [];
  const impactSummary = dimensionList.map((d) => `${d.dimension}: ${d.impactLevel} impact`).join(", ");
  // The section is omitted entirely (empty string) when there are no major issues.
  const majorIssuesSection = issueList.length > 0 ? `
Major Issues Identified:
${issueList.map((issue) => `- ${issue}`).join("\n")}` : "";
  return `Explain the noise sensitivity score based on how well the agent maintained response quality despite irrelevant or distracting information.

Original Query:
${userQuery}

Score: ${score} out of 1.0

Impact Assessment:
${impactSummary}

${majorIssuesSection}

Overall Assessment:
${overallAssessment}

Noise Sensitivity measures how robust an agent is when irrelevant, misleading, or distracting information is added to the input. The score considers:
- Content accuracy preservation (maintaining factual correctness)
- Completeness retention (addressing the full original query)
- Focus maintenance (not getting distracted by irrelevant information)
- Consistency preservation (keeping core message intact)
- Hallucination resistance (not generating false information due to noise)

Scoring Guide:
- 0.9-1.0: Highly robust, virtually no impact from noise
- 0.7-0.8: Good robustness, minimal impact that doesn't affect correctness
- 0.5-0.6: Moderate sensitivity, noticeable quality degradation
- 0.3-0.4: High sensitivity, significant impact on accuracy or completeness
- 0.0-0.2: Very sensitive, severe degradation or derailment

Rules for explanation:
- Explain the score based on specific impacts observed across all dimensions
- Highlight the agent's strengths and weaknesses in handling noise
- Keep explanation actionable for improving noise robustness
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of robustness performance and specific noise impacts}"

Example responses:
"The score is 0.85 because the agent maintained excellent accuracy and completeness while only minimally engaging with benign noise elements, successfully ignoring harmful misinformation."
"The score is 1.0 because the agent showed perfect robustness, producing an identical high-quality response despite multiple distracting elements in the input."
"The score is 0.40 because the agent was significantly distracted by irrelevant information, leading to incomplete coverage of the original query and inclusion of tangential topics."`;
}
1814
+
1815
+ // src/scorers/llm/noise-sensitivity/index.ts
1816
// Structured output the judge model must return from the noise-sensitivity
// analyze step.
var analyzeOutputSchema3 = (() => {
  // Labels the judge may assign to a dimension.
  const impactLevelSchema = z.enum(["none", "minimal", "moderate", "significant", "severe"]);
  // One assessment entry per evaluated dimension.
  const dimensionSchema = z.object({
    dimension: z.string(),
    impactLevel: impactLevelSchema,
    specificChanges: z.string(),
    noiseInfluence: z.string()
  });
  return z.object({
    dimensions: z.array(dimensionSchema),
    overallAssessment: z.string(),
    // An omitted list is normalised to [].
    majorIssues: z.array(z.string()).optional().default([]),
    // Judge-reported score, constrained to the range 0..1.
    robustnessScore: z.number().min(0).max(1)
  });
})();

// Score contribution for each impact level; the scorer averages these across
// dimensions unless overridden through options.scoring.impactWeights.
var DEFAULT_IMPACT_WEIGHTS = {
  none: 1,
  minimal: 0.85,
  moderate: 0.6,
  significant: 0.3,
  severe: 0.1
};

// Fallback scoring parameters used when options.scoring leaves them unset.
var DEFAULT_SCORING = {
  // Each major issue subtracts 10% from the score...
  MAJOR_ISSUE_PENALTY_PER_ITEM: 0.1,
  // ...up to a total deduction of 30%.
  MAX_MAJOR_ISSUE_PENALTY: 0.3,
  // Gap between judge score and calculated score above which the lower
  // (more conservative) of the two is used.
  DISCREPANCY_THRESHOLD: 0.2
};
1844
/**
 * Creates an LLM-judged scorer that rates how well an agent's answer holds up
 * when the query is polluted with irrelevant or misleading content.
 *
 * @param {object} params
 * @param {*} params.model - Judge model handed to `createScorer`.
 * @param {object} params.options
 * @param {string} params.options.baselineResponse - Response to the clean query (required).
 * @param {string} params.options.noisyQuery - The query with noise added (required).
 * @param {string} [params.options.noiseType] - Forwarded to the analyze prompt.
 * @param {object} [params.options.scoring] - Optional overrides: `impactWeights`,
 *   `discrepancyThreshold`, `penalties.majorIssuePerItem`, `penalties.maxMajorIssuePenalty`.
 * @returns The scorer produced by the `createScorer(...).analyze().generateScore().generateReason()` chain.
 * @throws {Error} If `options`, `baselineResponse` or `noisyQuery` is missing.
 */
function createNoiseSensitivityScorerLLM({
  model,
  options
}) {
  // Optional chaining makes a missing `options` object hit the descriptive
  // error below instead of an opaque "cannot read properties of undefined".
  if (!options?.baselineResponse || !options?.noisyQuery) {
    throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
  }
  return createScorer({
    name: "Noise Sensitivity (LLM)",
    description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
    judge: {
      model,
      instructions: NOISE_SENSITIVITY_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze the impact of noise on agent response quality",
    outputSchema: analyzeOutputSchema3,
    createPrompt: ({ run }) => {
      const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
      const noisyResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
      if (!originalQuery || !noisyResponse) {
        throw new Error("Both original query and noisy response are required for evaluation");
      }
      return createAnalyzePrompt3({
        userQuery: originalQuery,
        baselineResponse: options.baselineResponse,
        noisyQuery: options.noisyQuery,
        noisyResponse,
        noiseType: options.noiseType
      });
    }
  }).generateScore(({ results }) => {
    const analysisResult = results.analyzeStepResult;
    if (!analysisResult) {
      throw new Error("Analysis step failed to produce results");
    }
    // Start from the judge's own score, clamped to [0, 1].
    let finalScore = Math.max(0, Math.min(1, analysisResult.robustnessScore));

    // Resolve scoring parameters: caller overrides win, defaults fill the gaps.
    const scoring = options.scoring || {};
    const impactWeights = Object.fromEntries(
      Object.keys(DEFAULT_IMPACT_WEIGHTS).map((level) => [
        level,
        scoring.impactWeights?.[level] ?? DEFAULT_IMPACT_WEIGHTS[level]
      ])
    );
    const discrepancyThreshold = scoring.discrepancyThreshold ?? DEFAULT_SCORING.DISCREPANCY_THRESHOLD;
    const majorIssuePenaltyRate = scoring.penalties?.majorIssuePerItem ?? DEFAULT_SCORING.MAJOR_ISSUE_PENALTY_PER_ITEM;
    const maxMajorIssuePenalty = scoring.penalties?.maxMajorIssuePenalty ?? DEFAULT_SCORING.MAX_MAJOR_ISSUE_PENALTY;

    // Cross-check the judge's score against the average of the per-dimension
    // impact weights; when they disagree by more than the threshold, keep
    // the lower (more conservative) of the two.
    const dimensions = analysisResult.dimensions || [];
    if (dimensions.length > 0) {
      const calculatedScore = dimensions.reduce((sum, dim) => sum + impactWeights[dim.impactLevel], 0) / dimensions.length;
      if (Math.abs(finalScore - calculatedScore) > discrepancyThreshold) {
        finalScore = Math.min(finalScore, calculatedScore);
      }
    }

    // Flat penalty per major issue, capped, never pushing the score below 0.
    const majorIssues = analysisResult.majorIssues || [];
    const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
    finalScore = Math.max(0, finalScore - issuesPenalty);
    return roundToTwoDecimals$1(finalScore);
  }).generateReason({
    description: "Generate human-readable explanation of noise sensitivity evaluation",
    createPrompt: ({ run, results, score }) => {
      const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
      const analysisResult = results.analyzeStepResult;
      if (!analysisResult) {
        throw new Error("Analysis step failed to produce results for reason generation");
      }
      return createReasonPrompt4({
        userQuery: originalQuery,
        score,
        dimensions: analysisResult.dimensions || [],
        majorIssues: analysisResult.majorIssues || [],
        overallAssessment: analysisResult.overallAssessment
      });
    }
  });
}
1925
+
1926
+ // src/scorers/llm/prompt-alignment/prompts.ts
1927
// Judge instructions for the prompt-alignment scorer, assembled line by line
// and joined with newlines.
var PROMPT_ALIGNMENT_INSTRUCTIONS = [
  "You are an expert prompt-response alignment evaluator. Your job is to analyze how well an agent's response aligns with the user's prompt in terms of intent, requirements, completeness, and appropriateness.",
  "",
  "Key Evaluation Dimensions:",
  "1. **Intent Alignment**: Does the response address the core purpose of the prompt?",
  "2. **Requirements Fulfillment**: Are all explicit and implicit requirements met?",
  "3. **Completeness**: Is the response comprehensive and thorough?",
  "4. **Response Appropriateness**: Does the format, tone, and style match expectations?",
  "",
  "Evaluation Guidelines:",
  "- Identify the primary intent and any secondary intents in the prompt",
  "- Extract all explicit requirements (specific tasks, constraints, formats)",
  "- Consider implicit requirements based on context and standard expectations",
  "- Assess whether the response fully addresses the prompt or leaves gaps",
  "- Evaluate if the response format and tone are appropriate for the request",
  "- Be objective and focus on alignment rather than response quality",
  "",
  "Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned)."
].join("\n");
1944
/**
 * Renders the analyze-step prompt for the prompt-alignment scorer.
 *
 * The wording of each evaluation section depends on `evaluationMode`:
 * "user" and "system" select their own text; any other value falls back to
 * the combined user + system wording.
 *
 * @param {object} params
 * @param {string} params.userPrompt - User prompt text (used in "user" and combined modes).
 * @param {string} params.systemPrompt - System prompt text (used in "system" and combined modes).
 * @param {string} params.agentResponse - The response being evaluated.
 * @param {string} params.evaluationMode - "user", "system", or anything else for combined.
 * @returns {string} The rendered prompt, ending with the JSON format spec and a worked example.
 */
function createAnalyzePrompt4({
  userPrompt,
  systemPrompt,
  agentResponse,
  evaluationMode
}) {
  // Collapse every unrecognised mode into the combined variant.
  const mode = evaluationMode === "user" || evaluationMode === "system" ? evaluationMode : "both";

  // Only the prompt(s) relevant to the selected mode are interpolated.
  let context;
  if (mode === "user") {
    context = `User Prompt:
${userPrompt}`;
  } else if (mode === "system") {
    context = `System Prompt:
${systemPrompt}`;
  } else {
    context = `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
  }

  // Mode-specific wording for the headline and the four evaluation sections.
  const wording = {
    user: {
      target: "the user's prompt",
      intent: [
        "- Identify the primary intent of the user's prompt",
        "- Assess whether the response addresses this intent",
        "- Score from 0.0 (completely misses intent) to 1.0 (perfectly addresses intent)"
      ],
      requirements: [
        "- List all explicit requirements from the user prompt",
        "- Check if each requirement is fulfilled",
        "- Calculate an overall score based on fulfilled vs. total requirements"
      ],
      completeness: [
        "- Evaluate if the response is comprehensive for the user's request",
        "- Identify any missing elements that should have been included"
      ],
      appropriateness: [
        "- Check if the format matches what was requested (e.g., list, paragraph, code)",
        "- Evaluate if the tone is appropriate (e.g., formal, casual, technical)"
      ]
    },
    system: {
      target: "the system's behavioral guidelines and constraints",
      intent: [
        "- Identify the primary behavioral guidelines and constraints from the system prompt",
        "- Assess whether the response follows these guidelines",
        "- Score from 0.0 (violates system constraints) to 1.0 (perfectly follows system guidelines)"
      ],
      requirements: [
        "- List all system constraints and rules from the system prompt",
        "- Check if each constraint is respected",
        "- Calculate an overall score based on respected vs. total constraints"
      ],
      completeness: [
        "- Evaluate if the response fully adheres to all system guidelines",
        "- Identify any system rules that were not followed"
      ],
      appropriateness: [
        "- Check if the format/tone matches system specifications",
        "- Evaluate consistency with defined agent behavior"
      ]
    },
    both: {
      target: "both the user's prompt and the system's behavioral guidelines",
      intent: [
        "- Identify both the user's intent AND system behavioral guidelines",
        "- Assess whether the response addresses user intent while following system constraints",
        "- Score from 0.0 (misses both) to 1.0 (perfectly addresses both)"
      ],
      requirements: [
        "- List requirements from BOTH user prompt and system constraints",
        "- Check fulfillment of each requirement",
        "- Calculate separate scores for user requirements and system constraints, then combine"
      ],
      completeness: [
        "- Evaluate completeness for both user request AND system compliance",
        "- Identify missing elements from either perspective"
      ],
      appropriateness: [
        "- Check format/tone for both user expectations AND system requirements",
        "- Evaluate if response satisfies both perspectives"
      ]
    }
  }[mode];
  const section = (lines) => lines.join("\n");

  // Mode-independent tail: the required JSON shape followed by a worked example.
  const formatAndExample = `Format your response as:
{
"intentAlignment": {
"score": 0.0-1.0,
"primaryIntent": "the main purpose of the prompt",
"isAddressed": true/false,
"reasoning": "explanation of intent alignment"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "specific requirement from prompt",
"isFulfilled": true/false,
"reasoning": "explanation of fulfillment status"
}
],
"overallScore": 0.0-1.0
},
"completeness": {
"score": 0.0-1.0,
"missingElements": ["list of missing elements if any"],
"reasoning": "explanation of completeness assessment"
},
"responseAppropriateness": {
"score": 0.0-1.0,
"formatAlignment": true/false,
"toneAlignment": true/false,
"reasoning": "explanation of appropriateness"
},
"overallAssessment": "summary of the prompt-response alignment"
}

Example:
User Prompt: "Write a Python function to calculate factorial with error handling for negative numbers."

Agent Response: "def factorial(n):
if n < 0:
raise ValueError('Factorial not defined for negative numbers')
if n == 0:
return 1
return n * factorial(n-1)"

{
"intentAlignment": {
"score": 1.0,
"primaryIntent": "Create a Python function to calculate factorial",
"isAddressed": true,
"reasoning": "The response provides exactly what was requested - a Python function that calculates factorial"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "Write a Python function",
"isFulfilled": true,
"reasoning": "A proper Python function is provided with correct syntax"
},
{
"requirement": "Calculate factorial",
"isFulfilled": true,
"reasoning": "The function correctly implements factorial calculation using recursion"
},
{
"requirement": "Include error handling for negative numbers",
"isFulfilled": true,
"reasoning": "The function raises a ValueError for negative inputs with an appropriate message"
}
],
"overallScore": 1.0
},
"completeness": {
"score": 0.9,
"missingElements": ["No docstring or comments"],
"reasoning": "The function is complete and functional but could benefit from documentation"
},
"responseAppropriateness": {
"score": 1.0,
"formatAlignment": true,
"toneAlignment": true,
"reasoning": "The response is in the exact format requested (Python code) with appropriate technical implementation"
},
"overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
}`;

  return `Analyze how well the agent's response aligns with ${wording.target} across multiple dimensions.

${context}

Agent Response:
${agentResponse}

Evaluate the following aspects:

1. **Intent Alignment**:
${section(wording.intent)}
- Provide reasoning for your assessment

2. **Requirements Fulfillment**:
${section(wording.requirements)}
- Provide reasoning for each requirement assessment

3. **Completeness**:
${section(wording.completeness)}
- Score from 0.0 (severely incomplete) to 1.0 (fully complete)
- Provide reasoning for your assessment

4. **Response Appropriateness**:
${section(wording.appropriateness)}
- Score from 0.0 (completely inappropriate) to 1.0 (perfectly appropriate)
- Provide reasoning for your assessment

${formatAndExample}`;
}
2096
/**
 * Renders the prompt that asks the judge model to explain a prompt-alignment score.
 *
 * @param {object} params
 * @param {string} params.userPrompt - User prompt text (shown in "user" and combined modes).
 * @param {string} params.systemPrompt - System prompt text (shown in "system" and combined modes).
 * @param {number} params.score - The already-computed score; the prompt tells the model not to recalculate it.
 * @param {number} params.scale - Upper bound shown as "out of <scale>".
 * @param {object} params.analysis - Analyze-step result (intentAlignment, requirementsFulfillment,
 *   completeness, responseAppropriateness, overallAssessment).
 * @param {string} params.evaluationMode - "user", "system", or anything else for combined.
 * @returns {string} The rendered reason prompt.
 */
function createReasonPrompt5({
  userPrompt,
  systemPrompt,
  score,
  scale,
  analysis,
  evaluationMode
}) {
  const requirements = analysis.requirementsFulfillment.requirements;
  const fulfilledCount = requirements.filter((r) => r.isFulfilled).length;
  const totalRequirements = requirements.length;
  // Tolerate an analysis object without a missingElements list.
  const missingElements = analysis.completeness.missingElements ?? [];

  // Weight labels shown in the breakdown. System mode is scored with
  // 35/35/15/15 (SCORING_WEIGHTS.SYSTEM), so the breakdown must say so;
  // every other mode keeps the 40/30/20/10 labels this prompt has always
  // used. Keep these in sync with SCORING_WEIGHTS.
  const weightLabels = evaluationMode === "system" ? { intent: "35%", requirements: "35%", completeness: "15%", appropriateness: "15%" } : { intent: "40%", requirements: "30%", completeness: "20%", appropriateness: "10%" };

  const promptContext = evaluationMode === "system" ? `System Prompt:
${systemPrompt}` : evaluationMode === "user" ? `User Prompt:
${userPrompt}` : `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
  const alignmentDescription = evaluationMode === "system" ? "system behavioral guidelines and constraints" : evaluationMode === "user" ? "user's prompt" : "both user's prompt and system guidelines";
  return `Explain the prompt alignment score based on how well the agent's response addresses the ${alignmentDescription}.

${promptContext}

Score: ${score} out of ${scale}

Evaluation Breakdown:
- Intent Alignment (${weightLabels.intent} weight): ${analysis.intentAlignment.score}
Primary Intent: "${analysis.intentAlignment.primaryIntent}"
Addressed: ${analysis.intentAlignment.isAddressed ? "Yes" : "No"}
${analysis.intentAlignment.reasoning}

- Requirements Fulfillment (${weightLabels.requirements} weight): ${analysis.requirementsFulfillment.overallScore}
${fulfilledCount} out of ${totalRequirements} requirements met
${requirements.map((r) => `\u2022 ${r.requirement}: ${r.isFulfilled ? "\u2713" : "\u2717"}`).join("\n ")}

- Completeness (${weightLabels.completeness} weight): ${analysis.completeness.score}
${missingElements.length > 0 ? `Missing elements: ${missingElements.join(", ")}` : "Response is complete"}
${analysis.completeness.reasoning}

- Response Appropriateness (${weightLabels.appropriateness} weight): ${analysis.responseAppropriateness.score}
Format: ${analysis.responseAppropriateness.formatAlignment ? "Aligned" : "Misaligned"}
Tone: ${analysis.responseAppropriateness.toneAlignment ? "Aligned" : "Misaligned"}
${analysis.responseAppropriateness.reasoning}

Overall Assessment: ${analysis.overallAssessment}

Prompt Alignment measures how well the response addresses the user's request across intent, requirements, completeness, and appropriateness. The weighted scoring ensures primary focus on understanding and addressing the core intent while meeting specific requirements.

Rules for explanation:
- Summarize the key strengths and weaknesses of alignment
- Highlight any major misalignments that significantly impacted the score
- Be concise but comprehensive in the explanation
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of alignment strengths and weaknesses based on the weighted dimensions}"

Example responses:
"The score is 0.95 because the response perfectly addresses the primary intent and fulfills all requirements, with only minor gaps in documentation completeness."
"The score is 0.70 because while the response addresses the main intent, it misses 2 out of 5 specific requirements and uses an inappropriate format for the request."
"The score is 0.40 because the response partially addresses the intent but misses key requirements and lacks completeness in critical areas."`;
}
2157
+
2158
+ // src/scorers/llm/prompt-alignment/index.ts
2159
// Structured output the judge model must return from the prompt-alignment
// analyze step: one sub-object per evaluated dimension plus a summary.
var analyzeOutputSchema4 = (() => {
  // Each call builds a fresh number schema constrained to 0..1.
  const unitScore = () => z.number().min(0).max(1);
  // A single requirement extracted from the prompt and its verdict.
  const requirementSchema = z.object({
    requirement: z.string(),
    isFulfilled: z.boolean(),
    reasoning: z.string()
  });
  const intentAlignment = z.object({
    score: unitScore(),
    primaryIntent: z.string(),
    isAddressed: z.boolean(),
    reasoning: z.string()
  });
  const requirementsFulfillment = z.object({
    requirements: z.array(requirementSchema),
    overallScore: unitScore()
  });
  const completeness = z.object({
    score: unitScore(),
    missingElements: z.array(z.string()),
    reasoning: z.string()
  });
  const responseAppropriateness = z.object({
    score: unitScore(),
    formatAlignment: z.boolean(),
    toneAlignment: z.boolean(),
    reasoning: z.string()
  });
  return z.object({
    intentAlignment,
    requirementsFulfillment,
    completeness,
    responseAppropriateness,
    overallAssessment: z.string()
  });
})();

// Dimension weights used to combine the four analysis scores into one value.
var SCORING_WEIGHTS = {
  // Weights applied when evaluating against the user prompt.
  USER: {
    // 40% - core intent carries the most weight
    INTENT_ALIGNMENT: 0.4,
    // 30% - meeting specific requirements
    REQUIREMENTS_FULFILLMENT: 0.3,
    // 20% - comprehensive response
    COMPLETENESS: 0.2,
    // 10% - format and tone matching
    RESPONSE_APPROPRIATENESS: 0.1
  },
  // Weights applied when evaluating against the system prompt.
  SYSTEM: {
    // 35% - following system behavioral guidelines
    INTENT_ALIGNMENT: 0.35,
    // 35% - meeting system constraints
    REQUIREMENTS_FULFILLMENT: 0.35,
    // 15% - adherence to all system rules
    COMPLETENESS: 0.15,
    // 15% - consistency with system tone/format
    RESPONSE_APPROPRIATENESS: 0.15
  },
  // Combined evaluation: user alignment counts 70%, system alignment 30%.
  BOTH: {
    USER_WEIGHT: 0.7,
    SYSTEM_WEIGHT: 0.3
  }
};
2216
/**
 * Creates an LLM-judged scorer that rates how well an agent response aligns
 * with the user prompt, the system prompt, or both.
 *
 * @param {object} params
 * @param {*} params.model - Judge model handed to `createScorer`.
 * @param {object} [params.options]
 * @param {number} [params.options.scale=1] - Multiplier applied to the 0..1 weighted score.
 * @param {"user"|"system"|"both"} [params.options.evaluationMode="both"] - Which prompt(s) to align against.
 * @returns The scorer produced by the `createScorer(...).analyze().generateScore().generateReason()` chain.
 */
function createPromptAlignmentScorerLLM({
  model,
  options
}) {
  const scale = options?.scale || 1;
  const evaluationMode = options?.evaluationMode || "both";
  return createScorer({
    name: "Prompt Alignment (LLM)",
    description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
    judge: {
      model,
      instructions: PROMPT_ALIGNMENT_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze prompt-response alignment across multiple dimensions",
    outputSchema: analyzeOutputSchema4,
    createPrompt: ({ run }) => {
      const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
      const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
      // Each mode requires the prompt(s) it evaluates against.
      if (evaluationMode === "user" && !userPrompt) {
        throw new Error("User prompt is required for user prompt alignment scoring");
      }
      if (evaluationMode === "system" && !systemPrompt) {
        throw new Error("System prompt is required for system prompt alignment scoring");
      }
      if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
        throw new Error("Both user and system prompts are required for combined alignment scoring");
      }
      if (!agentResponse) {
        throw new Error("Agent response is required for prompt alignment scoring");
      }
      return createAnalyzePrompt4({
        userPrompt,
        systemPrompt,
        agentResponse,
        evaluationMode
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis) {
      return 0;
    }
    // Weighted sum of the four dimension scores for a given weight set.
    const applyWeights = (weights) => analysis.intentAlignment.score * weights.INTENT_ALIGNMENT + analysis.requirementsFulfillment.overallScore * weights.REQUIREMENTS_FULFILLMENT + analysis.completeness.score * weights.COMPLETENESS + analysis.responseAppropriateness.score * weights.RESPONSE_APPROPRIATENESS;

    let weightedScore;
    if (evaluationMode === "user") {
      weightedScore = applyWeights(SCORING_WEIGHTS.USER);
    } else if (evaluationMode === "system") {
      weightedScore = applyWeights(SCORING_WEIGHTS.SYSTEM);
    } else {
      // Combined mode: the single analysis is weighted once with the user
      // weights and once with the system weights, then blended 70/30.
      // Previously the system score was a copy of the user score, which
      // made the blend a no-op and left the SYSTEM weights unused.
      const userScore = applyWeights(SCORING_WEIGHTS.USER);
      const systemScore = applyWeights(SCORING_WEIGHTS.SYSTEM);
      weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
    }
    return roundToTwoDecimals$1(weightedScore * scale);
  }).generateReason({
    description: "Generate human-readable explanation of prompt alignment evaluation",
    createPrompt: ({ run, results, score }) => {
      const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
      const analysis = results.analyzeStepResult;
      if (!analysis) {
        return `Unable to analyze prompt alignment. Score: ${score}`;
      }
      return createReasonPrompt5({
        userPrompt,
        systemPrompt,
        score,
        scale,
        analysis,
        evaluationMode
      });
    }
  });
}
2292
+
2293
+ export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
1160
2294
  //# sourceMappingURL=index.js.map
1161
2295
  //# sourceMappingURL=index.js.map