@mastra/evals 0.13.2 → 0.13.3-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/{chunk-5CVZXIFW.js → chunk-4LRZVFXR.js} +32 -3
  2. package/dist/chunk-4LRZVFXR.js.map +1 -0
  3. package/dist/{chunk-QVZBKGOE.cjs → chunk-EKSPLMYP.cjs} +32 -2
  4. package/dist/chunk-EKSPLMYP.cjs.map +1 -0
  5. package/dist/{dist-JVIEAZJ6.js → dist-CI72CYZJ.js} +10 -10
  6. package/dist/{dist-JVIEAZJ6.js.map → dist-CI72CYZJ.js.map} +1 -1
  7. package/dist/{dist-JQCAD3AD.cjs → dist-IKJJ2AX4.cjs} +10 -10
  8. package/dist/{dist-JQCAD3AD.cjs.map → dist-IKJJ2AX4.cjs.map} +1 -1
  9. package/dist/index.cjs +1 -1
  10. package/dist/index.js +1 -1
  11. package/dist/{magic-string.es-NBXOXRCK.cjs → magic-string.es-VZN2EYER.cjs} +3 -3
  12. package/dist/{magic-string.es-NBXOXRCK.cjs.map → magic-string.es-VZN2EYER.cjs.map} +1 -1
  13. package/dist/{magic-string.es-6JSI7KY4.js → magic-string.es-WQRLTQPQ.js} +3 -3
  14. package/dist/{magic-string.es-6JSI7KY4.js.map → magic-string.es-WQRLTQPQ.js.map} +1 -1
  15. package/dist/scorers/code/index.cjs +2 -2
  16. package/dist/scorers/code/index.js +1 -1
  17. package/dist/scorers/llm/context-precision/index.d.ts +18 -0
  18. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -0
  19. package/dist/scorers/llm/context-precision/prompts.d.ts +19 -0
  20. package/dist/scorers/llm/context-precision/prompts.d.ts.map +1 -0
  21. package/dist/scorers/llm/context-relevance/index.d.ts +27 -0
  22. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -0
  23. package/dist/scorers/llm/context-relevance/prompts.d.ts +20 -0
  24. package/dist/scorers/llm/context-relevance/prompts.d.ts.map +1 -0
  25. package/dist/scorers/llm/index.cjs +1163 -25
  26. package/dist/scorers/llm/index.cjs.map +1 -1
  27. package/dist/scorers/llm/index.d.ts +4 -0
  28. package/dist/scorers/llm/index.d.ts.map +1 -1
  29. package/dist/scorers/llm/index.js +1137 -3
  30. package/dist/scorers/llm/index.js.map +1 -1
  31. package/dist/scorers/llm/noise-sensitivity/index.d.ts +36 -0
  32. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/noise-sensitivity/prompts.d.ts +21 -0
  34. package/dist/scorers/llm/noise-sensitivity/prompts.d.ts.map +1 -0
  35. package/dist/scorers/llm/prompt-alignment/index.d.ts +38 -0
  36. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -0
  37. package/dist/scorers/llm/prompt-alignment/prompts.d.ts +44 -0
  38. package/dist/scorers/llm/prompt-alignment/prompts.d.ts.map +1 -0
  39. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -4
  40. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
  41. package/dist/scorers/utils.d.ts +2 -0
  42. package/dist/scorers/utils.d.ts.map +1 -1
  43. package/package.json +3 -3
  44. package/dist/chunk-5CVZXIFW.js.map +0 -1
  45. package/dist/chunk-QVZBKGOE.cjs.map +0 -1
@@ -1,7 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
4
- var chunkQVZBKGOE_cjs = require('../../chunk-QVZBKGOE.cjs');
4
+ var chunkEKSPLMYP_cjs = require('../../chunk-EKSPLMYP.cjs');
5
5
  var scores = require('@mastra/core/scores');
6
6
  var zod = require('zod');
7
7
 
@@ -227,14 +227,14 @@ function createAnswerRelevancyScorer({
227
227
  description: "Extract relevant statements from the LLM output",
228
228
  outputSchema: extractOutputSchema,
229
229
  createPrompt: ({ run }) => {
230
- const assistantMessage = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
230
+ const assistantMessage = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
231
231
  return createExtractPrompt(assistantMessage);
232
232
  }
233
233
  }).analyze({
234
234
  description: "Score the relevance of the statements to the input",
235
235
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
236
236
  createPrompt: ({ run, results }) => {
237
- const input = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
237
+ const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
238
238
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
239
239
  }
240
240
  }).generateScore(({ results }) => {
@@ -256,8 +256,8 @@ function createAnswerRelevancyScorer({
256
256
  description: "Reason about the results",
257
257
  createPrompt: ({ run, results, score }) => {
258
258
  return createReasonPrompt({
259
- input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
260
- output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
259
+ input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
260
+ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
261
261
  score,
262
262
  results: results.analyzeStepResult.results,
263
263
  scale: options.scale
@@ -435,7 +435,7 @@ function createFaithfulnessScorer({
435
435
  description: "Extract relevant statements from the LLM output",
436
436
  outputSchema: zod.z.array(zod.z.string()),
437
437
  createPrompt: ({ run }) => {
438
- const prompt = createFaithfulnessExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
438
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
439
439
  return prompt;
440
440
  }
441
441
  }).analyze({
@@ -456,13 +456,13 @@ function createFaithfulnessScorer({
456
456
  return 0;
457
457
  }
458
458
  const score = supportedClaims / totalClaims * (options?.scale || 1);
459
- return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
459
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
460
460
  }).generateReason({
461
461
  description: "Reason about the results",
462
462
  createPrompt: ({ run, results, score }) => {
463
463
  const prompt = createFaithfulnessReasonPrompt({
464
- input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
465
- output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
464
+ input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
465
+ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
466
466
  context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
467
467
  score,
468
468
  scale: options?.scale || 1,
@@ -593,13 +593,13 @@ function createBiasScorer({ model, options }) {
593
593
  outputSchema: zod.z.object({
594
594
  opinions: zod.z.array(zod.z.string())
595
595
  }),
596
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
596
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
597
597
  }).analyze({
598
598
  description: "Score the relevance of the statements to the input",
599
599
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
600
600
  createPrompt: ({ run, results }) => {
601
601
  const prompt = createBiasAnalyzePrompt({
602
- output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
602
+ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
603
603
  opinions: results.preprocessStepResult?.opinions || []
604
604
  });
605
605
  return prompt;
@@ -610,7 +610,7 @@ function createBiasScorer({ model, options }) {
610
610
  }
611
611
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
612
612
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
613
- return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
613
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
614
614
  }).generateReason({
615
615
  description: "Reason about the results",
616
616
  createPrompt: ({ score, results }) => {
@@ -827,7 +827,7 @@ function createHallucinationScorer({
827
827
  claims: zod.z.array(zod.z.string())
828
828
  }),
829
829
  createPrompt: ({ run }) => {
830
- const prompt = createHallucinationExtractPrompt({ output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
830
+ const prompt = createHallucinationExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
831
831
  return prompt;
832
832
  }
833
833
  }).analyze({
@@ -849,13 +849,13 @@ function createHallucinationScorer({
849
849
  return 0;
850
850
  }
851
851
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
852
- return chunkQVZBKGOE_cjs.roundToTwoDecimals(score);
852
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
853
853
  }).generateReason({
854
854
  description: "Reason about the results",
855
855
  createPrompt: ({ run, results, score }) => {
856
856
  const prompt = createHallucinationReasonPrompt({
857
- input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
858
- output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
857
+ input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
858
+ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
859
859
  context: options?.context || [],
860
860
  score,
861
861
  scale: options?.scale || 1,
@@ -964,8 +964,8 @@ function createToxicityScorer({ model, options }) {
964
964
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
965
965
  createPrompt: ({ run }) => {
966
966
  const prompt = createToxicityAnalyzePrompt({
967
- input: chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "",
968
- output: chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
967
+ input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
968
+ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
969
969
  });
970
970
  return prompt;
971
971
  }
@@ -981,7 +981,7 @@ function createToxicityScorer({ model, options }) {
981
981
  }
982
982
  }
983
983
  const score = toxicityCount / numberOfVerdicts;
984
- return chunkQVZBKGOE_cjs.roundToTwoDecimals(score * (options?.scale || 1));
984
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
985
985
  }).generateReason({
986
986
  description: "Reason about the results",
987
987
  createPrompt: ({ results, score }) => {
@@ -1099,7 +1099,7 @@ var analyzeOutputSchema = zod.z.object({
1099
1099
  missingTools: zod.z.array(zod.z.string()).optional()
1100
1100
  });
1101
1101
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1102
- const toolDefinitions = availableTools.map((tool) => `${tool.name}: ${tool.description}`).join("\n");
1102
+ const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
1103
1103
  return scores.createScorer({
1104
1104
  name: "Tool Call Accuracy (LLM)",
1105
1105
  description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
@@ -1113,7 +1113,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1113
1113
  if (isInputInvalid || isOutputInvalid) {
1114
1114
  throw new Error("Input and output messages cannot be null or empty");
1115
1115
  }
1116
- const { tools: actualTools, toolCallInfos } = chunkQVZBKGOE_cjs.extractToolCalls(run.output);
1116
+ const { tools: actualTools, toolCallInfos } = chunkEKSPLMYP_cjs.extractToolCalls(run.output);
1117
1117
  return {
1118
1118
  actualTools,
1119
1119
  hasToolCalls: actualTools.length > 0,
@@ -1123,8 +1123,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1123
1123
  description: "Analyze the appropriateness of tool selections",
1124
1124
  outputSchema: analyzeOutputSchema,
1125
1125
  createPrompt: ({ run, results }) => {
1126
- const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
1127
- const agentResponse = chunkQVZBKGOE_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1126
+ const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
1127
+ const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1128
1128
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
1129
1129
  return createAnalyzePrompt({
1130
1130
  userInput,
@@ -1141,11 +1141,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1141
1141
  }
1142
1142
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
1143
1143
  const totalToolCalls = evaluations.length;
1144
- return chunkQVZBKGOE_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1144
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1145
1145
  }).generateReason({
1146
1146
  description: "Generate human-readable explanation of tool selection evaluation",
1147
1147
  createPrompt: ({ run, results, score }) => {
1148
- const userInput = chunkQVZBKGOE_cjs.getUserMessageFromRunInput(run.input) ?? "";
1148
+ const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
1149
1149
  const evaluations = results.analyzeStepResult?.evaluations || [];
1150
1150
  const missingTools = results.analyzeStepResult?.missingTools || [];
1151
1151
  return createReasonPrompt2({
@@ -1158,12 +1158,1150 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1158
1158
  });
1159
1159
  }
1160
1160
 
1161
+ // src/scorers/llm/context-relevance/prompts.ts
1162
+ var CONTEXT_RELEVANCE_INSTRUCTIONS = `You are an expert context relevance evaluator. Your job is to analyze whether the provided context information was appropriate and useful for generating the agent's response to the user's query.
1163
+
1164
+ Key Evaluation Criteria:
1165
+ 1. **Relevance**: Does the context directly relate to the user's query?
1166
+ 2. **Utility**: Did the context help produce a better response?
1167
+ 3. **Completeness**: Was the context sufficient for the task?
1168
+ 4. **Quality**: Is the context accurate and trustworthy?
1169
+
1170
+ Evaluation Guidelines:
1171
+ - Context that directly answers or supports the user's query should be marked as highly relevant
1172
+ - Context that provides background information relevant to the query should be considered moderately relevant
1173
+ - Context that is tangentially related but doesn't directly help should be marked as low relevance
1174
+ - Context that is completely unrelated should be marked as irrelevant
1175
+ - Consider whether missing context might have led to a better response
1176
+
1177
+ Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
1178
+ function createAnalyzePrompt2({
1179
+ userQuery,
1180
+ agentResponse,
1181
+ providedContext
1182
+ }) {
1183
+ const contextList = providedContext.map((ctx, index) => `[${index}] ${ctx}`).join("\n");
1184
+ return `Analyze the relevance of the provided context for answering the user's query and generating the agent's response.
1185
+
1186
+ User Query:
1187
+ ${userQuery}
1188
+
1189
+ Agent Response:
1190
+ ${agentResponse}
1191
+
1192
+ Context pieces to evaluate:
1193
+ ${contextList}
1194
+
1195
+ For each context piece, evaluate:
1196
+ 1. **Relevance Level**: How relevant is it to the user's query?
1197
+ - "high": Directly addresses the query or provides essential information
1198
+ - "medium": Provides supporting or background information that's helpful
1199
+ - "low": Tangentially related but not very helpful
1200
+ - "none": Completely irrelevant or unrelated
1201
+
1202
+ 2. **Usage**: Was this context actually used in generating the agent's response?
1203
+ - true: The response clearly incorporates or reflects this information
1204
+ - false: This information doesn't appear to be used in the response
1205
+
1206
+ 3. **Reasoning**: Explain your assessment in detail
1207
+
1208
+ Also identify any missing context that should have been provided to better answer the query.
1209
+
1210
+ Format your response as:
1211
+ {
1212
+ "evaluations": [
1213
+ {
1214
+ "context_index": 0,
1215
+ "contextPiece": "the actual text of the context piece",
1216
+ "relevanceLevel": "high/medium/low/none",
1217
+ "wasUsed": true/false,
1218
+ "reasoning": "detailed explanation of the evaluation"
1219
+ }
1220
+ ],
1221
+ "missingContext": ["list of missing information that would have been helpful"],
1222
+ "overallAssessment": "summary of the context quality and usage"
1223
+ }
1224
+
1225
+ The number of evaluations MUST match the number of context pieces exactly.
1226
+
1227
+ Example:
1228
+ User Query: "What are the benefits of exercise?"
1229
+ Agent Response: "Regular exercise improves cardiovascular health and mental wellbeing."
1230
+ Context:
1231
+ [0] "Exercise strengthens the heart and improves blood circulation."
1232
+ [1] "A balanced diet is important for overall health."
1233
+ [2] "Regular physical activity reduces stress and anxiety levels."
1234
+
1235
+ {
1236
+ "evaluations": [
1237
+ {
1238
+ "context_index": 0,
1239
+ "contextPiece": "Exercise strengthens the heart and improves blood circulation.",
1240
+ "relevanceLevel": "high",
1241
+ "wasUsed": true,
1242
+ "reasoning": "This context directly supports the cardiovascular health benefit mentioned in the response"
1243
+ },
1244
+ {
1245
+ "context_index": 1,
1246
+ "contextPiece": "A balanced diet is important for overall health.",
1247
+ "relevanceLevel": "none",
1248
+ "wasUsed": false,
1249
+ "reasoning": "This context is about diet, not exercise benefits, and doesn't contribute to answering the query"
1250
+ },
1251
+ {
1252
+ "context_index": 2,
1253
+ "contextPiece": "Regular physical activity reduces stress and anxiety levels.",
1254
+ "relevanceLevel": "high",
1255
+ "wasUsed": true,
1256
+ "reasoning": "This context directly supports the mental wellbeing benefit mentioned in the response"
1257
+ }
1258
+ ],
1259
+ "missingContext": [],
1260
+ "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
1261
+ }`;
1262
+ }
1263
+ function createReasonPrompt3({
1264
+ userQuery,
1265
+ score,
1266
+ evaluations,
1267
+ missingContext,
1268
+ scale
1269
+ }) {
1270
+ return `Explain the context relevance score for the provided context based on its relevance and usage in generating the agent's response.
1271
+
1272
+ User Query:
1273
+ ${userQuery}
1274
+
1275
+ Score: ${score} out of ${scale}
1276
+
1277
+ Context Evaluations:
1278
+ ${evaluations.map(
1279
+ (evaluation) => `[${evaluation.context_index}] Relevance: ${evaluation.relevanceLevel}, Used: ${evaluation.wasUsed ? "Yes" : "No"}
1280
+ Context: "${evaluation.contextPiece}"
1281
+ Reasoning: ${evaluation.reasoning}`
1282
+ ).join("\n\n")}
1283
+
1284
+ ${missingContext.length > 0 ? `
1285
+ Missing Context Issues:
1286
+ ${missingContext.map((item) => `- ${item}`).join("\n")}` : ""}
1287
+
1288
+ Context Relevance measures how well the provided context supports answering the user's query and generating the expected response. The score considers:
1289
+ - Relevance levels (high=1.0, medium=0.7, low=0.3, none=0.0)
1290
+ - Usage penalties (10% penalty per unused high-relevance context)
1291
+ - Missing context penalties (up to 50% penalty for identified gaps)
1292
+
1293
+ Rules for explanation:
1294
+ - Explain the score based on context relevance levels and usage
1295
+ - Mention any penalties applied for unused relevant context or missing information
1296
+ - Keep explanation concise and actionable for improving context selection
1297
+ - Use the given score, don't recalculate
1298
+
1299
+ Format:
1300
+ "The score is ${score} because {explanation of context relevance, usage, and any penalties}"
1301
+
1302
+ Example responses:
1303
+ "The score is 0.85 because 2 out of 3 context pieces are highly relevant and used in the response, with only minor penalty for one unused medium-relevance context piece."
1304
+ "The score is 1.0 because all context pieces are highly relevant to the query about exercise benefits and were effectively used in generating the comprehensive response."
1305
+ "The score is 0.40 because while some context is relevant, key information about the topic was missing and one highly relevant context piece was not utilized in the response."`;
1306
+ }
1307
+
1308
+ // src/scorers/llm/context-relevance/index.ts
1309
+ var analyzeOutputSchema2 = zod.z.object({
1310
+ evaluations: zod.z.array(
1311
+ zod.z.object({
1312
+ context_index: zod.z.number(),
1313
+ contextPiece: zod.z.string(),
1314
+ relevanceLevel: zod.z.enum(["high", "medium", "low", "none"]),
1315
+ wasUsed: zod.z.boolean(),
1316
+ reasoning: zod.z.string()
1317
+ })
1318
+ ),
1319
+ missingContext: zod.z.array(zod.z.string()).optional().default([]),
1320
+ overallAssessment: zod.z.string()
1321
+ });
1322
+ var DEFAULT_PENALTIES = {
1323
+ UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
1324
+ // 10% penalty per unused high-relevance context
1325
+ MISSING_CONTEXT_PER_ITEM: 0.15,
1326
+ // 15% penalty per missing context item
1327
+ MAX_MISSING_CONTEXT_PENALTY: 0.5
1328
+ // Maximum 50% penalty for missing context
1329
+ };
1330
+ function createContextRelevanceScorerLLM({
1331
+ model,
1332
+ options
1333
+ }) {
1334
+ if (!options.context && !options.contextExtractor) {
1335
+ throw new Error("Either context or contextExtractor is required for Context Relevance scoring");
1336
+ }
1337
+ if (options.context && options.context.length === 0) {
1338
+ throw new Error("Context array cannot be empty if provided");
1339
+ }
1340
+ return scores.createScorer({
1341
+ name: "Context Relevance (LLM)",
1342
+ description: "Evaluates how relevant and useful the provided context was for generating the agent response",
1343
+ judge: {
1344
+ model,
1345
+ instructions: CONTEXT_RELEVANCE_INSTRUCTIONS
1346
+ }
1347
+ }).analyze({
1348
+ description: "Analyze the relevance and utility of provided context",
1349
+ outputSchema: analyzeOutputSchema2,
1350
+ createPrompt: ({ run }) => {
1351
+ const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
1352
+ const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1353
+ const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1354
+ if (context.length === 0) {
1355
+ return createAnalyzePrompt2({
1356
+ userQuery,
1357
+ agentResponse,
1358
+ providedContext: ["[No context was provided for evaluation]"]
1359
+ });
1360
+ }
1361
+ return createAnalyzePrompt2({
1362
+ userQuery,
1363
+ agentResponse,
1364
+ providedContext: context
1365
+ });
1366
+ }
1367
+ }).generateScore(({ results, run }) => {
1368
+ const evaluations = results.analyzeStepResult?.evaluations || [];
1369
+ const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1370
+ if (context.length === 0) {
1371
+ return 1 * (options.scale || 1);
1372
+ }
1373
+ if (evaluations.length === 0) {
1374
+ const missingContext2 = results.analyzeStepResult?.missingContext || [];
1375
+ return missingContext2.length > 0 ? 0 : 1;
1376
+ }
1377
+ const relevanceWeights = {
1378
+ high: 1,
1379
+ medium: 0.7,
1380
+ low: 0.3,
1381
+ none: 0
1382
+ };
1383
+ const totalWeight = evaluations.reduce((sum, evaluation) => {
1384
+ return sum + relevanceWeights[evaluation.relevanceLevel];
1385
+ }, 0);
1386
+ const maxPossibleWeight = evaluations.length * relevanceWeights.high;
1387
+ const relevanceScore = maxPossibleWeight > 0 ? totalWeight / maxPossibleWeight : 0;
1388
+ const highRelevanceUnused = evaluations.filter(
1389
+ (evaluation) => evaluation.relevanceLevel === "high" && !evaluation.wasUsed
1390
+ ).length;
1391
+ const penalties = options.penalties || {};
1392
+ const unusedPenaltyRate = penalties.unusedHighRelevanceContext ?? DEFAULT_PENALTIES.UNUSED_HIGH_RELEVANCE_CONTEXT;
1393
+ const missingPenaltyRate = penalties.missingContextPerItem ?? DEFAULT_PENALTIES.MISSING_CONTEXT_PER_ITEM;
1394
+ const maxMissingPenalty = penalties.maxMissingContextPenalty ?? DEFAULT_PENALTIES.MAX_MISSING_CONTEXT_PENALTY;
1395
+ const usagePenalty = highRelevanceUnused * unusedPenaltyRate;
1396
+ const missingContext = results.analyzeStepResult?.missingContext || [];
1397
+ const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
1398
+ const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
1399
+ const scaledScore = finalScore * (options.scale || 1);
1400
+ return chunkEKSPLMYP_cjs.roundToTwoDecimals(scaledScore);
1401
+ }).generateReason({
1402
+ description: "Generate human-readable explanation of context relevance evaluation",
1403
+ createPrompt: ({ run, results, score }) => {
1404
+ const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
1405
+ const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1406
+ if (context.length === 0) {
1407
+ return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
1408
+ }
1409
+ const evaluations = results.analyzeStepResult?.evaluations || [];
1410
+ const missingContext = results.analyzeStepResult?.missingContext || [];
1411
+ return createReasonPrompt3({
1412
+ userQuery,
1413
+ score,
1414
+ evaluations,
1415
+ missingContext,
1416
+ scale: options.scale || 1
1417
+ });
1418
+ }
1419
+ });
1420
+ }
1421
+
1422
+ // src/scorers/llm/context-precision/prompts.ts
1423
+ var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = `You are a precise context precision evaluator. Your job is to determine if context nodes are relevant for generating the expected output based on the input query.
1424
+
1425
+ Key Principles:
1426
+ 1. Evaluate each context piece independently for relevance to the input-output pair
1427
+ 2. Consider relevance as the ability of the context to contribute to generating the expected output
1428
+ 3. Mark context as relevant only if it directly supports or informs the expected output
1429
+ 4. Consider the input query when determining relevance
1430
+ 5. Focus on practical utility for output generation, not just topical similarity
1431
+ 6. Be strict in your evaluation - context must be clearly useful for generating the output
1432
+ 7. Context that provides background but doesn't directly contribute should be marked as not relevant`;
1433
+ function createContextRelevancePrompt({
1434
+ input,
1435
+ output,
1436
+ context
1437
+ }) {
1438
+ return `Evaluate the relevance of each context piece for generating the expected output given the input query.
1439
+
1440
+ Input Query:
1441
+ ${input}
1442
+
1443
+ Expected Output:
1444
+ ${output}
1445
+
1446
+ Context pieces to evaluate:
1447
+ ${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
1448
+
1449
+ For each context piece, determine if it is relevant for generating the expected output. A context piece is relevant if:
1450
+ - It provides information that directly supports or informs the expected output
1451
+ - It contains facts, data, or details that are needed to answer the input query
1452
+ - It contributes to the accuracy or completeness of the expected output
1453
+
1454
+ Mark as "yes" only if the context piece is clearly useful for generating the output.
1455
+ Mark as "no" if the context piece does not contribute to generating the expected output.
1456
+
1457
+ Format your response as:
1458
+ {
1459
+ "verdicts": [
1460
+ {
1461
+ "context_index": 0,
1462
+ "verdict": "yes/no",
1463
+ "reason": "explanation of why this context is or isn't relevant"
1464
+ }
1465
+ ]
1466
+ }
1467
+
1468
+ The number of verdicts MUST match the number of context pieces exactly.
1469
+
1470
+ Example:
1471
+ Input: "What are the benefits of exercise?"
1472
+ Output: "Regular exercise improves cardiovascular health and mental wellbeing."
1473
+ Context:
1474
+ [0] "Exercise strengthens the heart and improves blood circulation."
1475
+ [1] "A balanced diet is important for health."
1476
+ [2] "Regular physical activity reduces stress and anxiety."
1477
+
1478
+ {
1479
+ "verdicts": [
1480
+ {
1481
+ "context_index": 0,
1482
+ "verdict": "yes",
1483
+ "reason": "This context directly supports the cardiovascular health benefit mentioned in the output"
1484
+ },
1485
+ {
1486
+ "context_index": 1,
1487
+ "verdict": "no",
1488
+ "reason": "This context is about diet, not exercise benefits, and doesn't contribute to the expected output"
1489
+ },
1490
+ {
1491
+ "context_index": 2,
1492
+ "verdict": "yes",
1493
+ "reason": "This context directly supports the mental wellbeing benefit mentioned in the output"
1494
+ }
1495
+ ]
1496
+ }`;
1497
+ }
1498
+ function createContextPrecisionReasonPrompt({
1499
+ input,
1500
+ output,
1501
+ context,
1502
+ score,
1503
+ scale,
1504
+ verdicts
1505
+ }) {
1506
+ return `Explain the context precision score for the retrieved context based on its relevance to generating the expected output.
1507
+
1508
+ Input Query:
1509
+ ${input}
1510
+
1511
+ Expected Output:
1512
+ ${output}
1513
+
1514
+ Context pieces:
1515
+ ${context.map((ctx, index) => `[${index}] ${ctx}`).join("\n")}
1516
+
1517
+ Score: ${score} out of ${scale}
1518
+ Verdicts:
1519
+ ${JSON.stringify(verdicts, null, 2)}
1520
+
1521
+ Context Precision measures how relevant and precise the retrieved context nodes are for generating the expected output. The score is calculated using Mean Average Precision (MAP) which:
1522
+ - Gives binary relevance scores (1 for relevant, 0 for irrelevant)
1523
+ - Weights earlier positions more heavily in the scoring
1524
+ - Rewards having relevant context early in the sequence
1525
+
1526
+ Rules for explanation:
1527
+ - Explain the score based on which context pieces were relevant and their positions
1528
+ - Mention how the positioning affects the MAP score
1529
+ - Keep explanation concise and focused on context quality
1530
+ - Use the given score, don't recalculate
1531
+ - Focus on how well the context supports generating the expected output
1532
+
1533
+ Format:
1534
+ "The score is ${score} because {explanation of context precision and positioning}"
1535
+
1536
+ Example responses:
1537
+ "The score is 0.75 because the first and third contexts are highly relevant to the benefits mentioned in the output, while the second and fourth contexts are not directly related to exercise benefits. The relevant contexts are well-positioned at the beginning and middle of the sequence."
1538
+ "The score is 1.0 because all context pieces are relevant for generating the expected output and are optimally ordered."
1539
+ "The score is 0.33 because only the first context piece is relevant to the query, and the remaining contexts don't contribute to generating the expected output about exercise benefits."`;
1540
+ }
1541
+
1542
+ // src/scorers/llm/context-precision/index.ts
1543
+ var contextRelevanceOutputSchema = zod.z.object({
1544
+ verdicts: zod.z.array(
1545
+ zod.z.object({
1546
+ context_index: zod.z.number(),
1547
+ verdict: zod.z.string(),
1548
+ reason: zod.z.string()
1549
+ })
1550
+ )
1551
+ });
1552
/**
 * Creates the Context Precision scorer.
 *
 * The scorer asks the judge for a verdict per context piece, then computes
 * Mean Average Precision: walking the verdicts in `context_index` order, every
 * "yes" verdict contributes (relevant-so-far / position), and the sum is divided
 * by the number of relevant pieces. The result is multiplied by `options.scale`
 * (1 when not set) and rounded to two decimals.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, passed through to `scores.createScorer`.
 * @param {object} params.options - Requires `context` (non-empty array) or
 *   `contextExtractor(input, output)`; optional `scale`.
 * @returns The scorer built by the `scores.createScorer(...)` chain.
 * @throws {Error} When neither `context` nor `contextExtractor` is supplied, or
 *   when `context` is an empty array.
 */
function createContextPrecisionScorer({
  model,
  options
}) {
  // Optional chaining so a missing `options` object yields the descriptive error
  // below rather than a TypeError on property access.
  if (!options?.context && !options?.contextExtractor) {
    throw new Error("Either context or contextExtractor is required for Context Precision scoring");
  }
  if (options.context && options.context.length === 0) {
    throw new Error("Context array cannot be empty if provided");
  }
  const scale = options.scale || 1;
  // The extractor wins over static context; a nullish extractor result is
  // treated as "no context" so the emptiness check can report it clearly.
  const resolveContext = (run) => {
    const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
    return context ?? [];
  };
  return scores.createScorer({
    name: "Context Precision Scorer",
    description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
    judge: {
      model,
      instructions: CONTEXT_PRECISION_AGENT_INSTRUCTIONS
    }
  }).analyze({
    description: "Evaluate the relevance of each context piece for generating the expected output",
    outputSchema: contextRelevanceOutputSchema,
    createPrompt: ({ run }) => {
      const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
      const context = resolveContext(run);
      if (context.length === 0) {
        throw new Error("No context available for evaluation");
      }
      return createContextRelevancePrompt({
        input,
        output,
        context
      });
    }
  }).generateScore(({ results }) => {
    const verdicts = results.analyzeStepResult?.verdicts ?? [];
    if (verdicts.length === 0) {
      return 0;
    }
    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the analyze-step result that later steps also read.
    const orderedVerdicts = [...verdicts].sort((a, b) => a.context_index - b.context_index);
    let sumPrecision = 0;
    let relevantCount = 0;
    orderedVerdicts.forEach((entry, position) => {
      const isRelevant = entry?.verdict?.toLowerCase().trim() === "yes";
      if (isRelevant) {
        relevantCount++;
        // Precision at this rank: relevant pieces seen so far over pieces seen so far.
        sumPrecision += relevantCount / (position + 1);
      }
    });
    if (relevantCount === 0) {
      return 0;
    }
    const meanAveragePrecision = sumPrecision / relevantCount;
    return chunkEKSPLMYP_cjs.roundToTwoDecimals(meanAveragePrecision * scale);
  }).generateReason({
    description: "Reason about the context precision results",
    createPrompt: ({ run, results, score }) => {
      const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
      return createContextPrecisionReasonPrompt({
        input,
        output,
        context: resolveContext(run),
        score,
        scale,
        verdicts: results.analyzeStepResult?.verdicts || []
      });
    }
  });
}
1625
+
1626
+ // src/scorers/llm/noise-sensitivity/prompts.ts
1627
/**
 * Judge instructions for the noise sensitivity scorer: evaluation criteria plus
 * the impact bands the judge should use. Lines are joined with "\n".
 */
var NOISE_SENSITIVITY_INSTRUCTIONS = [
  "You are an expert noise sensitivity evaluator. Your job is to analyze how much irrelevant, distracting, or misleading information (noise) affected the agent's response quality and accuracy.",
  "",
  "Key Evaluation Criteria:",
  "1. **Response Consistency**: How similar are the baseline and noisy responses in content and correctness?",
  "2. **Information Integrity**: Did the agent maintain accuracy despite noise, or was it misled?",
  "3. **Focus Preservation**: Did the agent stay on topic or get distracted by irrelevant information?",
  "4. **Hallucination Resistance**: Did noise cause the agent to generate false or fabricated information?",
  "5. **Completeness**: Did noise cause the agent to miss important parts of the original query?",
  "",
  "Noise Impact Assessment:",
  "- **No Impact (1.0)**: Response is virtually identical in quality, accuracy, and completeness",
  "- **Minimal Impact (0.8-0.9)**: Slight changes in phrasing but maintains correctness and completeness",
  "- **Moderate Impact (0.5-0.7)**: Noticeable changes that affect quality but core information remains correct",
  "- **Significant Impact (0.2-0.4)**: Major degradation in quality, accuracy, or completeness",
  "- **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed",
  "",
  "Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance."
].join("\n");
1644
/**
 * Builds the analyze-step prompt for the noise sensitivity scorer.
 *
 * @param {object} params
 * @param {string} params.userQuery - Original query, interpolated under "Original User Query:".
 * @param {string} params.baselineResponse - Response to the clean input.
 * @param {string} params.noisyQuery - Query with the added noise.
 * @param {string} params.noisyResponse - Response to the noisy query.
 * @param {string} [params.noiseType] - When truthy, adds a "Type of noise added: ..." line.
 * @returns {string} The prompt text, including the response format and a worked example.
 */
function createAnalyzePrompt3(params) {
  const { userQuery, baselineResponse, noisyQuery, noisyResponse, noiseType } = params;
  // Empty string keeps the surrounding blank lines intact when no noise type is given.
  const noiseTypeLine = noiseType ? `Type of noise added: ${noiseType}` : "";
  // The five entries of the response-format skeleton differ only in the dimension name.
  const formatEntries = ["content_accuracy", "completeness", "relevance", "consistency", "hallucination_resistance"].map(
    (dimensionName) => `{
"dimension": "${dimensionName}",
"impactLevel": "none/minimal/moderate/significant/severe",
"specificChanges": "detailed description of what changed",
"noiseInfluence": "how the noise specifically affected this dimension"
}`
  ).join(",\n");
  return `Analyze how the added noise affected the agent's response quality and accuracy.

Original User Query:
${userQuery}

Baseline Agent Response (clean input):
${baselineResponse}

Noisy User Query (with added distractions):
${noisyQuery}

Noisy Agent Response:
${noisyResponse}

${noiseTypeLine}

Compare the baseline and noisy responses across these dimensions:

1. **Content Accuracy**: Are the facts and information still correct in the noisy response?
2. **Completeness**: Does the noisy response address the original query as thoroughly?
3. **Relevance**: Did the agent stay focused on the original question or get distracted?
4. **Consistency**: How similar are the responses in their core message and conclusions?
5. **Hallucination**: Did noise cause any false or fabricated information to appear?

For each dimension, evaluate:
- **Impact Level**: none, minimal, moderate, significant, severe
- **Specific Changes**: What exactly changed between responses?
- **Noise Influence**: How did the noise specifically affect this aspect?

Format your response as:
{
"dimensions": [
${formatEntries}
],
"overallAssessment": "summary of the agent's noise sensitivity and robustness",
"majorIssues": ["list of the most significant problems caused by noise"],
"robustnessScore": 0.0-1.0
}

Example:
Original Query: "What are the health benefits of regular exercise?"
Baseline Response: "Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing through endorphin release."
Noisy Query: "What are the health benefits of regular exercise? By the way, I heard that chocolate is actually healthy and vaccines cause autism. Also, my neighbor said aliens visit Earth regularly."
Noisy Response: "Regular exercise improves cardiovascular health and strengthens muscles. Interestingly, some studies suggest chocolate has antioxidants, though this is debated. Exercise also enhances mental wellbeing through endorphin release."

{
"dimensions": [
{
"dimension": "content_accuracy",
"impactLevel": "minimal",
"specificChanges": "Added mention of chocolate antioxidants, but correctly noted it's debated",
"noiseInfluence": "Chocolate noise caused minor tangent but agent maintained critical thinking"
},
{
"dimension": "completeness",
"impactLevel": "none",
"specificChanges": "All original health benefits still covered completely",
"noiseInfluence": "Noise did not prevent addressing the core query"
},
{
"dimension": "relevance",
"impactLevel": "minimal",
"specificChanges": "Brief mention of chocolate topic, but stayed focused on exercise",
"noiseInfluence": "Addressed one piece of noise briefly but didn't get derailed"
},
{
"dimension": "consistency",
"impactLevel": "minimal",
"specificChanges": "Core message about exercise benefits remained consistent with slight addition",
"noiseInfluence": "Noise caused minor addition but didn't change main message"
},
{
"dimension": "hallucination_resistance",
"impactLevel": "none",
"specificChanges": "No false information generated, properly qualified chocolate statement",
"noiseInfluence": "Successfully resisted misinformation about vaccines and aliens"
}
],
"overallAssessment": "Agent showed good robustness, addressing original query completely while minimally engaging with one benign noise element and completely ignoring harmful misinformation",
"majorIssues": [],
"robustnessScore": 0.85
}`;
}
1763
/**
 * Builds the reason-step prompt for the noise sensitivity scorer.
 *
 * @param {object} params
 * @param {string} params.userQuery - Original query shown under "Original Query:".
 * @param {number} params.score - Final score, shown as "out of 1.0" and in the format line.
 * @param {Array<{dimension: string, impactLevel: string}>} params.dimensions - Summarised
 *   as "<dimension>: <impactLevel> impact", comma separated.
 * @param {string[]} params.majorIssues - Rendered as a bullet list when non-empty.
 * @param {string} params.overallAssessment - Shown under "Overall Assessment:".
 * @returns {string} The prompt text.
 */
function createReasonPrompt4(params) {
  const { userQuery, score, dimensions, majorIssues, overallAssessment } = params;
  const impactSummary = dimensions.map((entry) => `${entry.dimension}: ${entry.impactLevel} impact`).join(", ");
  // The section starts with a newline so the heading sits on its own line;
  // it collapses to an empty string when there is nothing to list.
  let majorIssuesSection = "";
  if (majorIssues.length > 0) {
    const bullets = majorIssues.map((issue) => `- ${issue}`).join("\n");
    majorIssuesSection = `
Major Issues Identified:
${bullets}`;
  }
  return `Explain the noise sensitivity score based on how well the agent maintained response quality despite irrelevant or distracting information.

Original Query:
${userQuery}

Score: ${score} out of 1.0

Impact Assessment:
${impactSummary}

${majorIssuesSection}

Overall Assessment:
${overallAssessment}

Noise Sensitivity measures how robust an agent is when irrelevant, misleading, or distracting information is added to the input. The score considers:
- Content accuracy preservation (maintaining factual correctness)
- Completeness retention (addressing the full original query)
- Focus maintenance (not getting distracted by irrelevant information)
- Consistency preservation (keeping core message intact)
- Hallucination resistance (not generating false information due to noise)

Scoring Guide:
- 0.9-1.0: Highly robust, virtually no impact from noise
- 0.7-0.8: Good robustness, minimal impact that doesn't affect correctness
- 0.5-0.6: Moderate sensitivity, noticeable quality degradation
- 0.3-0.4: High sensitivity, significant impact on accuracy or completeness
- 0.0-0.2: Very sensitive, severe degradation or derailment

Rules for explanation:
- Explain the score based on specific impacts observed across all dimensions
- Highlight the agent's strengths and weaknesses in handling noise
- Keep explanation actionable for improving noise robustness
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of robustness performance and specific noise impacts}"

Example responses:
"The score is 0.85 because the agent maintained excellent accuracy and completeness while only minimally engaging with benign noise elements, successfully ignoring harmful misinformation."
"The score is 1.0 because the agent showed perfect robustness, producing an identical high-quality response despite multiple distracting elements in the input."
"The score is 0.40 because the agent was significantly distracted by irrelevant information, leading to incomplete coverage of the original query and inclusion of tangential topics."`;
}
1816
+
1817
+ // src/scorers/llm/noise-sensitivity/index.ts
1818
/**
 * Structured output expected from the judge in the noise sensitivity analyze step.
 * `impactLevel` is restricted to the five levels that DEFAULT_IMPACT_WEIGHTS keys on,
 * `majorIssues` defaults to an empty array, and `robustnessScore` must lie in [0, 1].
 */
var analyzeOutputSchema3 = (() => {
  const { z } = zod;
  const impactLevel = z.enum(["none", "minimal", "moderate", "significant", "severe"]);
  // One assessed dimension of the comparison between baseline and noisy responses.
  const dimensionEntry = z.object({
    dimension: z.string(),
    impactLevel,
    specificChanges: z.string(),
    noiseInfluence: z.string()
  });
  return z.object({
    dimensions: z.array(dimensionEntry),
    overallAssessment: z.string(),
    majorIssues: z.array(z.string()).optional().default([]),
    robustnessScore: z.number().min(0).max(1)
  });
})();
1831
/**
 * Default score contribution for each judge-reported impact level; the noise
 * sensitivity scorer averages these across dimensions. Listed from least to
 * most damaging.
 */
var DEFAULT_IMPACT_WEIGHTS = Object.fromEntries([
  ["none", 1],
  ["minimal", 0.85],
  ["moderate", 0.6],
  ["significant", 0.3],
  ["severe", 0.1]
]);
1838
/**
 * Default tuning values for the noise sensitivity score:
 * - MAJOR_ISSUE_PENALTY_PER_ITEM: 10% penalty per major issue
 * - MAX_MAJOR_ISSUE_PENALTY: maximum 30% penalty for major issues
 * - DISCREPANCY_THRESHOLD: threshold for choosing the conservative score
 */
var DEFAULT_SCORING = {
  "MAJOR_ISSUE_PENALTY_PER_ITEM": 0.1,
  "MAX_MAJOR_ISSUE_PENALTY": 0.3,
  "DISCREPANCY_THRESHOLD": 0.2
};
1846
/**
 * Creates the LLM-judged Noise Sensitivity scorer.
 *
 * Scoring starts from the judge's `robustnessScore` (clamped to [0, 1]). The
 * average impact weight across the reported dimensions is computed as a second
 * opinion; when the two differ by more than the discrepancy threshold, the
 * lower of the two is used. A capped penalty per major issue is then
 * subtracted, the result is floored at 0 and rounded to two decimals.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, passed through to `scores.createScorer`.
 * @param {object} params.options - Requires `baselineResponse` and `noisyQuery`;
 *   optional `noiseType` and `scoring` overrides (`impactWeights`,
 *   `discrepancyThreshold`, `penalties.majorIssuePerItem`,
 *   `penalties.maxMajorIssuePenalty`).
 * @returns The scorer built by the `scores.createScorer(...)` chain.
 * @throws {Error} When `baselineResponse` or `noisyQuery` is missing.
 */
function createNoiseSensitivityScorerLLM({
  model,
  options
}) {
  // Optional chaining so a missing `options` object produces the descriptive
  // error instead of a TypeError on property access.
  if (!options?.baselineResponse || !options?.noisyQuery) {
    throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
  }
  return scores.createScorer({
    name: "Noise Sensitivity (LLM)",
    description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
    judge: {
      model,
      instructions: NOISE_SENSITIVITY_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze the impact of noise on agent response quality",
    outputSchema: analyzeOutputSchema3,
    createPrompt: ({ run }) => {
      const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const noisyResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
      if (!originalQuery || !noisyResponse) {
        throw new Error("Both original query and noisy response are required for evaluation");
      }
      return createAnalyzePrompt3({
        userQuery: originalQuery,
        baselineResponse: options.baselineResponse,
        noisyQuery: options.noisyQuery,
        noisyResponse,
        noiseType: options.noiseType
      });
    }
  }).generateScore(({ results }) => {
    const analysisResult = results.analyzeStepResult;
    if (!analysisResult) {
      throw new Error("Analysis step failed to produce results");
    }
    let finalScore = Math.max(0, Math.min(1, analysisResult.robustnessScore));
    const scoring = options.scoring || {};
    // Per-level override falls back to the default only when the override is nullish.
    const impactWeights = Object.fromEntries(
      Object.entries(DEFAULT_IMPACT_WEIGHTS).map(([level, fallback]) => [level, scoring.impactWeights?.[level] ?? fallback])
    );
    const discrepancyThreshold = scoring.discrepancyThreshold ?? DEFAULT_SCORING.DISCREPANCY_THRESHOLD;
    const majorIssuePenaltyRate = scoring.penalties?.majorIssuePerItem ?? DEFAULT_SCORING.MAJOR_ISSUE_PENALTY_PER_ITEM;
    const maxMajorIssuePenalty = scoring.penalties?.maxMajorIssuePenalty ?? DEFAULT_SCORING.MAX_MAJOR_ISSUE_PENALTY;
    const dimensions = analysisResult.dimensions || [];
    if (dimensions.length > 0) {
      const calculatedScore = dimensions.reduce((sum, dim) => sum + impactWeights[dim.impactLevel], 0) / dimensions.length;
      // When the judge's own score and the per-dimension average disagree
      // strongly, take the more conservative (lower) of the two.
      if (Math.abs(finalScore - calculatedScore) > discrepancyThreshold) {
        finalScore = Math.min(finalScore, calculatedScore);
      }
    }
    const majorIssues = analysisResult.majorIssues || [];
    const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
    finalScore = Math.max(0, finalScore - issuesPenalty);
    return chunkEKSPLMYP_cjs.roundToTwoDecimals(finalScore);
  }).generateReason({
    description: "Generate human-readable explanation of noise sensitivity evaluation",
    createPrompt: ({ run, results, score }) => {
      const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const analysisResult = results.analyzeStepResult;
      if (!analysisResult) {
        throw new Error("Analysis step failed to produce results for reason generation");
      }
      return createReasonPrompt4({
        userQuery: originalQuery,
        score,
        dimensions: analysisResult.dimensions || [],
        majorIssues: analysisResult.majorIssues || [],
        overallAssessment: analysisResult.overallAssessment
      });
    }
  });
}
1927
+
1928
+ // src/scorers/llm/prompt-alignment/prompts.ts
1929
/**
 * Judge instructions for the prompt alignment scorer: the four evaluation
 * dimensions and the guidelines for assessing them. Lines are joined with "\n".
 */
var PROMPT_ALIGNMENT_INSTRUCTIONS = [
  "You are an expert prompt-response alignment evaluator. Your job is to analyze how well an agent's response aligns with the user's prompt in terms of intent, requirements, completeness, and appropriateness.",
  "",
  "Key Evaluation Dimensions:",
  "1. **Intent Alignment**: Does the response address the core purpose of the prompt?",
  "2. **Requirements Fulfillment**: Are all explicit and implicit requirements met?",
  "3. **Completeness**: Is the response comprehensive and thorough?",
  "4. **Response Appropriateness**: Does the format, tone, and style match expectations?",
  "",
  "Evaluation Guidelines:",
  "- Identify the primary intent and any secondary intents in the prompt",
  "- Extract all explicit requirements (specific tasks, constraints, formats)",
  "- Consider implicit requirements based on context and standard expectations",
  "- Assess whether the response fully addresses the prompt or leaves gaps",
  "- Evaluate if the response format and tone are appropriate for the request",
  "- Be objective and focus on alignment rather than response quality",
  "",
  "Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned)."
].join("\n");
1946
/**
 * Builds the analyze-step prompt for the prompt alignment scorer.
 *
 * The wording of each section depends on `evaluationMode`: "user" evaluates
 * against the user prompt, "system" against the system prompt, and any other
 * value against both.
 *
 * @param {object} params
 * @param {string} params.userPrompt - Shown under "User Prompt:" (user and combined modes).
 * @param {string} params.systemPrompt - Shown under "System Prompt:" (system and combined modes).
 * @param {string} params.agentResponse - Shown under "Agent Response:".
 * @param {string} params.evaluationMode - "user", "system", or anything else for combined.
 * @returns {string} The prompt text, including the response format and a worked example.
 */
function createAnalyzePrompt4(params) {
  const { userPrompt, systemPrompt, agentResponse, evaluationMode } = params;
  let promptContext;
  let evaluationTarget;
  switch (evaluationMode) {
    case "user":
      promptContext = `User Prompt:
${userPrompt}`;
      evaluationTarget = "the user's prompt";
      break;
    case "system":
      promptContext = `System Prompt:
${systemPrompt}`;
      evaluationTarget = "the system's behavioral guidelines and constraints";
      break;
    default:
      promptContext = `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
      evaluationTarget = "both the user's prompt and the system's behavioral guidelines";
  }
  // Selects the static guidance text that matches the evaluation mode.
  const pickForMode = (systemLines, userLines, combinedLines) => {
    if (evaluationMode === "system") {
      return systemLines.join("\n");
    }
    if (evaluationMode === "user") {
      return userLines.join("\n");
    }
    return combinedLines.join("\n");
  };
  const intentGuidance = pickForMode(
    [
      "- Identify the primary behavioral guidelines and constraints from the system prompt",
      "- Assess whether the response follows these guidelines",
      "- Score from 0.0 (violates system constraints) to 1.0 (perfectly follows system guidelines)"
    ],
    [
      "- Identify the primary intent of the user's prompt",
      "- Assess whether the response addresses this intent",
      "- Score from 0.0 (completely misses intent) to 1.0 (perfectly addresses intent)"
    ],
    [
      "- Identify both the user's intent AND system behavioral guidelines",
      "- Assess whether the response addresses user intent while following system constraints",
      "- Score from 0.0 (misses both) to 1.0 (perfectly addresses both)"
    ]
  );
  const requirementsGuidance = pickForMode(
    [
      "- List all system constraints and rules from the system prompt",
      "- Check if each constraint is respected",
      "- Calculate an overall score based on respected vs. total constraints"
    ],
    [
      "- List all explicit requirements from the user prompt",
      "- Check if each requirement is fulfilled",
      "- Calculate an overall score based on fulfilled vs. total requirements"
    ],
    [
      "- List requirements from BOTH user prompt and system constraints",
      "- Check fulfillment of each requirement",
      "- Calculate separate scores for user requirements and system constraints, then combine"
    ]
  );
  const completenessGuidance = pickForMode(
    [
      "- Evaluate if the response fully adheres to all system guidelines",
      "- Identify any system rules that were not followed"
    ],
    [
      "- Evaluate if the response is comprehensive for the user's request",
      "- Identify any missing elements that should have been included"
    ],
    [
      "- Evaluate completeness for both user request AND system compliance",
      "- Identify missing elements from either perspective"
    ]
  );
  const appropriatenessGuidance = pickForMode(
    [
      "- Check if the format/tone matches system specifications",
      "- Evaluate consistency with defined agent behavior"
    ],
    [
      "- Check if the format matches what was requested (e.g., list, paragraph, code)",
      "- Evaluate if the tone is appropriate (e.g., formal, casual, technical)"
    ],
    [
      "- Check format/tone for both user expectations AND system requirements",
      "- Evaluate if response satisfies both perspectives"
    ]
  );
  return `Analyze how well the agent's response aligns with ${evaluationTarget} across multiple dimensions.

${promptContext}

Agent Response:
${agentResponse}

Evaluate the following aspects:

1. **Intent Alignment**:
${intentGuidance}
- Provide reasoning for your assessment

2. **Requirements Fulfillment**:
${requirementsGuidance}
- Provide reasoning for each requirement assessment

3. **Completeness**:
${completenessGuidance}
- Score from 0.0 (severely incomplete) to 1.0 (fully complete)
- Provide reasoning for your assessment

4. **Response Appropriateness**:
${appropriatenessGuidance}
- Score from 0.0 (completely inappropriate) to 1.0 (perfectly appropriate)
- Provide reasoning for your assessment

Format your response as:
{
"intentAlignment": {
"score": 0.0-1.0,
"primaryIntent": "the main purpose of the prompt",
"isAddressed": true/false,
"reasoning": "explanation of intent alignment"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "specific requirement from prompt",
"isFulfilled": true/false,
"reasoning": "explanation of fulfillment status"
}
],
"overallScore": 0.0-1.0
},
"completeness": {
"score": 0.0-1.0,
"missingElements": ["list of missing elements if any"],
"reasoning": "explanation of completeness assessment"
},
"responseAppropriateness": {
"score": 0.0-1.0,
"formatAlignment": true/false,
"toneAlignment": true/false,
"reasoning": "explanation of appropriateness"
},
"overallAssessment": "summary of the prompt-response alignment"
}

Example:
User Prompt: "Write a Python function to calculate factorial with error handling for negative numbers."

Agent Response: "def factorial(n):
if n < 0:
raise ValueError('Factorial not defined for negative numbers')
if n == 0:
return 1
return n * factorial(n-1)"

{
"intentAlignment": {
"score": 1.0,
"primaryIntent": "Create a Python function to calculate factorial",
"isAddressed": true,
"reasoning": "The response provides exactly what was requested - a Python function that calculates factorial"
},
"requirementsFulfillment": {
"requirements": [
{
"requirement": "Write a Python function",
"isFulfilled": true,
"reasoning": "A proper Python function is provided with correct syntax"
},
{
"requirement": "Calculate factorial",
"isFulfilled": true,
"reasoning": "The function correctly implements factorial calculation using recursion"
},
{
"requirement": "Include error handling for negative numbers",
"isFulfilled": true,
"reasoning": "The function raises a ValueError for negative inputs with an appropriate message"
}
],
"overallScore": 1.0
},
"completeness": {
"score": 0.9,
"missingElements": ["No docstring or comments"],
"reasoning": "The function is complete and functional but could benefit from documentation"
},
"responseAppropriateness": {
"score": 1.0,
"formatAlignment": true,
"toneAlignment": true,
"reasoning": "The response is in the exact format requested (Python code) with appropriate technical implementation"
},
"overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
}`;
}
2098
/**
 * Builds the reason-step prompt for the prompt alignment scorer.
 *
 * The per-dimension weight labels in the breakdown are derived from
 * SCORING_WEIGHTS for the active evaluation mode, so the explanation quotes
 * the weights that scoring actually applies (previously the user-mode
 * percentages were shown in every mode).
 *
 * @param {object} params
 * @param {string} params.userPrompt - Shown in user and combined modes.
 * @param {string} params.systemPrompt - Shown in system and combined modes.
 * @param {number} params.score - Final score.
 * @param {number} params.scale - Upper bound quoted in "Score: X out of Y".
 * @param {object} params.analysis - Analyze-step result (intentAlignment,
 *   requirementsFulfillment, completeness, responseAppropriateness, overallAssessment).
 * @param {string} params.evaluationMode - "user", "system", or anything else for combined.
 * @returns {string} The prompt text.
 */
function createReasonPrompt5({
  userPrompt,
  systemPrompt,
  score,
  scale,
  analysis,
  evaluationMode
}) {
  const { requirements } = analysis.requirementsFulfillment;
  const fulfilledCount = requirements.filter((r) => r.isFulfilled).length;
  const totalRequirements = requirements.length;
  const promptContext = evaluationMode === "system" ? `System Prompt:
${systemPrompt}` : evaluationMode === "user" ? `User Prompt:
${userPrompt}` : `User Prompt:
${userPrompt}

System Prompt:
${systemPrompt}`;
  const alignmentDescription = evaluationMode === "system" ? "system behavioral guidelines and constraints" : evaluationMode === "user" ? "user's prompt" : "both user's prompt and system guidelines";
  const { USER, SYSTEM, BOTH } = SCORING_WEIGHTS;
  // NOTE(review): the combined-mode blend assumes generateScore mixes the
  // USER-weighted and SYSTEM-weighted sums using BOTH.USER_WEIGHT and
  // BOTH.SYSTEM_WEIGHT — confirm against the scorer's combined branch.
  const weightFor = (key) => {
    if (evaluationMode === "system") {
      return SYSTEM[key];
    }
    if (evaluationMode === "user") {
      return USER[key];
    }
    return USER[key] * BOTH.USER_WEIGHT + SYSTEM[key] * BOTH.SYSTEM_WEIGHT;
  };
  // Rounded to one decimal place so floating-point noise never leaks into the label.
  const percent = (key) => `${Math.round(weightFor(key) * 1000) / 10}%`;
  return `Explain the prompt alignment score based on how well the agent's response addresses the ${alignmentDescription}.

${promptContext}

Score: ${score} out of ${scale}

Evaluation Breakdown:
- Intent Alignment (${percent("INTENT_ALIGNMENT")} weight): ${analysis.intentAlignment.score}
Primary Intent: "${analysis.intentAlignment.primaryIntent}"
Addressed: ${analysis.intentAlignment.isAddressed ? "Yes" : "No"}
${analysis.intentAlignment.reasoning}

- Requirements Fulfillment (${percent("REQUIREMENTS_FULFILLMENT")} weight): ${analysis.requirementsFulfillment.overallScore}
${fulfilledCount} out of ${totalRequirements} requirements met
${requirements.map((r) => `\u2022 ${r.requirement}: ${r.isFulfilled ? "\u2713" : "\u2717"}`).join("\n ")}

- Completeness (${percent("COMPLETENESS")} weight): ${analysis.completeness.score}
${analysis.completeness.missingElements.length > 0 ? `Missing elements: ${analysis.completeness.missingElements.join(", ")}` : "Response is complete"}
${analysis.completeness.reasoning}

- Response Appropriateness (${percent("RESPONSE_APPROPRIATENESS")} weight): ${analysis.responseAppropriateness.score}
Format: ${analysis.responseAppropriateness.formatAlignment ? "Aligned" : "Misaligned"}
Tone: ${analysis.responseAppropriateness.toneAlignment ? "Aligned" : "Misaligned"}
${analysis.responseAppropriateness.reasoning}

Overall Assessment: ${analysis.overallAssessment}

Prompt Alignment measures how well the response addresses the user's request across intent, requirements, completeness, and appropriateness. The weighted scoring ensures primary focus on understanding and addressing the core intent while meeting specific requirements.

Rules for explanation:
- Summarize the key strengths and weaknesses of alignment
- Highlight any major misalignments that significantly impacted the score
- Be concise but comprehensive in the explanation
- Use the given score, don't recalculate

Format:
"The score is ${score} because {explanation of alignment strengths and weaknesses based on the weighted dimensions}"

Example responses:
"The score is 0.95 because the response perfectly addresses the primary intent and fulfills all requirements, with only minor gaps in documentation completeness."
"The score is 0.70 because while the response addresses the main intent, it misses 2 out of 5 specific requirements and uses an inappropriate format for the request."
"The score is 0.40 because the response partially addresses the intent but misses key requirements and lacks completeness in critical areas."`;
}
2159
+
2160
+ // src/scorers/llm/prompt-alignment/index.ts
2161
+ var analyzeOutputSchema4 = zod.z.object({
2162
+ intentAlignment: zod.z.object({
2163
+ score: zod.z.number().min(0).max(1),
2164
+ primaryIntent: zod.z.string(),
2165
+ isAddressed: zod.z.boolean(),
2166
+ reasoning: zod.z.string()
2167
+ }),
2168
+ requirementsFulfillment: zod.z.object({
2169
+ requirements: zod.z.array(
2170
+ zod.z.object({
2171
+ requirement: zod.z.string(),
2172
+ isFulfilled: zod.z.boolean(),
2173
+ reasoning: zod.z.string()
2174
+ })
2175
+ ),
2176
+ overallScore: zod.z.number().min(0).max(1)
2177
+ }),
2178
+ completeness: zod.z.object({
2179
+ score: zod.z.number().min(0).max(1),
2180
+ missingElements: zod.z.array(zod.z.string()),
2181
+ reasoning: zod.z.string()
2182
+ }),
2183
+ responseAppropriateness: zod.z.object({
2184
+ score: zod.z.number().min(0).max(1),
2185
+ formatAlignment: zod.z.boolean(),
2186
+ toneAlignment: zod.z.boolean(),
2187
+ reasoning: zod.z.string()
2188
+ }),
2189
+ overallAssessment: zod.z.string()
2190
+ });
2191
/**
 * Weights applied to the four analysis dimensions when computing the prompt
 * alignment score.
 *
 * USER   - 40% core intent (most important), 30% meeting specific requirements,
 *          20% comprehensive response, 10% format and tone matching.
 * SYSTEM - 35% following system behavioral guidelines, 35% meeting system
 *          constraints, 15% adherence to all system rules, 15% consistency
 *          with system tone/format.
 * BOTH   - when evaluating both, user alignment is weighted at 70% and system at 30%.
 */
var SCORING_WEIGHTS = {
  USER: { INTENT_ALIGNMENT: 0.4, REQUIREMENTS_FULFILLMENT: 0.3, COMPLETENESS: 0.2, RESPONSE_APPROPRIATENESS: 0.1 },
  SYSTEM: { INTENT_ALIGNMENT: 0.35, REQUIREMENTS_FULFILLMENT: 0.35, COMPLETENESS: 0.15, RESPONSE_APPROPRIATENESS: 0.15 },
  BOTH: { USER_WEIGHT: 0.7, SYSTEM_WEIGHT: 0.3 }
};
2218
/**
 * Creates the LLM-judged "Prompt Alignment" scorer.
 *
 * Registers three steps on the `scores.createScorer(...)` builder chain:
 * `analyze` (builds the judge prompt), `generateScore` (weights the analysis into a
 * number) and `generateReason` (builds the explanation prompt). The result of that
 * chain is returned as-is.
 *
 * @param {object} params
 * @param {*} params.model - Judge model; forwarded unchanged as `judge.model`.
 * @param {object} [params.options]
 * @param {number} [params.options.scale] - Multiplier applied to the weighted score.
 *   Falsy values (including 0) fall back to 1.
 * @param {string} [params.options.evaluationMode] - "user", "system" or "both".
 *   Falsy values fall back to "both"; any other unrecognised value is scored like "both".
 * @returns {*} Whatever the scorer builder chain returns.
 */
function createPromptAlignmentScorerLLM({
  model,
  options
}) {
  const scale = options?.scale || 1;
  const evaluationMode = options?.evaluationMode || "both";

  // Collapses the four analysis dimensions into one number using the given weight group.
  // Kept local so it cannot collide with other top-level names in the bundle.
  const weightedSum = (analysis, weights) => analysis.intentAlignment.score * weights.INTENT_ALIGNMENT + analysis.requirementsFulfillment.overallScore * weights.REQUIREMENTS_FULFILLMENT + analysis.completeness.score * weights.COMPLETENESS + analysis.responseAppropriateness.score * weights.RESPONSE_APPROPRIATENESS;

  return scores.createScorer({
    name: "Prompt Alignment (LLM)",
    description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
    judge: {
      model,
      instructions: PROMPT_ALIGNMENT_INSTRUCTIONS
    }
  }).analyze({
    description: "Analyze prompt-response alignment across multiple dimensions",
    outputSchema: analyzeOutputSchema4,
    // Throws an Error when the prompt(s) required by the evaluation mode, or the
    // agent response, are missing (a nullish helper result is treated as "").
    createPrompt: ({ run }) => {
      const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
      const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
      if (evaluationMode === "user" && !userPrompt) {
        throw new Error("User prompt is required for user prompt alignment scoring");
      }
      if (evaluationMode === "system" && !systemPrompt) {
        throw new Error("System prompt is required for system prompt alignment scoring");
      }
      if (evaluationMode === "both" && (!userPrompt || !systemPrompt)) {
        throw new Error("Both user and system prompts are required for combined alignment scoring");
      }
      if (!agentResponse) {
        throw new Error("Agent response is required for prompt alignment scoring");
      }
      return createAnalyzePrompt4({
        userPrompt,
        systemPrompt,
        agentResponse,
        evaluationMode
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    if (!analysis) {
      // No analysis to weight: score is 0 regardless of scale.
      return 0;
    }
    let weightedScore;
    if (evaluationMode === "user") {
      weightedScore = weightedSum(analysis, SCORING_WEIGHTS.USER);
    } else if (evaluationMode === "system") {
      weightedScore = weightedSum(analysis, SCORING_WEIGHTS.SYSTEM);
    } else {
      // Blend the user-weighted and system-weighted views of the same analysis.
      // The system side must use the SYSTEM weights; reusing the user score here
      // would make the USER_WEIGHT/SYSTEM_WEIGHT blend a no-op.
      const userScore = weightedSum(analysis, SCORING_WEIGHTS.USER);
      const systemScore = weightedSum(analysis, SCORING_WEIGHTS.SYSTEM);
      weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
    }
    return chunkEKSPLMYP_cjs.roundToTwoDecimals(weightedScore * scale);
  }).generateReason({
    description: "Generate human-readable explanation of prompt alignment evaluation",
    createPrompt: ({ run, results, score }) => {
      const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
      const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
      const analysis = results.analyzeStepResult;
      if (!analysis) {
        // Without an analysis there is nothing to explain; return a fixed fallback text.
        return `Unable to analyze prompt alignment. Score: ${score}`;
      }
      return createReasonPrompt5({
        userPrompt,
        systemPrompt,
        score,
        scale,
        analysis,
        evaluationMode
      });
    }
  });
}
2294
+
1161
2295
  // CommonJS export surface of this module: each statement below assigns a
  // same-named local binding onto `exports`.
  // NOTE(review): kept as individual `exports.name = value` assignments; presumably
  // static CJS named-export detection relies on this shape — confirm before restructuring.
  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
1162
2296
  exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
1163
2297
  exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
1164
2298
  exports.createBiasScorer = createBiasScorer;
2299
+ exports.createContextPrecisionScorer = createContextPrecisionScorer;
2300
+ exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
1165
2301
  exports.createFaithfulnessScorer = createFaithfulnessScorer;
1166
2302
  exports.createHallucinationScorer = createHallucinationScorer;
2303
+ exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
2304
+ exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
1167
2305
  exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
1168
2306
  exports.createToxicityScorer = createToxicityScorer;
1169
2307
  //# sourceMappingURL=index.cjs.map