@mastra/evals 0.13.5-alpha.0 → 0.13.6-alpha.0

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,7 +1,7 @@
  'use strict';

  var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
- var chunkEKSPLMYP_cjs = require('../../chunk-EKSPLMYP.cjs');
+ var chunkQKR2PMLZ_cjs = require('../../chunk-QKR2PMLZ.cjs');
  var scores = require('@mastra/core/scores');
  var zod = require('zod');

@@ -227,14 +227,14 @@ function createAnswerRelevancyScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: extractOutputSchema,
  createPrompt: ({ run }) => {
- const assistantMessage = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const assistantMessage = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  return createExtractPrompt(assistantMessage);
  }
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
- const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
  }
  }).generateScore(({ results }) => {
@@ -256,8 +256,8 @@ function createAnswerRelevancyScorer({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  return createReasonPrompt({
- input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  score,
  results: results.analyzeStepResult.results,
  scale: options.scale
@@ -266,6 +266,270 @@ function createAnswerRelevancyScorer({
  });
  }

+ // src/scorers/llm/answer-similarity/prompts.ts
+ var createExtractPrompt2 = ({ output, groundTruth }) => `
+ Extract and normalize the semantic units (facts, claims, concepts) from both the agent output and the ground truth answer.
+
+ Break down each text into its core semantic components while preserving meaning and relationships.
+ Focus on extracting:
+ - Key facts and claims
+ - Important concepts and entities
+ - Relationships between concepts
+ - Quantitative information
+ - Qualitative descriptions
+
+ Guidelines:
+ - Preserve the semantic meaning, not just keywords
+ - Group related information together
+ - Normalize different phrasings of the same concept
+ - Keep numerical values and units together
+ - Don't over-split compound concepts that belong together
+
+ Return ONLY valid JSON with two arrays of semantic units. Do not include any text before or after the JSON.
+
+ Agent Output:
+ ${output}
+
+ Ground Truth:
+ ${groundTruth}
+
+ Required JSON format (return valid JSON only):
+ {
+ "outputUnits": [],
+ "groundTruthUnits": []
+ }
+
+ Important: Return valid JSON only, no additional text or explanations.
+ `;
+ var createAnalyzePrompt = ({
+ outputUnits,
+ groundTruthUnits
+ }) => `
+ Compare the semantic units from the agent output against the ground truth to evaluate answer similarity.
+
+ Analyze each ground truth unit and determine:
+ 1. Whether it has a matching unit in the output (exact or semantic match)
+ 2. The quality of the match (exact, semantic, partial, missing)
+ 3. Whether there are contradictions
+
+ Also identify:
+ - Extra information in the output not present in ground truth
+ - Any contradictory statements between output and ground truth
+
+ Matching Guidelines:
+ - "exact": The same information expressed identically or with minor wording differences
+ - "semantic": The same concept or fact expressed differently but with equivalent meaning
+ - "partial": Some overlap but missing important details or context
+ - "missing": No corresponding information found in the output
+ - "contradiction": Information that directly conflicts with the ground truth (wrong facts, incorrect names, false claims)
+
+ CRITICAL: If the output contains factually incorrect information (wrong names, wrong facts, opposite claims), you MUST identify contradictions and mark relevant matches as "missing" while adding entries to the contradictions array.
+
+ Return ONLY valid JSON with detailed analysis. Do not include any text before or after the JSON.
+
+ Output Units:
+ ${JSON.stringify(outputUnits, null, 2)}
+
+ Ground Truth Units:
+ ${JSON.stringify(groundTruthUnits, null, 2)}
+
+ Required JSON format (copy this structure exactly):
+ {
+ "matches": [
+ {
+ "groundTruthUnit": "unit from ground truth",
+ "outputUnit": "corresponding unit from output or null if missing",
+ "matchType": "exact",
+ "explanation": "brief explanation of the match quality"
+ }
+ ],
+ "extraInOutput": [],
+ "contradictions": []
+ }
+
+ Important:
+ - matchType must be exactly one of: "exact", "semantic", "partial", "missing"
+ - outputUnit must be a string or null (not undefined)
+ - All arrays must be present even if empty
+ - Return valid JSON only, no additional text
+ `;
+ var createReasonPrompt2 = ({
+ output,
+ groundTruth,
+ score,
+ analysis,
+ scale
+ }) => `
+ Generate a clear, actionable explanation of the answer similarity score.
+
+ Context:
+ - Agent Output: ${output}
+ - Ground Truth: ${groundTruth}
+ - Score: ${score}/${scale}
+ - Analysis: ${JSON.stringify(analysis, null, 2)}
+
+ Provide a concise explanation that:
+ 1. States the overall similarity level (high/moderate/low)
+ 2. Highlights what the agent got right
+ 3. Identifies key missing or incorrect information
+ 4. Suggests specific improvements if score is not perfect
+
+ Keep the explanation under 3 sentences and focus on actionable insights.
+
+ Format: "The score is {score}/{scale} because {explanation}. {what matched well}. {what needs improvement or is perfect}."
+
+ Example good responses:
+ - "The score is 0.9/1 because the answer captures all key concepts with minor phrasing differences. The agent correctly identified the main facts and relationships. Only missing a minor detail about the specific date mentioned in the ground truth."
+ - "The score is 0.5/1 because the answer is partially correct but missing crucial information. The agent correctly explained the basic concept. However, it missed the quantitative data and specific examples that were essential to the complete answer."
+ - "The score is 1.0/1 because the answer perfectly matches the ground truth semantically. All key facts, relationships, and details are accurately represented. No improvements needed."
+ `;
+
+ // src/scorers/llm/answer-similarity/index.ts
+ var ANSWER_SIMILARITY_DEFAULT_OPTIONS = {
+ requireGroundTruth: true,
+ semanticThreshold: 0.8,
+ exactMatchBonus: 0.2,
+ missingPenalty: 0.15,
+ contradictionPenalty: 1,
+ extraInfoPenalty: 0.05,
+ scale: 1
+ };
+ var ANSWER_SIMILARITY_INSTRUCTIONS = `
+ You are a precise answer similarity evaluator for CI/CD testing. Your role is to compare agent outputs against ground truth answers to ensure consistency and accuracy in automated testing.
+
+ Key Principles:
+ 1. Focus on semantic equivalence, not just string matching
+ 2. Recognize that different phrasings can convey the same information
+ 3. Identify missing critical information from the ground truth
+ 4. Detect contradictions between output and ground truth
+ 5. Provide actionable feedback for improving answer accuracy
+ 6. Be strict but fair - partial credit for partial matches
+ `;
+ var extractOutputSchema2 = zod.z.object({
+ outputUnits: zod.z.array(zod.z.string()),
+ groundTruthUnits: zod.z.array(zod.z.string())
+ });
+ var analyzeOutputSchema = zod.z.object({
+ matches: zod.z.array(
+ zod.z.object({
+ groundTruthUnit: zod.z.string(),
+ outputUnit: zod.z.string().nullable(),
+ matchType: zod.z.enum(["exact", "semantic", "partial", "missing"]),
+ explanation: zod.z.string()
+ })
+ ),
+ extraInOutput: zod.z.array(zod.z.string()),
+ contradictions: zod.z.array(
+ zod.z.object({
+ outputUnit: zod.z.string(),
+ groundTruthUnit: zod.z.string(),
+ explanation: zod.z.string()
+ })
+ )
+ });
+ function createAnswerSimilarityScorer({
+ model,
+ options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
+ }) {
+ const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
+ return scores.createScorer({
+ name: "Answer Similarity Scorer",
+ description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
+ judge: {
+ model,
+ instructions: ANSWER_SIMILARITY_INSTRUCTIONS
+ }
+ }).preprocess({
+ description: "Extract semantic units from output and ground truth",
+ outputSchema: extractOutputSchema2,
+ createPrompt: ({ run }) => {
+ if (!run.groundTruth) {
+ if (mergedOptions.requireGroundTruth) {
+ throw new Error("Answer Similarity Scorer requires ground truth to be provided");
+ }
+ return createExtractPrompt2({
+ output: "",
+ groundTruth: ""
+ });
+ }
+ const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+ return createExtractPrompt2({
+ output,
+ groundTruth
+ });
+ }
+ }).analyze({
+ description: "Compare semantic units between output and ground truth",
+ outputSchema: analyzeOutputSchema,
+ createPrompt: ({ results }) => {
+ const outputUnits = results.preprocessStepResult?.outputUnits || [];
+ const groundTruthUnits = results.preprocessStepResult?.groundTruthUnits || [];
+ return createAnalyzePrompt({
+ outputUnits,
+ groundTruthUnits
+ });
+ }
+ }).generateScore(({ run, results }) => {
+ if (!run.groundTruth) {
+ return 0;
+ }
+ const analysis = results.analyzeStepResult;
+ if (!analysis) {
+ return 0;
+ }
+ let score = 0;
+ const totalUnits = analysis.matches.length;
+ if (totalUnits === 0) {
+ return 0;
+ }
+ for (const match of analysis.matches) {
+ switch (match.matchType) {
+ case "exact":
+ score += 1 + mergedOptions.exactMatchBonus;
+ break;
+ case "semantic":
+ score += mergedOptions.semanticThreshold;
+ break;
+ case "partial":
+ score += mergedOptions.semanticThreshold * 0.5;
+ break;
+ case "missing":
+ score -= mergedOptions.missingPenalty;
+ break;
+ }
+ }
+ const maxPossibleScore = totalUnits * (1 + mergedOptions.exactMatchBonus);
+ score = score / maxPossibleScore;
+ const contradictionPenalty = analysis.contradictions.length * mergedOptions.contradictionPenalty;
+ score -= contradictionPenalty;
+ const extraInfoPenalty = Math.min(
+ analysis.extraInOutput.length * mergedOptions.extraInfoPenalty,
+ 0.2
+ // Cap extra info penalty at 0.2
+ );
+ score -= extraInfoPenalty;
+ score = Math.max(0, Math.min(1, score));
+ return chunk7QAUEU4L_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+ }).generateReason({
+ description: "Generate explanation of similarity score",
+ createPrompt: ({ run, results, score }) => {
+ if (!run.groundTruth) {
+ return "No ground truth was provided for comparison. Score is 0 by default.";
+ }
+ const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+ return createReasonPrompt2({
+ output,
+ groundTruth,
+ score,
+ analysis: results.analyzeStepResult,
+ scale: mergedOptions.scale
+ });
+ }
+ });
+ }
+
  // src/scorers/llm/faithfulness/prompts.ts
  var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.

@@ -435,7 +699,7 @@ function createFaithfulnessScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: zod.z.array(zod.z.string()),
  createPrompt: ({ run }) => {
- const prompt = createFaithfulnessExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -456,13 +720,13 @@ function createFaithfulnessScorer({
  return 0;
  }
  const score = supportedClaims / totalClaims * (options?.scale || 1);
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createFaithfulnessReasonPrompt({
- input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
  score,
  scale: options?.scale || 1,
@@ -593,13 +857,13 @@ function createBiasScorer({ model, options }) {
  outputSchema: zod.z.object({
  opinions: zod.z.array(zod.z.string())
  }),
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
  const prompt = createBiasAnalyzePrompt({
- output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  opinions: results.preprocessStepResult?.opinions || []
  });
  return prompt;
@@ -610,7 +874,7 @@ function createBiasScorer({ model, options }) {
  }
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ score, results }) => {
@@ -827,7 +1091,7 @@ function createHallucinationScorer({
  claims: zod.z.array(zod.z.string())
  }),
  createPrompt: ({ run }) => {
- const prompt = createHallucinationExtractPrompt({ output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createHallucinationExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -849,13 +1113,13 @@ function createHallucinationScorer({
  return 0;
  }
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createHallucinationReasonPrompt({
- input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: options?.context || [],
  score,
  scale: options?.scale || 1,
@@ -964,8 +1228,8 @@ function createToxicityScorer({ model, options }) {
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run }) => {
  const prompt = createToxicityAnalyzePrompt({
- input: chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
+ input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
  });
  return prompt;
  }
@@ -981,7 +1245,7 @@ function createToxicityScorer({ model, options }) {
  }
  }
  const score = toxicityCount / numberOfVerdicts;
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ results, score }) => {
@@ -1018,7 +1282,7 @@ OUTPUT REQUIREMENTS:

  You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
  `;
- var createAnalyzePrompt = ({
+ var createAnalyzePrompt2 = ({
  userInput,
  agentResponse,
  toolsCalled,
@@ -1069,7 +1333,7 @@ STRICT EVALUATION CRITERIA:
  Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
  `;
  };
- var createReasonPrompt2 = ({
+ var createReasonPrompt3 = ({
  userInput,
  score,
  evaluations,
@@ -1088,7 +1352,7 @@ Provide a single, concise sentence explaining why this score was given.
  };

  // src/scorers/llm/tool-call-accuracy/index.ts
- var analyzeOutputSchema = zod.z.object({
+ var analyzeOutputSchema2 = zod.z.object({
  evaluations: zod.z.array(
  zod.z.object({
  toolCalled: zod.z.string(),
@@ -1113,7 +1377,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  if (isInputInvalid || isOutputInvalid) {
  throw new Error("Input and output messages cannot be null or empty");
  }
- const { tools: actualTools, toolCallInfos } = chunkEKSPLMYP_cjs.extractToolCalls(run.output);
+ const { tools: actualTools, toolCallInfos } = chunkQKR2PMLZ_cjs.extractToolCalls(run.output);
  return {
  actualTools,
  hasToolCalls: actualTools.length > 0,
@@ -1121,12 +1385,12 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  };
  }).analyze({
  description: "Analyze the appropriateness of tool selections",
- outputSchema: analyzeOutputSchema,
+ outputSchema: analyzeOutputSchema2,
  createPrompt: ({ run, results }) => {
- const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
- return createAnalyzePrompt({
+ return createAnalyzePrompt2({
  userInput,
  agentResponse,
  toolsCalled,
@@ -1141,14 +1405,14 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  }
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
  const totalToolCalls = evaluations.length;
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
  }).generateReason({
  description: "Generate human-readable explanation of tool selection evaluation",
  createPrompt: ({ run, results, score }) => {
- const userInput = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const evaluations = results.analyzeStepResult?.evaluations || [];
  const missingTools = results.analyzeStepResult?.missingTools || [];
- return createReasonPrompt2({
+ return createReasonPrompt3({
  userInput,
  score,
  evaluations,
@@ -1175,7 +1439,7 @@ Evaluation Guidelines:
  - Consider whether missing context might have led to a better response

  Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
- function createAnalyzePrompt2({
+ function createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext
@@ -1260,7 +1524,7 @@ Context:
  "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
  }`;
  }
- function createReasonPrompt3({
+ function createReasonPrompt4({
  userQuery,
  score,
  evaluations,
@@ -1306,7 +1570,7 @@ Example responses:
  }

  // src/scorers/llm/context-relevance/index.ts
- var analyzeOutputSchema2 = zod.z.object({
+ var analyzeOutputSchema3 = zod.z.object({
  evaluations: zod.z.array(
  zod.z.object({
  context_index: zod.z.number(),
@@ -1346,19 +1610,19 @@ function createContextRelevanceScorerLLM({
  }
  }).analyze({
  description: "Analyze the relevance and utility of provided context",
- outputSchema: analyzeOutputSchema2,
+ outputSchema: analyzeOutputSchema3,
  createPrompt: ({ run }) => {
- const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
- return createAnalyzePrompt2({
+ return createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext: ["[No context was provided for evaluation]"]
  });
  }
- return createAnalyzePrompt2({
+ return createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext: context
@@ -1397,18 +1661,18 @@ function createContextRelevanceScorerLLM({
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
  const scaledScore = finalScore * (options.scale || 1);
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(scaledScore);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(scaledScore);
  }).generateReason({
  description: "Generate human-readable explanation of context relevance evaluation",
  createPrompt: ({ run, results, score }) => {
- const userQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
  }
  const evaluations = results.analyzeStepResult?.evaluations || [];
  const missingContext = results.analyzeStepResult?.missingContext || [];
- return createReasonPrompt3({
+ return createReasonPrompt4({
  userQuery,
  score,
  evaluations,
@@ -1570,8 +1834,8 @@ function createContextPrecisionScorer({
  description: "Evaluate the relevance of each context piece for generating the expected output",
  outputSchema: contextRelevanceOutputSchema,
  createPrompt: ({ run }) => {
- const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
  throw new Error("No context available for evaluation");
@@ -1604,12 +1868,12 @@ function createContextPrecisionScorer({
  }
  const map = sumPrecision / relevantCount;
  const score = map * (options.scale || 1);
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(score);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the context precision results",
  createPrompt: ({ run, results, score }) => {
- const input = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const output = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  return createContextPrecisionReasonPrompt({
  input,
@@ -1641,7 +1905,7 @@ Noise Impact Assessment:
  - **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed

  Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
- function createAnalyzePrompt3({
+ function createAnalyzePrompt4({
  userQuery,
  baselineResponse,
  noisyQuery,
@@ -1760,7 +2024,7 @@ Noisy Response: "Regular exercise improves cardiovascular health and strengthens
  "robustnessScore": 0.85
  }`;
  }
- function createReasonPrompt4({
+ function createReasonPrompt5({
  userQuery,
  score,
  dimensions,
@@ -1815,7 +2079,7 @@ Example responses:
  }

  // src/scorers/llm/noise-sensitivity/index.ts
- var analyzeOutputSchema3 = zod.z.object({
+ var analyzeOutputSchema4 = zod.z.object({
  dimensions: zod.z.array(
  zod.z.object({
  dimension: zod.z.string(),
@@ -1859,14 +2123,14 @@ function createNoiseSensitivityScorerLLM({
  }
  }).analyze({
  description: "Analyze the impact of noise on agent response quality",
- outputSchema: analyzeOutputSchema3,
+ outputSchema: analyzeOutputSchema4,
  createPrompt: ({ run }) => {
- const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const noisyResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const noisyResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  if (!originalQuery || !noisyResponse) {
  throw new Error("Both original query and noisy response are required for evaluation");
  }
- return createAnalyzePrompt3({
+ return createAnalyzePrompt4({
  userQuery: originalQuery,
  baselineResponse: options.baselineResponse,
  noisyQuery: options.noisyQuery,
@@ -1905,16 +2169,16 @@ function createNoiseSensitivityScorerLLM({
  const majorIssues = analysisResult.majorIssues || [];
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
  finalScore = Math.max(0, finalScore - issuesPenalty);
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(finalScore);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
  }).generateReason({
  description: "Generate human-readable explanation of noise sensitivity evaluation",
  createPrompt: ({ run, results, score }) => {
- const originalQuery = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const analysisResult = results.analyzeStepResult;
  if (!analysisResult) {
  throw new Error("Analysis step failed to produce results for reason generation");
  }
- return createReasonPrompt4({
+ return createReasonPrompt5({
  userQuery: originalQuery,
  score,
  dimensions: analysisResult.dimensions || [],
@@ -1943,7 +2207,7 @@ Evaluation Guidelines:
  - Be objective and focus on alignment rather than response quality

  Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
- function createAnalyzePrompt4({
+ function createAnalyzePrompt5({
  userPrompt,
  systemPrompt,
  agentResponse,
@@ -2095,7 +2359,7 @@ Agent Response: "def factorial(n):
  "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
  }`;
  }
- function createReasonPrompt5({
+ function createReasonPrompt6({
  userPrompt,
  systemPrompt,
  score,
@@ -2158,7 +2422,7 @@ Example responses:
  }

  // src/scorers/llm/prompt-alignment/index.ts
- var analyzeOutputSchema4 = zod.z.object({
+ var analyzeOutputSchema5 = zod.z.object({
  intentAlignment: zod.z.object({
  score: zod.z.number().min(0).max(1),
  primaryIntent: zod.z.string(),
@@ -2230,11 +2494,11 @@ function createPromptAlignmentScorerLLM({
  }
  }).analyze({
  description: "Analyze prompt-response alignment across multiple dimensions",
- outputSchema: analyzeOutputSchema4,
+ outputSchema: analyzeOutputSchema5,
  createPrompt: ({ run }) => {
- const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
- const agentResponse = chunkEKSPLMYP_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
+ const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  if (evaluationMode === "user" && !userPrompt) {
  throw new Error("User prompt is required for user prompt alignment scoring");
  }
@@ -2247,7 +2511,7 @@ function createPromptAlignmentScorerLLM({
  if (!agentResponse) {
  throw new Error("Agent response is required for prompt alignment scoring");
  }
- return createAnalyzePrompt4({
+ return createAnalyzePrompt5({
  userPrompt,
  systemPrompt,
  agentResponse,
@@ -2270,17 +2534,17 @@ function createPromptAlignmentScorerLLM({
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
  }
  const finalScore = weightedScore * scale;
- return chunkEKSPLMYP_cjs.roundToTwoDecimals(finalScore);
+ return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
  }).generateReason({
  description: "Generate human-readable explanation of prompt alignment evaluation",
  createPrompt: ({ run, results, score }) => {
- const userPrompt = chunkEKSPLMYP_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const systemPrompt = chunkEKSPLMYP_cjs.getCombinedSystemPrompt(run.input) ?? "";
+ const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
  const analysis = results.analyzeStepResult;
  if (!analysis) {
  return `Unable to analyze prompt alignment. Score: ${score}`;
  }
- return createReasonPrompt5({
+ return createReasonPrompt6({
  userPrompt,
  systemPrompt,
  score,
@@ -2293,8 +2557,11 @@ function createPromptAlignmentScorerLLM({
  }

  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
+ exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
+ exports.ANSWER_SIMILARITY_INSTRUCTIONS = ANSWER_SIMILARITY_INSTRUCTIONS;
  exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
  exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
+ exports.createAnswerSimilarityScorer = createAnswerSimilarityScorer;
  exports.createBiasScorer = createBiasScorer;
  exports.createContextPrecisionScorer = createContextPrecisionScorer;
  exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
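
For reference, a minimal sketch of how the newly exported createAnswerSimilarityScorer might be wired up. The factory name, option names, and default values come from the diff above; the require subpath and the AI SDK model helper are assumptions, so adjust them to match how your project already imports the other @mastra/evals scorers.

// Sketch only: the import subpath and the openai() model helper are assumptions;
// createAnswerSimilarityScorer and its defaults are taken from the diff above.
const { createAnswerSimilarityScorer } = require('@mastra/evals/scorers/llm');
const { openai } = require('@ai-sdk/openai');

const scorer = createAnswerSimilarityScorer({
  model: openai('gpt-4o-mini'),
  options: {
    // These restate ANSWER_SIMILARITY_DEFAULT_OPTIONS; override as needed.
    requireGroundTruth: true, // preprocess throws if run.groundTruth is absent
    semanticThreshold: 0.8,   // credit per "semantic" match (half of it per "partial")
    exactMatchBonus: 0.2,     // each "exact" match scores 1 + bonus
    missingPenalty: 0.15,     // subtracted per missing ground-truth unit
    contradictionPenalty: 1,  // one contradiction can zero the normalized score
    extraInfoPenalty: 0.05,   // per extra output unit, capped at 0.2 total
    scale: 1,                 // final score is multiplied by this after clamping
  },
});

Per the generateScore step in the diff, each ground-truth unit contributes 1.2 (exact), 0.8 (semantic), 0.4 (partial), or -0.15 (missing) under the defaults; the sum is normalized by totalUnits * 1.2, the contradiction and extra-info penalties are subtracted, and the result is clamped to [0, 1] before scaling.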