@mastra/evals 0.13.5 → 0.13.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CHANGELOG.md +22 -0
  2. package/dist/{chunk-4LRZVFXR.js → chunk-KHEXN75Q.js} +72 -3
  3. package/dist/chunk-KHEXN75Q.js.map +1 -0
  4. package/dist/{chunk-EKSPLMYP.cjs → chunk-QKR2PMLZ.cjs} +79 -2
  5. package/dist/chunk-QKR2PMLZ.cjs.map +1 -0
  6. package/dist/{dist-QNM75ISG.cjs → dist-ALHZKHK6.cjs} +9 -9
  7. package/dist/{dist-QNM75ISG.cjs.map → dist-ALHZKHK6.cjs.map} +1 -1
  8. package/dist/{dist-KXHZV6E4.js → dist-HPW4UI62.js} +9 -9
  9. package/dist/{dist-KXHZV6E4.js.map → dist-HPW4UI62.js.map} +1 -1
  10. package/dist/index.cjs +1 -1
  11. package/dist/index.js +1 -1
  12. package/dist/scorers/code/index.cjs +2 -2
  13. package/dist/scorers/code/index.js +1 -1
  14. package/dist/scorers/llm/answer-similarity/index.d.ts +34 -0
  15. package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -0
  16. package/dist/scorers/llm/answer-similarity/prompts.d.ts +29 -0
  17. package/dist/scorers/llm/answer-similarity/prompts.d.ts.map +1 -0
  18. package/dist/scorers/llm/index.cjs +335 -68
  19. package/dist/scorers/llm/index.cjs.map +1 -1
  20. package/dist/scorers/llm/index.d.ts +1 -0
  21. package/dist/scorers/llm/index.d.ts.map +1 -1
  22. package/dist/scorers/llm/index.js +291 -27
  23. package/dist/scorers/llm/index.js.map +1 -1
  24. package/dist/scorers/utils.cjs +60 -0
  25. package/dist/scorers/utils.cjs.map +1 -0
  26. package/dist/scorers/utils.d.ts +1 -1
  27. package/dist/scorers/utils.d.ts.map +1 -1
  28. package/dist/scorers/utils.js +3 -0
  29. package/dist/scorers/utils.js.map +1 -0
  30. package/package.json +14 -4
  31. package/dist/chunk-4LRZVFXR.js.map +0 -1
  32. package/dist/chunk-EKSPLMYP.cjs.map +0 -1
@@ -1,4 +1,5 @@
  export * from './answer-relevancy/index.js';
+ export * from './answer-similarity/index.js';
  export * from './faithfulness/index.js';
  export * from './bias/index.js';
  export * from './hallucination/index.js';
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
@@ -1,5 +1,5 @@
  import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-4LRZVFXR.js';
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-KHEXN75Q.js';
  import { createScorer } from '@mastra/core/scores';
  import { z } from 'zod';
 
@@ -264,6 +264,270 @@ function createAnswerRelevancyScorer({
  });
  }
 
+ // src/scorers/llm/answer-similarity/prompts.ts
+ var createExtractPrompt2 = ({ output, groundTruth }) => `
+ Extract and normalize the semantic units (facts, claims, concepts) from both the agent output and the ground truth answer.
+
+ Break down each text into its core semantic components while preserving meaning and relationships.
+ Focus on extracting:
+ - Key facts and claims
+ - Important concepts and entities
+ - Relationships between concepts
+ - Quantitative information
+ - Qualitative descriptions
+
+ Guidelines:
+ - Preserve the semantic meaning, not just keywords
+ - Group related information together
+ - Normalize different phrasings of the same concept
+ - Keep numerical values and units together
+ - Don't over-split compound concepts that belong together
+
+ Return ONLY valid JSON with two arrays of semantic units. Do not include any text before or after the JSON.
+
+ Agent Output:
+ ${output}
+
+ Ground Truth:
+ ${groundTruth}
+
+ Required JSON format (return valid JSON only):
+ {
+ "outputUnits": [],
+ "groundTruthUnits": []
+ }
+
+ Important: Return valid JSON only, no additional text or explanations.
+ `;
+ var createAnalyzePrompt = ({
+ outputUnits,
+ groundTruthUnits
+ }) => `
+ Compare the semantic units from the agent output against the ground truth to evaluate answer similarity.
+
+ Analyze each ground truth unit and determine:
+ 1. Whether it has a matching unit in the output (exact or semantic match)
+ 2. The quality of the match (exact, semantic, partial, missing)
+ 3. Whether there are contradictions
+
+ Also identify:
+ - Extra information in the output not present in ground truth
+ - Any contradictory statements between output and ground truth
+
+ Matching Guidelines:
+ - "exact": The same information expressed identically or with minor wording differences
+ - "semantic": The same concept or fact expressed differently but with equivalent meaning
+ - "partial": Some overlap but missing important details or context
+ - "missing": No corresponding information found in the output
+ - "contradiction": Information that directly conflicts with the ground truth (wrong facts, incorrect names, false claims)
+
+ CRITICAL: If the output contains factually incorrect information (wrong names, wrong facts, opposite claims), you MUST identify contradictions and mark relevant matches as "missing" while adding entries to the contradictions array.
+
+ Return ONLY valid JSON with detailed analysis. Do not include any text before or after the JSON.
+
+ Output Units:
+ ${JSON.stringify(outputUnits, null, 2)}
+
+ Ground Truth Units:
+ ${JSON.stringify(groundTruthUnits, null, 2)}
+
+ Required JSON format (copy this structure exactly):
+ {
+ "matches": [
+ {
+ "groundTruthUnit": "unit from ground truth",
+ "outputUnit": "corresponding unit from output or null if missing",
+ "matchType": "exact",
+ "explanation": "brief explanation of the match quality"
+ }
+ ],
+ "extraInOutput": [],
+ "contradictions": []
+ }
+
+ Important:
+ - matchType must be exactly one of: "exact", "semantic", "partial", "missing"
+ - outputUnit must be a string or null (not undefined)
+ - All arrays must be present even if empty
+ - Return valid JSON only, no additional text
+ `;
+ var createReasonPrompt2 = ({
+ output,
+ groundTruth,
+ score,
+ analysis,
+ scale
+ }) => `
+ Generate a clear, actionable explanation of the answer similarity score.
+
+ Context:
+ - Agent Output: ${output}
+ - Ground Truth: ${groundTruth}
+ - Score: ${score}/${scale}
+ - Analysis: ${JSON.stringify(analysis, null, 2)}
+
+ Provide a concise explanation that:
+ 1. States the overall similarity level (high/moderate/low)
+ 2. Highlights what the agent got right
+ 3. Identifies key missing or incorrect information
+ 4. Suggests specific improvements if score is not perfect
+
+ Keep the explanation under 3 sentences and focus on actionable insights.
+
+ Format: "The score is {score}/{scale} because {explanation}. {what matched well}. {what needs improvement or is perfect}."
+
+ Example good responses:
+ - "The score is 0.9/1 because the answer captures all key concepts with minor phrasing differences. The agent correctly identified the main facts and relationships. Only missing a minor detail about the specific date mentioned in the ground truth."
+ - "The score is 0.5/1 because the answer is partially correct but missing crucial information. The agent correctly explained the basic concept. However, it missed the quantitative data and specific examples that were essential to the complete answer."
+ - "The score is 1.0/1 because the answer perfectly matches the ground truth semantically. All key facts, relationships, and details are accurately represented. No improvements needed."
+ `;
+
+ // src/scorers/llm/answer-similarity/index.ts
+ var ANSWER_SIMILARITY_DEFAULT_OPTIONS = {
+ requireGroundTruth: true,
+ semanticThreshold: 0.8,
+ exactMatchBonus: 0.2,
+ missingPenalty: 0.15,
+ contradictionPenalty: 1,
+ extraInfoPenalty: 0.05,
+ scale: 1
+ };
+ var ANSWER_SIMILARITY_INSTRUCTIONS = `
+ You are a precise answer similarity evaluator for CI/CD testing. Your role is to compare agent outputs against ground truth answers to ensure consistency and accuracy in automated testing.
+
+ Key Principles:
+ 1. Focus on semantic equivalence, not just string matching
+ 2. Recognize that different phrasings can convey the same information
+ 3. Identify missing critical information from the ground truth
+ 4. Detect contradictions between output and ground truth
+ 5. Provide actionable feedback for improving answer accuracy
+ 6. Be strict but fair - partial credit for partial matches
+ `;
+ var extractOutputSchema2 = z.object({
+ outputUnits: z.array(z.string()),
+ groundTruthUnits: z.array(z.string())
+ });
+ var analyzeOutputSchema = z.object({
+ matches: z.array(
+ z.object({
+ groundTruthUnit: z.string(),
+ outputUnit: z.string().nullable(),
+ matchType: z.enum(["exact", "semantic", "partial", "missing"]),
+ explanation: z.string()
+ })
+ ),
+ extraInOutput: z.array(z.string()),
+ contradictions: z.array(
+ z.object({
+ outputUnit: z.string(),
+ groundTruthUnit: z.string(),
+ explanation: z.string()
+ })
+ )
+ });
+ function createAnswerSimilarityScorer({
+ model,
+ options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
+ }) {
+ const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
+ return createScorer({
+ name: "Answer Similarity Scorer",
+ description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
+ judge: {
+ model,
+ instructions: ANSWER_SIMILARITY_INSTRUCTIONS
+ }
+ }).preprocess({
+ description: "Extract semantic units from output and ground truth",
+ outputSchema: extractOutputSchema2,
+ createPrompt: ({ run }) => {
+ if (!run.groundTruth) {
+ if (mergedOptions.requireGroundTruth) {
+ throw new Error("Answer Similarity Scorer requires ground truth to be provided");
+ }
+ return createExtractPrompt2({
+ output: "",
+ groundTruth: ""
+ });
+ }
+ const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+ const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+ return createExtractPrompt2({
+ output,
+ groundTruth
+ });
+ }
+ }).analyze({
+ description: "Compare semantic units between output and ground truth",
+ outputSchema: analyzeOutputSchema,
+ createPrompt: ({ results }) => {
+ const outputUnits = results.preprocessStepResult?.outputUnits || [];
+ const groundTruthUnits = results.preprocessStepResult?.groundTruthUnits || [];
+ return createAnalyzePrompt({
+ outputUnits,
+ groundTruthUnits
+ });
+ }
+ }).generateScore(({ run, results }) => {
+ if (!run.groundTruth) {
+ return 0;
+ }
+ const analysis = results.analyzeStepResult;
+ if (!analysis) {
+ return 0;
+ }
+ let score = 0;
+ const totalUnits = analysis.matches.length;
+ if (totalUnits === 0) {
+ return 0;
+ }
+ for (const match of analysis.matches) {
+ switch (match.matchType) {
+ case "exact":
+ score += 1 + mergedOptions.exactMatchBonus;
+ break;
+ case "semantic":
+ score += mergedOptions.semanticThreshold;
+ break;
+ case "partial":
+ score += mergedOptions.semanticThreshold * 0.5;
+ break;
+ case "missing":
+ score -= mergedOptions.missingPenalty;
+ break;
+ }
+ }
+ const maxPossibleScore = totalUnits * (1 + mergedOptions.exactMatchBonus);
+ score = score / maxPossibleScore;
+ const contradictionPenalty = analysis.contradictions.length * mergedOptions.contradictionPenalty;
+ score -= contradictionPenalty;
+ const extraInfoPenalty = Math.min(
+ analysis.extraInOutput.length * mergedOptions.extraInfoPenalty,
+ 0.2
+ // Cap extra info penalty at 0.2
+ );
+ score -= extraInfoPenalty;
+ score = Math.max(0, Math.min(1, score));
+ return roundToTwoDecimals(score * mergedOptions.scale);
+ }).generateReason({
+ description: "Generate explanation of similarity score",
+ createPrompt: ({ run, results, score }) => {
+ if (!run.groundTruth) {
+ return "No ground truth was provided for comparison. Score is 0 by default.";
+ }
+ const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+ const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+ return createReasonPrompt2({
+ output,
+ groundTruth,
+ score,
+ analysis: results.analyzeStepResult,
+ scale: mergedOptions.scale
+ });
+ }
+ });
+ }
+
  // src/scorers/llm/faithfulness/prompts.ts
  var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.
 
@@ -1016,7 +1280,7 @@ OUTPUT REQUIREMENTS:
 
  You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
  `;
- var createAnalyzePrompt = ({
+ var createAnalyzePrompt2 = ({
  userInput,
  agentResponse,
  toolsCalled,
@@ -1067,7 +1331,7 @@ STRICT EVALUATION CRITERIA:
  Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
  `;
  };
- var createReasonPrompt2 = ({
+ var createReasonPrompt3 = ({
  userInput,
  score,
  evaluations,
@@ -1086,7 +1350,7 @@ Provide a single, concise sentence explaining why this score was given.
  };
 
  // src/scorers/llm/tool-call-accuracy/index.ts
- var analyzeOutputSchema = z.object({
+ var analyzeOutputSchema2 = z.object({
  evaluations: z.array(
  z.object({
  toolCalled: z.string(),
@@ -1119,12 +1383,12 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  };
  }).analyze({
  description: "Analyze the appropriateness of tool selections",
- outputSchema: analyzeOutputSchema,
+ outputSchema: analyzeOutputSchema2,
  createPrompt: ({ run, results }) => {
  const userInput = getUserMessageFromRunInput(run.input) ?? "";
  const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
- return createAnalyzePrompt({
+ return createAnalyzePrompt2({
  userInput,
  agentResponse,
  toolsCalled,
@@ -1146,7 +1410,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  const userInput = getUserMessageFromRunInput(run.input) ?? "";
  const evaluations = results.analyzeStepResult?.evaluations || [];
  const missingTools = results.analyzeStepResult?.missingTools || [];
- return createReasonPrompt2({
+ return createReasonPrompt3({
  userInput,
  score,
  evaluations,
@@ -1173,7 +1437,7 @@ Evaluation Guidelines:
  - Consider whether missing context might have led to a better response
 
  Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
- function createAnalyzePrompt2({
+ function createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext
@@ -1258,7 +1522,7 @@ Context:
  "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
  }`;
  }
- function createReasonPrompt3({
+ function createReasonPrompt4({
  userQuery,
  score,
  evaluations,
@@ -1304,7 +1568,7 @@ Example responses:
  }
 
  // src/scorers/llm/context-relevance/index.ts
- var analyzeOutputSchema2 = z.object({
+ var analyzeOutputSchema3 = z.object({
  evaluations: z.array(
  z.object({
  context_index: z.number(),
@@ -1344,19 +1608,19 @@ function createContextRelevanceScorerLLM({
  }
  }).analyze({
  description: "Analyze the relevance and utility of provided context",
- outputSchema: analyzeOutputSchema2,
+ outputSchema: analyzeOutputSchema3,
  createPrompt: ({ run }) => {
  const userQuery = getUserMessageFromRunInput(run.input) ?? "";
  const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
- return createAnalyzePrompt2({
+ return createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext: ["[No context was provided for evaluation]"]
  });
  }
- return createAnalyzePrompt2({
+ return createAnalyzePrompt3({
  userQuery,
  agentResponse,
  providedContext: context
@@ -1406,7 +1670,7 @@ function createContextRelevanceScorerLLM({
  }
  const evaluations = results.analyzeStepResult?.evaluations || [];
  const missingContext = results.analyzeStepResult?.missingContext || [];
- return createReasonPrompt3({
+ return createReasonPrompt4({
  userQuery,
  score,
  evaluations,
@@ -1639,7 +1903,7 @@ Noise Impact Assessment:
  - **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed
 
  Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
- function createAnalyzePrompt3({
+ function createAnalyzePrompt4({
  userQuery,
  baselineResponse,
  noisyQuery,
@@ -1758,7 +2022,7 @@ Noisy Response: "Regular exercise improves cardiovascular health and strengthens
  "robustnessScore": 0.85
  }`;
  }
- function createReasonPrompt4({
+ function createReasonPrompt5({
  userQuery,
  score,
  dimensions,
@@ -1813,7 +2077,7 @@ Example responses:
  }
 
  // src/scorers/llm/noise-sensitivity/index.ts
- var analyzeOutputSchema3 = z.object({
+ var analyzeOutputSchema4 = z.object({
  dimensions: z.array(
  z.object({
  dimension: z.string(),
@@ -1857,14 +2121,14 @@ function createNoiseSensitivityScorerLLM({
  }
  }).analyze({
  description: "Analyze the impact of noise on agent response quality",
- outputSchema: analyzeOutputSchema3,
+ outputSchema: analyzeOutputSchema4,
  createPrompt: ({ run }) => {
  const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
  const noisyResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
  if (!originalQuery || !noisyResponse) {
  throw new Error("Both original query and noisy response are required for evaluation");
  }
- return createAnalyzePrompt3({
+ return createAnalyzePrompt4({
  userQuery: originalQuery,
  baselineResponse: options.baselineResponse,
  noisyQuery: options.noisyQuery,
@@ -1912,7 +2176,7 @@ function createNoiseSensitivityScorerLLM({
  if (!analysisResult) {
  throw new Error("Analysis step failed to produce results for reason generation");
  }
- return createReasonPrompt4({
+ return createReasonPrompt5({
  userQuery: originalQuery,
  score,
  dimensions: analysisResult.dimensions || [],
@@ -1941,7 +2205,7 @@ Evaluation Guidelines:
  - Be objective and focus on alignment rather than response quality
 
  Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
- function createAnalyzePrompt4({
+ function createAnalyzePrompt5({
  userPrompt,
  systemPrompt,
  agentResponse,
@@ -2093,7 +2357,7 @@ Agent Response: "def factorial(n):
  "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
  }`;
  }
- function createReasonPrompt5({
+ function createReasonPrompt6({
  userPrompt,
  systemPrompt,
  score,
@@ -2156,7 +2420,7 @@ Example responses:
  }
 
  // src/scorers/llm/prompt-alignment/index.ts
- var analyzeOutputSchema4 = z.object({
+ var analyzeOutputSchema5 = z.object({
  intentAlignment: z.object({
  score: z.number().min(0).max(1),
  primaryIntent: z.string(),
@@ -2228,7 +2492,7 @@ function createPromptAlignmentScorerLLM({
  }
  }).analyze({
  description: "Analyze prompt-response alignment across multiple dimensions",
- outputSchema: analyzeOutputSchema4,
+ outputSchema: analyzeOutputSchema5,
  createPrompt: ({ run }) => {
  const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
  const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
@@ -2245,7 +2509,7 @@ function createPromptAlignmentScorerLLM({
  if (!agentResponse) {
  throw new Error("Agent response is required for prompt alignment scoring");
  }
- return createAnalyzePrompt4({
+ return createAnalyzePrompt5({
  userPrompt,
  systemPrompt,
  agentResponse,
@@ -2278,7 +2542,7 @@ function createPromptAlignmentScorerLLM({
  if (!analysis) {
  return `Unable to analyze prompt alignment. Score: ${score}`;
  }
- return createReasonPrompt5({
+ return createReasonPrompt6({
  userPrompt,
  systemPrompt,
  score,
@@ -2290,6 +2554,6 @@ function createPromptAlignmentScorerLLM({
  });
  }
 
- export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
+ export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
  //# sourceMappingURL=index.js.map
  //# sourceMappingURL=index.js.map
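
Usage note: the headline addition in this version is the new createAnswerSimilarityScorer export (together with ANSWER_SIMILARITY_DEFAULT_OPTIONS and ANSWER_SIMILARITY_INSTRUCTIONS). The sketch below is a hypothetical TypeScript example based only on the code visible in this diff; the import subpath, the @ai-sdk/openai model helper, and the exact shapes of the run payload and result are assumptions that this diff does not confirm.

// Hypothetical usage sketch; not part of the published package contents.
import { openai } from '@ai-sdk/openai'; // assumed judge-model provider
import { createAnswerSimilarityScorer } from '@mastra/evals/scorers/llm'; // assumed subpath export

// Unspecified options fall back to ANSWER_SIMILARITY_DEFAULT_OPTIONS
// (semanticThreshold 0.8, exactMatchBonus 0.2, missingPenalty 0.15,
// contradictionPenalty 1, extraInfoPenalty 0.05, scale 1).
const scorer = createAnswerSimilarityScorer({
  model: openai('gpt-4o-mini'),
  options: { missingPenalty: 0.2 },
});

// requireGroundTruth defaults to true, so the preprocess step throws if groundTruth is missing.
const result = await scorer.run({
  input: [{ role: 'user', content: 'What is the boiling point of water at sea level?' }], // assumed run shape
  output: [{ role: 'assistant', content: 'Water boils at 100°C (212°F) at sea level.' }],
  groundTruth: 'Water boils at 100 degrees Celsius at sea level.',
});

console.log(result.score);  // 0..1 (times scale), rounded to two decimals
console.log(result.reason); // explanation produced by the generateReason step

Scoring, per the generateScore step in the diff above: each ground-truth unit contributes 1 + exactMatchBonus for an exact match, semanticThreshold for a semantic match, half of semanticThreshold for a partial match, and subtracts missingPenalty when missing; the sum is divided by totalUnits * (1 + exactMatchBonus), contradictions and extra output units subtract further penalties (the extra-info penalty is capped at 0.2), and the result is clamped to [0, 1] and multiplied by scale.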