@mastra/evals 0.0.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/CHANGELOG.md +10 -0
  2. package/LICENSE +44 -0
  3. package/dist/evals.cjs.development.js +1510 -0
  4. package/dist/evals.cjs.development.js.map +1 -0
  5. package/dist/evals.cjs.production.min.js +2 -0
  6. package/dist/evals.cjs.production.min.js.map +1 -0
  7. package/dist/evals.esm.js +1497 -0
  8. package/dist/evals.esm.js.map +1 -0
  9. package/dist/evaluation.d.ts +3 -0
  10. package/dist/evaluation.d.ts.map +1 -0
  11. package/dist/index.d.ts +3 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +8 -0
  14. package/dist/judge/index.d.ts +6 -0
  15. package/dist/judge/index.d.ts.map +1 -0
  16. package/dist/metrics/answer-relevancy/index.d.ts +17 -0
  17. package/dist/metrics/answer-relevancy/index.d.ts.map +1 -0
  18. package/dist/metrics/answer-relevancy/metricJudge.d.ts +11 -0
  19. package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -0
  20. package/dist/metrics/answer-relevancy/prompts.d.ts +15 -0
  21. package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -0
  22. package/dist/metrics/completeness/index.d.ts +12 -0
  23. package/dist/metrics/completeness/index.d.ts.map +1 -0
  24. package/dist/metrics/content-similarity/index.d.ts +11 -0
  25. package/dist/metrics/content-similarity/index.d.ts.map +1 -0
  26. package/dist/metrics/context-position/index.d.ts +15 -0
  27. package/dist/metrics/context-position/index.d.ts.map +1 -0
  28. package/dist/metrics/context-position/metricJudge.d.ts +14 -0
  29. package/dist/metrics/context-position/metricJudge.d.ts.map +1 -0
  30. package/dist/metrics/context-position/prompts.d.ts +16 -0
  31. package/dist/metrics/context-position/prompts.d.ts.map +1 -0
  32. package/dist/metrics/context-precision/index.d.ts +15 -0
  33. package/dist/metrics/context-precision/index.d.ts.map +1 -0
  34. package/dist/metrics/context-precision/metricJudge.d.ts +15 -0
  35. package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -0
  36. package/dist/metrics/context-precision/prompts.d.ts +16 -0
  37. package/dist/metrics/context-precision/prompts.d.ts.map +1 -0
  38. package/dist/metrics/difference/index.d.ts +9 -0
  39. package/dist/metrics/difference/index.d.ts.map +1 -0
  40. package/dist/metrics/index.d.ts +10 -0
  41. package/dist/metrics/index.d.ts.map +1 -0
  42. package/dist/metrics/keyword-coverage/index.d.ts +9 -0
  43. package/dist/metrics/keyword-coverage/index.d.ts.map +1 -0
  44. package/dist/metrics/prompt-alignment/index.d.ts +17 -0
  45. package/dist/metrics/prompt-alignment/index.d.ts.map +1 -0
  46. package/dist/metrics/prompt-alignment/metricJudge.d.ts +11 -0
  47. package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -0
  48. package/dist/metrics/prompt-alignment/prompts.d.ts +13 -0
  49. package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -0
  50. package/dist/metrics/tone/index.d.ts +10 -0
  51. package/dist/metrics/tone/index.d.ts.map +1 -0
  52. package/dist/metrics/types.d.ts +12 -0
  53. package/dist/metrics/types.d.ts.map +1 -0
  54. package/jest.config.ts +19 -0
  55. package/package.json +51 -0
  56. package/src/evaluation.test.ts +32 -0
  57. package/src/evaluation.ts +20 -0
  58. package/src/index.ts +2 -0
  59. package/src/judge/index.ts +13 -0
  60. package/src/metrics/answer-relevancy/index.test.ts +193 -0
  61. package/src/metrics/answer-relevancy/index.ts +80 -0
  62. package/src/metrics/answer-relevancy/metricJudge.ts +49 -0
  63. package/src/metrics/answer-relevancy/prompts.ts +179 -0
  64. package/src/metrics/completeness/index.test.ts +96 -0
  65. package/src/metrics/completeness/index.ts +112 -0
  66. package/src/metrics/content-similarity/index.test.ts +107 -0
  67. package/src/metrics/content-similarity/index.ts +41 -0
  68. package/src/metrics/context-position/index.test.ts +292 -0
  69. package/src/metrics/context-position/index.ts +63 -0
  70. package/src/metrics/context-position/metricJudge.ts +54 -0
  71. package/src/metrics/context-position/prompts.ts +123 -0
  72. package/src/metrics/context-precision/index.test.ts +249 -0
  73. package/src/metrics/context-precision/index.ts +62 -0
  74. package/src/metrics/context-precision/metricJudge.ts +55 -0
  75. package/src/metrics/context-precision/prompts.ts +111 -0
  76. package/src/metrics/difference/index.test.ts +116 -0
  77. package/src/metrics/difference/index.ts +31 -0
  78. package/src/metrics/index.ts +9 -0
  79. package/src/metrics/keyword-coverage/index.test.ts +114 -0
  80. package/src/metrics/keyword-coverage/index.ts +47 -0
  81. package/src/metrics/prompt-alignment/index.test.ts +46 -0
  82. package/src/metrics/prompt-alignment/index.ts +66 -0
  83. package/src/metrics/prompt-alignment/metricJudge.ts +41 -0
  84. package/src/metrics/prompt-alignment/prompts.ts +102 -0
  85. package/src/metrics/tone/index.test.ts +123 -0
  86. package/src/metrics/tone/index.ts +47 -0
  87. package/src/metrics/types.ts +13 -0
  88. package/tsconfig.json +10 -0
@@ -0,0 +1,111 @@
1
+ export const CONTEXT_PRECISION_AGENT_INSTRUCTIONS = `You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.
2
+
3
+ Key Principles:
4
+ 1. Evaluate whether each context node was useful in generating the expected output
5
+ 2. Consider both direct and indirect relevance
6
+ 3. Prioritize usefulness over completeness
7
+ 4. Recognize that some nodes may be partially relevant
8
+ 5. Empty or error nodes should be marked as not relevant`;
9
+
10
+ export function generateEvaluatePrompt({
11
+ input,
12
+ output,
13
+ context,
14
+ }: {
15
+ input: string;
16
+ output: string;
17
+ context: string[];
18
+ }) {
19
+ return `Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.
20
+
21
+ **
22
+ IMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: \`verdict\` with either 'yes' or 'no', and \`reason\` explaining the verdict. Your reason should include relevant quotes from the context.
23
+
24
+ Example Context: ["The Sun is a star", "Stars produce their own light", "The Moon reflects sunlight"]
25
+ Example Query: "What is the Sun?"
26
+ Example Expected Response: "The Sun is a star that produces light."
27
+
28
+ Example:
29
+ {
30
+ "verdicts": [
31
+ {
32
+ "verdict": "yes",
33
+ "reason": "The context 'The Sun is a star' directly defines what the Sun is."
34
+ },
35
+ {
36
+ "verdict": "yes",
37
+ "reason": "The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun."
38
+ },
39
+ {
40
+ "verdict": "no",
41
+ "reason": "The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is."
42
+ }
43
+ ]
44
+ }
45
+
46
+ Consider context relevant if it:
47
+ - Directly addresses the query
48
+ - Provides examples or instances that help explain the concept
49
+ - Offers related information that helps build understanding
50
+ - Contains partial information that contributes to the response
51
+
52
+ The number of verdicts MUST MATCH the number of context pieces exactly.
53
+ **
54
+
55
+ Input:
56
+ ${input}
57
+
58
+ Output:
59
+ ${output}
60
+
61
+ Context:
62
+ ${context}
63
+
64
+ JSON:
65
+ `;
66
+ }
67
+
68
+ export function generateReasonPrompt({
69
+ input,
70
+ output,
71
+ verdicts,
72
+ score,
73
+ }: {
74
+ input: string;
75
+ output: string;
76
+ verdicts: Array<{ verdict: string; reason: string }>;
77
+ score: number;
78
+ }) {
79
+ return `Given the input, output, verdicts, and precision score, provide a BRIEF explanation for the score. Explain both its strengths and limitations.
80
+ The retrieved contexts is a list containing \`verdict\` ('yes' or 'no' for relevance), \`reason\` (explaining the verdict) and \`node\` (the context text). Contexts are listed in their ranking order.
81
+
82
+ **
83
+ IMPORTANT: Return only JSON format with a single 'reason' key explaining the score.
84
+ Example JSON:
85
+ {
86
+ "reason": "The score is <score> because <explanation>."
87
+ }
88
+
89
+ Guidelines:
90
+ - Don't mention 'verdict' - refer to relevant/irrelevant nodes instead
91
+ - Use information from the \`reason\` field, not the field itself
92
+ - Reference node positions (first, second, etc.) when explaining relevance
93
+ - For perfect scores (10.0), emphasize both relevance and optimal ordering
94
+ - Always reference the ranking order when discussing relevance
95
+ **
96
+
97
+ Precision Score:
98
+ ${score}
99
+
100
+ Input:
101
+ ${input}
102
+
103
+ Output:
104
+ ${output}
105
+
106
+ Context:
107
+ ${verdicts}
108
+
109
+ JSON:
110
+ `;
111
+ }
@@ -0,0 +1,116 @@
1
+ import { it, expect } from '@jest/globals';
2
+
3
+ import { DifferenceMetric } from './index';
4
+
5
+ describe('DifferenceMetric', () => {
6
+ const metric = new DifferenceMetric();
7
+
8
+ it('should return perfect match for identical strings', async () => {
9
+ const result = await metric.measure({
10
+ input: 'The quick brown fox',
11
+ output: 'The quick brown fox',
12
+ });
13
+ expect(result.score).toBe(1);
14
+ expect(result.confidence).toBe(1);
15
+ expect(result.metrics!.changes).toBe(0);
16
+ });
17
+
18
+ it('should handle small differences', async () => {
19
+ const result = await metric.measure({
20
+ input: 'The quick brown fox',
21
+ output: 'The quick brown cat',
22
+ });
23
+ expect(result.score).toBeGreaterThan(0.8);
24
+ expect(result.metrics!.changes).toBe(1);
25
+ });
26
+
27
+ it('should handle word additions', async () => {
28
+ const result = await metric.measure({
29
+ input: 'The quick brown fox',
30
+ output: 'The very quick brown fox',
31
+ });
32
+ expect(result.score).toBeGreaterThan(0.7);
33
+ expect(result.metrics!.changes).toBe(1);
34
+ });
35
+
36
+ it('should handle word deletions', async () => {
37
+ const result = await metric.measure({
38
+ input: 'The quick brown fox jumps',
39
+ output: 'The quick fox jumps',
40
+ });
41
+ expect(result.score).toBeGreaterThan(0.7);
42
+ expect(result.metrics!.changes).toBe(1);
43
+ });
44
+
45
+ it('should handle multiple changes', async () => {
46
+ const result = await metric.measure({
47
+ input: 'The quick brown fox jumps over the lazy dog',
48
+ output: 'The slow black fox runs under the active cat',
49
+ });
50
+ expect(result.score).toBeGreaterThan(0.4);
51
+ expect(result.score).toBeLessThan(0.7);
52
+ expect(result.metrics!.changes).toBeGreaterThan(3);
53
+ });
54
+
55
+ it('should handle completely different strings', async () => {
56
+ const result = await metric.measure({
57
+ input: 'The quick brown fox',
58
+ output: 'Lorem ipsum dolor sit amet',
59
+ });
60
+ expect(result.score).toBeLessThan(0.3);
61
+ expect(result.metrics!.changes).toBeGreaterThan(3);
62
+ });
63
+
64
+ it('should handle empty strings', async () => {
65
+ const result = await metric.measure({
66
+ input: '',
67
+ output: '',
68
+ });
69
+ expect(result.score).toBe(1);
70
+ expect(result.confidence).toBe(1);
71
+ expect(result.metrics!.changes).toBe(0);
72
+ expect(result.metrics!.lengthDiff).toBe(0);
73
+ });
74
+
75
+ it('should handle one empty string', async () => {
76
+ const result = await metric.measure({
77
+ input: 'The quick brown fox',
78
+ output: '',
79
+ });
80
+ expect(result.score).toBe(0);
81
+ expect(result.confidence).toBe(0);
82
+ expect(result.metrics!.changes).toBeGreaterThan(0);
83
+ expect(result.metrics!.lengthDiff).toBe(1);
84
+ });
85
+
86
+ it('should handle case sensitivity', async () => {
87
+ const result = await metric.measure({
88
+ input: 'The Quick Brown Fox',
89
+ output: 'the quick brown fox',
90
+ });
91
+ expect(result.score).toBeLessThan(1);
92
+ expect(result.metrics!.changes).toBeGreaterThan(0);
93
+ });
94
+
95
+ it('should handle whitespace sensitivity', async () => {
96
+ const result = await metric.measure({
97
+ input: 'The quick\nbrown fox',
98
+ output: 'The quick brown fox',
99
+ });
100
+ expect(result.score).toBeLessThan(1);
101
+ expect(result.metrics!.changes).toBeGreaterThan(0);
102
+ });
103
+
104
+ it('should include difference details in result', async () => {
105
+ const result = await metric.measure({
106
+ input: 'The quick brown fox',
107
+ output: 'The quick brown fox',
108
+ });
109
+ expect(result.details).toBe('Difference score: 100.0% with 0 changes');
110
+ expect(result.metrics!).toEqual({
111
+ ratio: 1,
112
+ changes: 0,
113
+ lengthDiff: 0,
114
+ });
115
+ });
116
+ });
@@ -0,0 +1,31 @@
1
+ import { Metric } from '@mastra/core';
2
+ import { SequenceMatcher } from 'difflib';
3
+
4
+ import { MetricScoringResult } from '../types';
5
+
6
+ export class DifferenceMetric extends Metric {
7
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
8
+ const matcher = new SequenceMatcher(null, input, output);
9
+ const ratio = matcher.ratio();
10
+
11
+ // Get detailed operations
12
+ const ops = matcher.getOpcodes();
13
+ const changes = ops.filter(([op]) => op !== 'equal').length;
14
+
15
+ // Calculate confidence based on text length difference
16
+ const maxLength = Math.max(input.length, output.length);
17
+ const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
18
+ const confidence = 1 - lengthDiff;
19
+
20
+ return {
21
+ score: ratio,
22
+ details: `Difference score: ${(ratio * 100).toFixed(1)}% with ${changes} changes`,
23
+ confidence,
24
+ metrics: {
25
+ ratio,
26
+ changes,
27
+ lengthDiff,
28
+ },
29
+ };
30
+ }
31
+ }
@@ -0,0 +1,9 @@
1
+ export { AnswerRelevancyMetric } from './answer-relevancy';
2
+ export { CompletenessMetric } from './completeness';
3
+ export { ContentSimilarityMetric } from './content-similarity';
4
+ export { ContextPositionMetric } from './context-position';
5
+ export { ContextPrecisionMetric } from './context-precision';
6
+ export { DifferenceMetric } from './difference';
7
+ export { KeywordCoverageMetric } from './keyword-coverage';
8
+ export { PromptAlignmentMetric } from './prompt-alignment';
9
+ export { ToneConsistencyMetric } from './tone';
@@ -0,0 +1,114 @@
1
+ import { it, expect } from '@jest/globals';
2
+
3
+ import { KeywordCoverageMetric } from './index';
4
+
5
+ describe('KeywordCoverageMetric', () => {
6
+ const metric = new KeywordCoverageMetric();
7
+
8
+ it('should return perfect coverage for identical text', async () => {
9
+ const result = await metric.measure({
10
+ input: 'The quick brown fox jumps over the lazy dog',
11
+ output: 'The quick brown fox jumps over the lazy dog',
12
+ });
13
+ expect(result.score).toBe(1);
14
+ expect(result.confidence).toBe(0.85);
15
+ const matched = result.metrics?.matchedKeywords as number;
16
+ const total = result.metrics?.totalKeywords as number;
17
+ expect(matched).toBeGreaterThan(0);
18
+ expect(total).toBeGreaterThan(0);
19
+ expect(matched).toBe(total);
20
+ });
21
+
22
+ it('should handle partial keyword coverage', async () => {
23
+ const result = await metric.measure({
24
+ input: 'The quick brown fox jumps over the lazy dog',
25
+ output: 'A quick brown fox runs past a sleeping cat',
26
+ });
27
+ expect(result.score).toBeGreaterThan(0.3);
28
+ expect(result.score).toBeLessThan(0.7);
29
+ const matched = result.metrics?.matchedKeywords as number;
30
+ const total = result.metrics?.totalKeywords as number;
31
+ expect(matched).toBeLessThan(total);
32
+ });
33
+
34
+ it('should ignore common words and stop words', async () => {
35
+ const result = await metric.measure({
36
+ input: 'The quick brown fox',
37
+ output: 'A quick brown fox',
38
+ });
39
+ expect(result.score).toBe(1); // "the" and "a" should be ignored
40
+ const matched = result.metrics?.matchedKeywords as number;
41
+ const total = result.metrics?.totalKeywords as number;
42
+ expect(matched).toBe(total);
43
+ });
44
+
45
+ it('should handle case differences', async () => {
46
+ const result = await metric.measure({
47
+ input: 'The Quick Brown Fox',
48
+ output: 'the quick brown fox',
49
+ });
50
+ expect(result.score).toBe(1);
51
+ const matched = result.metrics?.matchedKeywords as number;
52
+ const total = result.metrics?.totalKeywords as number;
53
+ expect(matched).toBe(total);
54
+ });
55
+
56
+ it('should handle empty strings', async () => {
57
+ const result = await metric.measure({
58
+ input: '',
59
+ output: '',
60
+ });
61
+ expect(result.score).toBe(1);
62
+ expect(result.metrics?.totalKeywords).toBe(0);
63
+ expect(result.metrics?.matchedKeywords).toBe(0);
64
+ });
65
+
66
+ it('should handle one empty string', async () => {
67
+ const result = await metric.measure({
68
+ input: 'The quick brown fox',
69
+ output: '',
70
+ });
71
+ expect(result.score).toBe(0);
72
+ expect(result.metrics?.matchedKeywords).toBe(0);
73
+ expect(result.metrics?.totalKeywords).toBeGreaterThan(0);
74
+ });
75
+
76
+ it('should ignore numbers by default', async () => {
77
+ const result = await metric.measure({
78
+ input: 'The 123 quick 456 brown fox',
79
+ output: 'The quick brown fox',
80
+ });
81
+ expect(result.score).toBe(1);
82
+ });
83
+
84
+ it('should handle special characters', async () => {
85
+ const result = await metric.measure({
86
+ input: 'The quick-brown fox!',
87
+ output: 'The quick brown fox',
88
+ });
89
+ // Hyphenated words are treated as separate keywords
90
+ expect(result.score).toBeGreaterThanOrEqual(0.5);
91
+ expect(result.score).toBeLessThan(1);
92
+ });
93
+
94
+ it('should handle completely different content', async () => {
95
+ const result = await metric.measure({
96
+ input: 'The quick brown fox jumps over the lazy dog',
97
+ output: 'Lorem ipsum dolor sit amet',
98
+ });
99
+ expect(result.score).toBe(0);
100
+ expect(result.metrics?.matchedKeywords).toBe(0);
101
+ });
102
+
103
+ it('should include coverage details in result', async () => {
104
+ const result = await metric.measure({
105
+ input: 'quick brown fox',
106
+ output: 'quick brown fox',
107
+ });
108
+ expect(result.details).toMatch(/Keyword coverage: 100.0% \(3\/3 keywords\)/);
109
+ expect(result.metrics).toEqual({
110
+ totalKeywords: 3,
111
+ matchedKeywords: 3,
112
+ });
113
+ });
114
+ });
@@ -0,0 +1,47 @@
1
+ import { Metric } from '@mastra/core';
2
+ import keyword_extractor from 'keyword-extractor';
3
+
4
+ import { MetricScoringResult } from '../types';
5
+
6
+ export class KeywordCoverageMetric extends Metric {
7
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
8
+ // Handle empty strings case
9
+ if (!input && !output) {
10
+ return {
11
+ score: 1,
12
+ details: 'Keyword coverage: 100.0% (0/0 keywords)',
13
+ confidence: 0.85,
14
+ metrics: {
15
+ totalKeywords: 0,
16
+ matchedKeywords: 0,
17
+ },
18
+ };
19
+ }
20
+
21
+ const extractKeywords = (text: string) => {
22
+ return keyword_extractor.extract(text, {
23
+ language: 'english',
24
+ remove_digits: true,
25
+ return_changed_case: true,
26
+ remove_duplicates: true,
27
+ });
28
+ };
29
+
30
+ const referenceKeywords = new Set(extractKeywords(input));
31
+ const responseKeywords = new Set(extractKeywords(output));
32
+
33
+ const matchedKeywords = [...referenceKeywords].filter(k => responseKeywords.has(k));
34
+ const totalKeywords = referenceKeywords.size;
35
+ const coverage = totalKeywords > 0 ? matchedKeywords.length / totalKeywords : 0;
36
+
37
+ return {
38
+ score: coverage,
39
+ details: `Keyword coverage: ${(coverage * 100).toFixed(1)}% (${matchedKeywords.length}/${referenceKeywords.size} keywords)`,
40
+ confidence: 0.85,
41
+ metrics: {
42
+ totalKeywords: referenceKeywords.size,
43
+ matchedKeywords: matchedKeywords.length,
44
+ },
45
+ };
46
+ }
47
+ }
@@ -0,0 +1,46 @@
1
+ import { it, expect, jest } from '@jest/globals';
2
+ import { type ModelConfig } from '@mastra/core';
3
+
4
+ import { PromptAlignmentMetric } from './index';
5
+
6
+ const SECONDS = 1000;
7
+ jest.setTimeout(15 * SECONDS);
8
+
9
+ const modelConfig: ModelConfig = {
10
+ provider: 'OPEN_AI',
11
+ name: 'gpt-4o',
12
+ toolChoice: 'auto',
13
+ apiKey: process.env.OPENAI_API_KEY,
14
+ };
15
+
16
+ it('should be able to measure prompt alignment', async () => {
17
+ const metric = new PromptAlignmentMetric(modelConfig, {
18
+ instructions: ['Reply in all uppercase'],
19
+ });
20
+
21
+ const result = await metric.measure({
22
+ input: `What if these shoes don't fit?`,
23
+ output: 'We offer a 30-day full refund at no extra cost.',
24
+ });
25
+
26
+ const resultUppercase = await metric.measure({
27
+ input: `What if these shoes don't fit?`,
28
+ output: 'We offer a 30-day full refund at no extra cost.'.toUpperCase(),
29
+ });
30
+
31
+ expect(resultUppercase.score).toBe(10);
32
+ expect(result.score).toBe(0);
33
+ });
34
+
35
+ it('should be able to measure prompt alignment with an array of instructions', async () => {
36
+ const metric = new PromptAlignmentMetric(modelConfig, {
37
+ instructions: ['Reply in all uppercase', 'Include baguettes in the response'],
38
+ });
39
+
40
+ const result = await metric.measure({
41
+ input: `What is the capital of France?`,
42
+ output: 'THE CAPITAL OF FRANCE IS BAGUETTE.',
43
+ });
44
+
45
+ expect(result.score).toBe(10);
46
+ });
@@ -0,0 +1,66 @@
1
+ import { Metric, MetricResult, ModelConfig } from '@mastra/core';
2
+
3
+ import { PromptAlignmentJudge } from './metricJudge';
4
+
5
+ export class PromptAlignmentMetric extends Metric {
6
+ private instructions: string[];
7
+ private judge: PromptAlignmentJudge;
8
+ private scale: number;
9
+
10
+ constructor(model: ModelConfig, { instructions, scale = 10 }: { instructions: string[]; scale?: number }) {
11
+ super();
12
+
13
+ this.instructions = instructions;
14
+ this.judge = new PromptAlignmentJudge(model);
15
+ this.scale = scale;
16
+ }
17
+
18
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricResult> {
19
+ const verdicts = await this.judge.evaluate(input, output, this.instructions);
20
+ const score = this.calculateScore(verdicts);
21
+
22
+ const reason = await this.generateReason(input, output, score, verdicts);
23
+
24
+ return {
25
+ score,
26
+ reason,
27
+ };
28
+ }
29
+
30
+ private async generateReason(
31
+ input: string,
32
+ output: string,
33
+ score: number,
34
+ verdicts: {
35
+ verdict: string;
36
+ reason: string;
37
+ }[],
38
+ ): Promise<string> {
39
+ const reasonsForVerdicts: string[] = [];
40
+ for (const { verdict, reason } of verdicts || []) {
41
+ if (verdict.trim().toLowerCase() === 'no') {
42
+ reasonsForVerdicts.push(reason);
43
+ }
44
+ }
45
+
46
+ const reason = await this.judge.getReason(input, output, score, reasonsForVerdicts);
47
+ return reason;
48
+ }
49
+
50
+ private calculateScore(evaluation: { verdict: string; reason: string }[]): number {
51
+ const numberOfVerdicts = evaluation?.length || 0;
52
+ if (numberOfVerdicts === 0) {
53
+ return 1;
54
+ }
55
+
56
+ let alignmentCount = 0;
57
+ for (const { verdict } of evaluation!) {
58
+ if (verdict.trim().toLowerCase() !== 'no') {
59
+ alignmentCount++;
60
+ }
61
+ }
62
+
63
+ const score = alignmentCount / numberOfVerdicts;
64
+ return score * this.scale;
65
+ }
66
+ }
@@ -0,0 +1,41 @@
1
+ import { ModelConfig } from '@mastra/core';
2
+ import { z } from 'zod';
3
+
4
+ import { MastraAgentJudge } from '../../judge';
5
+
6
+ import { generateEvaluatePrompt, generateReasonPrompt, PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS } from './prompts';
7
+
8
+ export class PromptAlignmentJudge extends MastraAgentJudge {
9
+ constructor(model: ModelConfig) {
10
+ super('Prompt Alignment', PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model);
11
+ }
12
+
13
+ async evaluate(
14
+ input: string,
15
+ actualOutput: string,
16
+ instructions: string[],
17
+ ): Promise<{ verdict: string; reason: string }[]> {
18
+ const prompt = generateEvaluatePrompt({ input, output: actualOutput, instructions });
19
+ const result = await this.agent.generate(prompt, {
20
+ output: z.object({
21
+ verdicts: z.array(
22
+ z.object({
23
+ verdict: z.string(),
24
+ reason: z.string(),
25
+ }),
26
+ ),
27
+ }),
28
+ });
29
+ return result.object.verdicts;
30
+ }
31
+
32
+ async getReason(input: string, actualOutput: string, score: number, reasons: string[]): Promise<string> {
33
+ const prompt = generateReasonPrompt({ input, output: actualOutput, reasons, score });
34
+ const result = await this.agent.generate(prompt, {
35
+ output: z.object({
36
+ reason: z.string(),
37
+ }),
38
+ });
39
+ return result.object.reason;
40
+ }
41
+ }
@@ -0,0 +1,102 @@
1
+ export const PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = `You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.
2
+
3
+ Key Principles:
4
+ 1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.
5
+ 2. Only give a "yes" verdict if an instruction is COMPLETELY followed
6
+ 3. Any partial compliance should be marked as "no"
7
+ 4. Provide clear, specific reasons for any "no" verdicts
8
+ 5. Focus solely on instruction compliance, not output quality
9
+
10
+ Remember:
11
+ - Each instruction must be evaluated independently
12
+ - Verdicts must be either "yes" or "no" - no in-between
13
+ - Reasons are required only for "no" verdicts
14
+ - The number of verdicts must match the number of instructions exactly`;
15
+
16
+ export function generateEvaluatePrompt({
17
+ instructions,
18
+ input,
19
+ output,
20
+ }: {
21
+ instructions: string[];
22
+ input: string;
23
+ output: string;
24
+ }) {
25
+ return `For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.
26
+ Make sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.
27
+ Generate a list of verdicts in JSON format, where each verdict must have:
28
+ - "verdict": Strictly "yes" or "no"
29
+ - "reason": Give a reason for the verdict
30
+
31
+ Be EXTRA STRICT in your evaluation. Only give "yes" if the instruction is followed COMPLETELY.
32
+ Evaluate the output EXACTLY as written - consider every character, space, and case
33
+
34
+ Example:
35
+ Input: "describe the sky"
36
+ Output: "the sky is Blue today"
37
+ Instructions: ["Start sentences with capital letters", "Use proper English"]
38
+
39
+ {
40
+ "verdicts": [
41
+ {
42
+ "verdict": "no",
43
+ "reason": "The sentence 'the sky is Blue' starts with lowercase 't'"
44
+ },
45
+ {
46
+ "verdict": "no",
47
+ "reason": "Improper capitalization: 'Blue' is capitalized mid-sentence"
48
+ }
49
+ ]
50
+ }
51
+
52
+ Prompt Instructions:
53
+ ${instructions.join('\n')}
54
+
55
+ Input:
56
+ ${input}
57
+
58
+ LLM Actual Output:
59
+ ${output}
60
+
61
+ JSON:`;
62
+ }
63
+
64
+ export function generateReasonPrompt({
65
+ input,
66
+ output,
67
+ score,
68
+ reasons,
69
+ }: {
70
+ input: string;
71
+ output: string;
72
+ score: number;
73
+ reasons: string[];
74
+ }) {
75
+ return `Explain the instruction following score (0-10) for the LLM's response using this context:
76
+ Context:
77
+ Input: ${input}
78
+ Output: ${output}
79
+ Score: ${score}
80
+ Failure Reasons: ${reasons.join('\n')}
81
+
82
+ Rules (follow these rules exactly. do not deviate):
83
+ - Keep your response concise and to the point.
84
+ - Do not change score from what is given.
85
+ - Do not make judgements on inputs or outputs (factual correctness, quality, etc).
86
+ - If there are failure reasons given, explain why the score is not higher.
87
+
88
+
89
+ Output format:
90
+ {
91
+ "reason": "The score is {score} because {explanation of instruction following}"
92
+ }
93
+
94
+ Example Responses:
95
+ {
96
+ "reason": "The score is 10 because the output follows the instructions exactly"
97
+ }
98
+ {
99
+ "reason": "The score is 0 because the output does not follow the instructions"
100
+ }
101
+ `;
102
+ }