@mastra/evals 0.0.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/LICENSE +44 -0
- package/dist/evals.cjs.development.js +1510 -0
- package/dist/evals.cjs.development.js.map +1 -0
- package/dist/evals.cjs.production.min.js +2 -0
- package/dist/evals.cjs.production.min.js.map +1 -0
- package/dist/evals.esm.js +1497 -0
- package/dist/evals.esm.js.map +1 -0
- package/dist/evaluation.d.ts +3 -0
- package/dist/evaluation.d.ts.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/judge/index.d.ts +6 -0
- package/dist/judge/index.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/index.d.ts +17 -0
- package/dist/metrics/answer-relevancy/index.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/metricJudge.d.ts +11 -0
- package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/prompts.d.ts +15 -0
- package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -0
- package/dist/metrics/completeness/index.d.ts +12 -0
- package/dist/metrics/completeness/index.d.ts.map +1 -0
- package/dist/metrics/content-similarity/index.d.ts +11 -0
- package/dist/metrics/content-similarity/index.d.ts.map +1 -0
- package/dist/metrics/context-position/index.d.ts +15 -0
- package/dist/metrics/context-position/index.d.ts.map +1 -0
- package/dist/metrics/context-position/metricJudge.d.ts +14 -0
- package/dist/metrics/context-position/metricJudge.d.ts.map +1 -0
- package/dist/metrics/context-position/prompts.d.ts +16 -0
- package/dist/metrics/context-position/prompts.d.ts.map +1 -0
- package/dist/metrics/context-precision/index.d.ts +15 -0
- package/dist/metrics/context-precision/index.d.ts.map +1 -0
- package/dist/metrics/context-precision/metricJudge.d.ts +15 -0
- package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -0
- package/dist/metrics/context-precision/prompts.d.ts +16 -0
- package/dist/metrics/context-precision/prompts.d.ts.map +1 -0
- package/dist/metrics/difference/index.d.ts +9 -0
- package/dist/metrics/difference/index.d.ts.map +1 -0
- package/dist/metrics/index.d.ts +10 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/keyword-coverage/index.d.ts +9 -0
- package/dist/metrics/keyword-coverage/index.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/index.d.ts +17 -0
- package/dist/metrics/prompt-alignment/index.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/metricJudge.d.ts +11 -0
- package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/prompts.d.ts +13 -0
- package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -0
- package/dist/metrics/tone/index.d.ts +10 -0
- package/dist/metrics/tone/index.d.ts.map +1 -0
- package/dist/metrics/types.d.ts +12 -0
- package/dist/metrics/types.d.ts.map +1 -0
- package/jest.config.ts +19 -0
- package/package.json +51 -0
- package/src/evaluation.test.ts +32 -0
- package/src/evaluation.ts +20 -0
- package/src/index.ts +2 -0
- package/src/judge/index.ts +13 -0
- package/src/metrics/answer-relevancy/index.test.ts +193 -0
- package/src/metrics/answer-relevancy/index.ts +80 -0
- package/src/metrics/answer-relevancy/metricJudge.ts +49 -0
- package/src/metrics/answer-relevancy/prompts.ts +179 -0
- package/src/metrics/completeness/index.test.ts +96 -0
- package/src/metrics/completeness/index.ts +112 -0
- package/src/metrics/content-similarity/index.test.ts +107 -0
- package/src/metrics/content-similarity/index.ts +41 -0
- package/src/metrics/context-position/index.test.ts +292 -0
- package/src/metrics/context-position/index.ts +63 -0
- package/src/metrics/context-position/metricJudge.ts +54 -0
- package/src/metrics/context-position/prompts.ts +123 -0
- package/src/metrics/context-precision/index.test.ts +249 -0
- package/src/metrics/context-precision/index.ts +62 -0
- package/src/metrics/context-precision/metricJudge.ts +55 -0
- package/src/metrics/context-precision/prompts.ts +111 -0
- package/src/metrics/difference/index.test.ts +116 -0
- package/src/metrics/difference/index.ts +31 -0
- package/src/metrics/index.ts +9 -0
- package/src/metrics/keyword-coverage/index.test.ts +114 -0
- package/src/metrics/keyword-coverage/index.ts +47 -0
- package/src/metrics/prompt-alignment/index.test.ts +46 -0
- package/src/metrics/prompt-alignment/index.ts +66 -0
- package/src/metrics/prompt-alignment/metricJudge.ts +41 -0
- package/src/metrics/prompt-alignment/prompts.ts +102 -0
- package/src/metrics/tone/index.test.ts +123 -0
- package/src/metrics/tone/index.ts +47 -0
- package/src/metrics/types.ts +13 -0
- package/tsconfig.json +10 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
 * System instructions for the context-precision judge agent.
 *
 * Frames the LLM as a "balanced and nuanced" evaluator that decides, per
 * retrieved context node, whether the node was useful for producing the
 * expected output. Consumed by the context-precision metric's judge when
 * the agent is constructed.
 */
export const CONTEXT_PRECISION_AGENT_INSTRUCTIONS = `You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.

Key Principles:
1. Evaluate whether each context node was useful in generating the expected output
2. Consider both direct and indirect relevance
3. Prioritize usefulness over completeness
4. Recognize that some nodes may be partially relevant
5. Empty or error nodes should be marked as not relevant`;
|
|
9
|
+
|
|
10
|
+
/**
 * Builds the evaluation prompt sent to the context-precision judge.
 *
 * The prompt asks for a JSON object with a `verdicts` array — one
 * `{ verdict: 'yes' | 'no', reason }` entry per context piece, in order.
 *
 * @param input - The original user query.
 * @param output - The expected (reference) response.
 * @param context - Retrieved context pieces, in ranking order.
 * @returns The fully interpolated prompt string.
 */
export function generateEvaluatePrompt({
  input,
  output,
  context,
}: {
  input: string;
  output: string;
  context: string[];
}) {
  // Note: `${context}` relies on default Array-to-string conversion, so the
  // pieces are joined with commas and no numbering or quoting.
  return `Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.

**
IMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: \`verdict\` with either 'yes' or 'no', and \`reason\` explaining the verdict. Your reason should include relevant quotes from the context.

Example Context: ["The Sun is a star", "Stars produce their own light", "The Moon reflects sunlight"]
Example Query: "What is the Sun?"
Example Expected Response: "The Sun is a star that produces light."

Example:
{
    "verdicts": [
        {
            "verdict": "yes",
            "reason": "The context 'The Sun is a star' directly defines what the Sun is."
        },
        {
            "verdict": "yes",
            "reason": "The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun."
        },
        {
            "verdict": "no",
            "reason": "The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is."
        }
    ]
}

Consider context relevant if it:
- Directly addresses the query
- Provides examples or instances that help explain the concept
- Offers related information that helps build understanding
- Contains partial information that contributes to the response

The number of verdicts MUST MATCH the number of context pieces exactly.
**

Input:
${input}

Output:
${output}

Context:
${context}

JSON:
`;
}
|
|
67
|
+
|
|
68
|
+
export function generateReasonPrompt({
|
|
69
|
+
input,
|
|
70
|
+
output,
|
|
71
|
+
verdicts,
|
|
72
|
+
score,
|
|
73
|
+
}: {
|
|
74
|
+
input: string;
|
|
75
|
+
output: string;
|
|
76
|
+
verdicts: Array<{ verdict: string; reason: string }>;
|
|
77
|
+
score: number;
|
|
78
|
+
}) {
|
|
79
|
+
return `Given the input, output, verdicts, and precision score, provide a BRIEF explanation for the score. Explain both its strengths and limitations.
|
|
80
|
+
The retrieved contexts is a list containing \`verdict\` ('yes' or 'no' for relevance), \`reason\` (explaining the verdict) and \`node\` (the context text). Contexts are listed in their ranking order.
|
|
81
|
+
|
|
82
|
+
**
|
|
83
|
+
IMPORTANT: Return only JSON format with a single 'reason' key explaining the score.
|
|
84
|
+
Example JSON:
|
|
85
|
+
{
|
|
86
|
+
"reason": "The score is <score> because <explanation>."
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
Guidelines:
|
|
90
|
+
- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead
|
|
91
|
+
- Use information from the \`reason\` field, not the field itself
|
|
92
|
+
- Reference node positions (first, second, etc.) when explaining relevance
|
|
93
|
+
- For perfect scores (10.0), emphasize both relevance and optimal ordering
|
|
94
|
+
- Always reference the ranking order when discussing relevance
|
|
95
|
+
**
|
|
96
|
+
|
|
97
|
+
Precision Score:
|
|
98
|
+
${score}
|
|
99
|
+
|
|
100
|
+
Input:
|
|
101
|
+
${input}
|
|
102
|
+
|
|
103
|
+
Output:
|
|
104
|
+
${output}
|
|
105
|
+
|
|
106
|
+
Context:
|
|
107
|
+
${verdicts}
|
|
108
|
+
|
|
109
|
+
JSON:
|
|
110
|
+
`;
|
|
111
|
+
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { it, expect } from '@jest/globals';
|
|
2
|
+
|
|
3
|
+
import { DifferenceMetric } from './index';
|
|
4
|
+
|
|
5
|
+
describe('DifferenceMetric', () => {
|
|
6
|
+
const metric = new DifferenceMetric();
|
|
7
|
+
|
|
8
|
+
it('should return perfect match for identical strings', async () => {
|
|
9
|
+
const result = await metric.measure({
|
|
10
|
+
input: 'The quick brown fox',
|
|
11
|
+
output: 'The quick brown fox',
|
|
12
|
+
});
|
|
13
|
+
expect(result.score).toBe(1);
|
|
14
|
+
expect(result.confidence).toBe(1);
|
|
15
|
+
expect(result.metrics!.changes).toBe(0);
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
it('should handle small differences', async () => {
|
|
19
|
+
const result = await metric.measure({
|
|
20
|
+
input: 'The quick brown fox',
|
|
21
|
+
output: 'The quick brown cat',
|
|
22
|
+
});
|
|
23
|
+
expect(result.score).toBeGreaterThan(0.8);
|
|
24
|
+
expect(result.metrics!.changes).toBe(1);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it('should handle word additions', async () => {
|
|
28
|
+
const result = await metric.measure({
|
|
29
|
+
input: 'The quick brown fox',
|
|
30
|
+
output: 'The very quick brown fox',
|
|
31
|
+
});
|
|
32
|
+
expect(result.score).toBeGreaterThan(0.7);
|
|
33
|
+
expect(result.metrics!.changes).toBe(1);
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('should handle word deletions', async () => {
|
|
37
|
+
const result = await metric.measure({
|
|
38
|
+
input: 'The quick brown fox jumps',
|
|
39
|
+
output: 'The quick fox jumps',
|
|
40
|
+
});
|
|
41
|
+
expect(result.score).toBeGreaterThan(0.7);
|
|
42
|
+
expect(result.metrics!.changes).toBe(1);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it('should handle multiple changes', async () => {
|
|
46
|
+
const result = await metric.measure({
|
|
47
|
+
input: 'The quick brown fox jumps over the lazy dog',
|
|
48
|
+
output: 'The slow black fox runs under the active cat',
|
|
49
|
+
});
|
|
50
|
+
expect(result.score).toBeGreaterThan(0.4);
|
|
51
|
+
expect(result.score).toBeLessThan(0.7);
|
|
52
|
+
expect(result.metrics!.changes).toBeGreaterThan(3);
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
it('should handle completely different strings', async () => {
|
|
56
|
+
const result = await metric.measure({
|
|
57
|
+
input: 'The quick brown fox',
|
|
58
|
+
output: 'Lorem ipsum dolor sit amet',
|
|
59
|
+
});
|
|
60
|
+
expect(result.score).toBeLessThan(0.3);
|
|
61
|
+
expect(result.metrics!.changes).toBeGreaterThan(3);
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it('should handle empty strings', async () => {
|
|
65
|
+
const result = await metric.measure({
|
|
66
|
+
input: '',
|
|
67
|
+
output: '',
|
|
68
|
+
});
|
|
69
|
+
expect(result.score).toBe(1);
|
|
70
|
+
expect(result.confidence).toBe(1);
|
|
71
|
+
expect(result.metrics!.changes).toBe(0);
|
|
72
|
+
expect(result.metrics!.lengthDiff).toBe(0);
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
it('should handle one empty string', async () => {
|
|
76
|
+
const result = await metric.measure({
|
|
77
|
+
input: 'The quick brown fox',
|
|
78
|
+
output: '',
|
|
79
|
+
});
|
|
80
|
+
expect(result.score).toBe(0);
|
|
81
|
+
expect(result.confidence).toBe(0);
|
|
82
|
+
expect(result.metrics!.changes).toBeGreaterThan(0);
|
|
83
|
+
expect(result.metrics!.lengthDiff).toBe(1);
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
it('should handle case sensitivity', async () => {
|
|
87
|
+
const result = await metric.measure({
|
|
88
|
+
input: 'The Quick Brown Fox',
|
|
89
|
+
output: 'the quick brown fox',
|
|
90
|
+
});
|
|
91
|
+
expect(result.score).toBeLessThan(1);
|
|
92
|
+
expect(result.metrics!.changes).toBeGreaterThan(0);
|
|
93
|
+
});
|
|
94
|
+
|
|
95
|
+
it('should handle whitespace sensitivity', async () => {
|
|
96
|
+
const result = await metric.measure({
|
|
97
|
+
input: 'The quick\nbrown fox',
|
|
98
|
+
output: 'The quick brown fox',
|
|
99
|
+
});
|
|
100
|
+
expect(result.score).toBeLessThan(1);
|
|
101
|
+
expect(result.metrics!.changes).toBeGreaterThan(0);
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
it('should include difference details in result', async () => {
|
|
105
|
+
const result = await metric.measure({
|
|
106
|
+
input: 'The quick brown fox',
|
|
107
|
+
output: 'The quick brown fox',
|
|
108
|
+
});
|
|
109
|
+
expect(result.details).toBe('Difference score: 100.0% with 0 changes');
|
|
110
|
+
expect(result.metrics!).toEqual({
|
|
111
|
+
ratio: 1,
|
|
112
|
+
changes: 0,
|
|
113
|
+
lengthDiff: 0,
|
|
114
|
+
});
|
|
115
|
+
});
|
|
116
|
+
});
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { Metric } from '@mastra/core';
|
|
2
|
+
import { SequenceMatcher } from 'difflib';
|
|
3
|
+
|
|
4
|
+
import { MetricScoringResult } from '../types';
|
|
5
|
+
|
|
6
|
+
export class DifferenceMetric extends Metric {
|
|
7
|
+
async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
|
|
8
|
+
const matcher = new SequenceMatcher(null, input, output);
|
|
9
|
+
const ratio = matcher.ratio();
|
|
10
|
+
|
|
11
|
+
// Get detailed operations
|
|
12
|
+
const ops = matcher.getOpcodes();
|
|
13
|
+
const changes = ops.filter(([op]) => op !== 'equal').length;
|
|
14
|
+
|
|
15
|
+
// Calculate confidence based on text length difference
|
|
16
|
+
const maxLength = Math.max(input.length, output.length);
|
|
17
|
+
const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
|
|
18
|
+
const confidence = 1 - lengthDiff;
|
|
19
|
+
|
|
20
|
+
return {
|
|
21
|
+
score: ratio,
|
|
22
|
+
details: `Difference score: ${(ratio * 100).toFixed(1)}% with ${changes} changes`,
|
|
23
|
+
confidence,
|
|
24
|
+
metrics: {
|
|
25
|
+
ratio,
|
|
26
|
+
changes,
|
|
27
|
+
lengthDiff,
|
|
28
|
+
},
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Barrel file: re-exports every evaluation metric so consumers can import
// them from a single entry point instead of per-metric paths.
export { AnswerRelevancyMetric } from './answer-relevancy';
export { CompletenessMetric } from './completeness';
export { ContentSimilarityMetric } from './content-similarity';
export { ContextPositionMetric } from './context-position';
export { ContextPrecisionMetric } from './context-precision';
export { DifferenceMetric } from './difference';
export { KeywordCoverageMetric } from './keyword-coverage';
export { PromptAlignmentMetric } from './prompt-alignment';
export { ToneConsistencyMetric } from './tone';
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { it, expect } from '@jest/globals';
|
|
2
|
+
|
|
3
|
+
import { KeywordCoverageMetric } from './index';
|
|
4
|
+
|
|
5
|
+
describe('KeywordCoverageMetric', () => {
|
|
6
|
+
const metric = new KeywordCoverageMetric();
|
|
7
|
+
|
|
8
|
+
it('should return perfect coverage for identical text', async () => {
|
|
9
|
+
const result = await metric.measure({
|
|
10
|
+
input: 'The quick brown fox jumps over the lazy dog',
|
|
11
|
+
output: 'The quick brown fox jumps over the lazy dog',
|
|
12
|
+
});
|
|
13
|
+
expect(result.score).toBe(1);
|
|
14
|
+
expect(result.confidence).toBe(0.85);
|
|
15
|
+
const matched = result.metrics?.matchedKeywords as number;
|
|
16
|
+
const total = result.metrics?.totalKeywords as number;
|
|
17
|
+
expect(matched).toBeGreaterThan(0);
|
|
18
|
+
expect(total).toBeGreaterThan(0);
|
|
19
|
+
expect(matched).toBe(total);
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
it('should handle partial keyword coverage', async () => {
|
|
23
|
+
const result = await metric.measure({
|
|
24
|
+
input: 'The quick brown fox jumps over the lazy dog',
|
|
25
|
+
output: 'A quick brown fox runs past a sleeping cat',
|
|
26
|
+
});
|
|
27
|
+
expect(result.score).toBeGreaterThan(0.3);
|
|
28
|
+
expect(result.score).toBeLessThan(0.7);
|
|
29
|
+
const matched = result.metrics?.matchedKeywords as number;
|
|
30
|
+
const total = result.metrics?.totalKeywords as number;
|
|
31
|
+
expect(matched).toBeLessThan(total);
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
it('should ignore common words and stop words', async () => {
|
|
35
|
+
const result = await metric.measure({
|
|
36
|
+
input: 'The quick brown fox',
|
|
37
|
+
output: 'A quick brown fox',
|
|
38
|
+
});
|
|
39
|
+
expect(result.score).toBe(1); // "the" and "a" should be ignored
|
|
40
|
+
const matched = result.metrics?.matchedKeywords as number;
|
|
41
|
+
const total = result.metrics?.totalKeywords as number;
|
|
42
|
+
expect(matched).toBe(total);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it('should handle case differences', async () => {
|
|
46
|
+
const result = await metric.measure({
|
|
47
|
+
input: 'The Quick Brown Fox',
|
|
48
|
+
output: 'the quick brown fox',
|
|
49
|
+
});
|
|
50
|
+
expect(result.score).toBe(1);
|
|
51
|
+
const matched = result.metrics?.matchedKeywords as number;
|
|
52
|
+
const total = result.metrics?.totalKeywords as number;
|
|
53
|
+
expect(matched).toBe(total);
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
it('should handle empty strings', async () => {
|
|
57
|
+
const result = await metric.measure({
|
|
58
|
+
input: '',
|
|
59
|
+
output: '',
|
|
60
|
+
});
|
|
61
|
+
expect(result.score).toBe(1);
|
|
62
|
+
expect(result.metrics?.totalKeywords).toBe(0);
|
|
63
|
+
expect(result.metrics?.matchedKeywords).toBe(0);
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
it('should handle one empty string', async () => {
|
|
67
|
+
const result = await metric.measure({
|
|
68
|
+
input: 'The quick brown fox',
|
|
69
|
+
output: '',
|
|
70
|
+
});
|
|
71
|
+
expect(result.score).toBe(0);
|
|
72
|
+
expect(result.metrics?.matchedKeywords).toBe(0);
|
|
73
|
+
expect(result.metrics?.totalKeywords).toBeGreaterThan(0);
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
it('should ignore numbers by default', async () => {
|
|
77
|
+
const result = await metric.measure({
|
|
78
|
+
input: 'The 123 quick 456 brown fox',
|
|
79
|
+
output: 'The quick brown fox',
|
|
80
|
+
});
|
|
81
|
+
expect(result.score).toBe(1);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('should handle special characters', async () => {
|
|
85
|
+
const result = await metric.measure({
|
|
86
|
+
input: 'The quick-brown fox!',
|
|
87
|
+
output: 'The quick brown fox',
|
|
88
|
+
});
|
|
89
|
+
// Hyphenated words are treated as separate keywords
|
|
90
|
+
expect(result.score).toBeGreaterThanOrEqual(0.5);
|
|
91
|
+
expect(result.score).toBeLessThan(1);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it('should handle completely different content', async () => {
|
|
95
|
+
const result = await metric.measure({
|
|
96
|
+
input: 'The quick brown fox jumps over the lazy dog',
|
|
97
|
+
output: 'Lorem ipsum dolor sit amet',
|
|
98
|
+
});
|
|
99
|
+
expect(result.score).toBe(0);
|
|
100
|
+
expect(result.metrics?.matchedKeywords).toBe(0);
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
it('should include coverage details in result', async () => {
|
|
104
|
+
const result = await metric.measure({
|
|
105
|
+
input: 'quick brown fox',
|
|
106
|
+
output: 'quick brown fox',
|
|
107
|
+
});
|
|
108
|
+
expect(result.details).toMatch(/Keyword coverage: 100.0% \(3\/3 keywords\)/);
|
|
109
|
+
expect(result.metrics).toEqual({
|
|
110
|
+
totalKeywords: 3,
|
|
111
|
+
matchedKeywords: 3,
|
|
112
|
+
});
|
|
113
|
+
});
|
|
114
|
+
});
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { Metric } from '@mastra/core';
|
|
2
|
+
import keyword_extractor from 'keyword-extractor';
|
|
3
|
+
|
|
4
|
+
import { MetricScoringResult } from '../types';
|
|
5
|
+
|
|
6
|
+
export class KeywordCoverageMetric extends Metric {
|
|
7
|
+
async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
|
|
8
|
+
// Handle empty strings case
|
|
9
|
+
if (!input && !output) {
|
|
10
|
+
return {
|
|
11
|
+
score: 1,
|
|
12
|
+
details: 'Keyword coverage: 100.0% (0/0 keywords)',
|
|
13
|
+
confidence: 0.85,
|
|
14
|
+
metrics: {
|
|
15
|
+
totalKeywords: 0,
|
|
16
|
+
matchedKeywords: 0,
|
|
17
|
+
},
|
|
18
|
+
};
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
const extractKeywords = (text: string) => {
|
|
22
|
+
return keyword_extractor.extract(text, {
|
|
23
|
+
language: 'english',
|
|
24
|
+
remove_digits: true,
|
|
25
|
+
return_changed_case: true,
|
|
26
|
+
remove_duplicates: true,
|
|
27
|
+
});
|
|
28
|
+
};
|
|
29
|
+
|
|
30
|
+
const referenceKeywords = new Set(extractKeywords(input));
|
|
31
|
+
const responseKeywords = new Set(extractKeywords(output));
|
|
32
|
+
|
|
33
|
+
const matchedKeywords = [...referenceKeywords].filter(k => responseKeywords.has(k));
|
|
34
|
+
const totalKeywords = referenceKeywords.size;
|
|
35
|
+
const coverage = totalKeywords > 0 ? matchedKeywords.length / totalKeywords : 0;
|
|
36
|
+
|
|
37
|
+
return {
|
|
38
|
+
score: coverage,
|
|
39
|
+
details: `Keyword coverage: ${(coverage * 100).toFixed(1)}% (${matchedKeywords.length}/${referenceKeywords.size} keywords)`,
|
|
40
|
+
confidence: 0.85,
|
|
41
|
+
metrics: {
|
|
42
|
+
totalKeywords: referenceKeywords.size,
|
|
43
|
+
matchedKeywords: matchedKeywords.length,
|
|
44
|
+
},
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { it, expect, jest } from '@jest/globals';
import { type ModelConfig } from '@mastra/core';

import { PromptAlignmentMetric } from './index';

// LLM-backed tests are slow; allow up to 15 seconds per test.
const SECONDS = 1000;
jest.setTimeout(15 * SECONDS);

// Live model configuration — NOTE(review): these tests appear to call the
// real OpenAI API and require OPENAI_API_KEY in the environment; confirm
// before running in CI.
const modelConfig: ModelConfig = {
  provider: 'OPEN_AI',
  name: 'gpt-4o',
  toolChoice: 'auto',
  apiKey: process.env.OPENAI_API_KEY,
};

// With a single instruction, the score is all-or-nothing: 10 when the
// output follows it, 0 when it does not.
it('should be able to measure prompt alignment', async () => {
  const metric = new PromptAlignmentMetric(modelConfig, {
    instructions: ['Reply in all uppercase'],
  });

  const result = await metric.measure({
    input: `What if these shoes don't fit?`,
    output: 'We offer a 30-day full refund at no extra cost.',
  });

  const resultUppercase = await metric.measure({
    input: `What if these shoes don't fit?`,
    output: 'We offer a 30-day full refund at no extra cost.'.toUpperCase(),
  });

  expect(resultUppercase.score).toBe(10);
  expect(result.score).toBe(0);
});

// Multiple instructions must all be satisfied for a perfect score.
it('should be able to measure prompt alignment with an array of instructions', async () => {
  const metric = new PromptAlignmentMetric(modelConfig, {
    instructions: ['Reply in all uppercase', 'Include baguettes in the response'],
  });

  const result = await metric.measure({
    input: `What is the capital of France?`,
    output: 'THE CAPITAL OF FRANCE IS BAGUETTE.',
  });

  expect(result.score).toBe(10);
});
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { Metric, MetricResult, ModelConfig } from '@mastra/core';
|
|
2
|
+
|
|
3
|
+
import { PromptAlignmentJudge } from './metricJudge';
|
|
4
|
+
|
|
5
|
+
export class PromptAlignmentMetric extends Metric {
|
|
6
|
+
private instructions: string[];
|
|
7
|
+
private judge: PromptAlignmentJudge;
|
|
8
|
+
private scale: number;
|
|
9
|
+
|
|
10
|
+
constructor(model: ModelConfig, { instructions, scale = 10 }: { instructions: string[]; scale?: number }) {
|
|
11
|
+
super();
|
|
12
|
+
|
|
13
|
+
this.instructions = instructions;
|
|
14
|
+
this.judge = new PromptAlignmentJudge(model);
|
|
15
|
+
this.scale = scale;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
async measure({ input, output }: { input: string; output: string }): Promise<MetricResult> {
|
|
19
|
+
const verdicts = await this.judge.evaluate(input, output, this.instructions);
|
|
20
|
+
const score = this.calculateScore(verdicts);
|
|
21
|
+
|
|
22
|
+
const reason = await this.generateReason(input, output, score, verdicts);
|
|
23
|
+
|
|
24
|
+
return {
|
|
25
|
+
score,
|
|
26
|
+
reason,
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
private async generateReason(
|
|
31
|
+
input: string,
|
|
32
|
+
output: string,
|
|
33
|
+
score: number,
|
|
34
|
+
verdicts: {
|
|
35
|
+
verdict: string;
|
|
36
|
+
reason: string;
|
|
37
|
+
}[],
|
|
38
|
+
): Promise<string> {
|
|
39
|
+
const reasonsForVerdicts: string[] = [];
|
|
40
|
+
for (const { verdict, reason } of verdicts || []) {
|
|
41
|
+
if (verdict.trim().toLowerCase() === 'no') {
|
|
42
|
+
reasonsForVerdicts.push(reason);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
const reason = await this.judge.getReason(input, output, score, reasonsForVerdicts);
|
|
47
|
+
return reason;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
private calculateScore(evaluation: { verdict: string; reason: string }[]): number {
|
|
51
|
+
const numberOfVerdicts = evaluation?.length || 0;
|
|
52
|
+
if (numberOfVerdicts === 0) {
|
|
53
|
+
return 1;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
let alignmentCount = 0;
|
|
57
|
+
for (const { verdict } of evaluation!) {
|
|
58
|
+
if (verdict.trim().toLowerCase() !== 'no') {
|
|
59
|
+
alignmentCount++;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
const score = alignmentCount / numberOfVerdicts;
|
|
64
|
+
return score * this.scale;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { ModelConfig } from '@mastra/core';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
import { MastraAgentJudge } from '../../judge';
|
|
5
|
+
|
|
6
|
+
import { generateEvaluatePrompt, generateReasonPrompt, PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS } from './prompts';
|
|
7
|
+
|
|
8
|
+
export class PromptAlignmentJudge extends MastraAgentJudge {
|
|
9
|
+
constructor(model: ModelConfig) {
|
|
10
|
+
super('Prompt Alignment', PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model);
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
async evaluate(
|
|
14
|
+
input: string,
|
|
15
|
+
actualOutput: string,
|
|
16
|
+
instructions: string[],
|
|
17
|
+
): Promise<{ verdict: string; reason: string }[]> {
|
|
18
|
+
const prompt = generateEvaluatePrompt({ input, output: actualOutput, instructions });
|
|
19
|
+
const result = await this.agent.generate(prompt, {
|
|
20
|
+
output: z.object({
|
|
21
|
+
verdicts: z.array(
|
|
22
|
+
z.object({
|
|
23
|
+
verdict: z.string(),
|
|
24
|
+
reason: z.string(),
|
|
25
|
+
}),
|
|
26
|
+
),
|
|
27
|
+
}),
|
|
28
|
+
});
|
|
29
|
+
return result.object.verdicts;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
async getReason(input: string, actualOutput: string, score: number, reasons: string[]): Promise<string> {
|
|
33
|
+
const prompt = generateReasonPrompt({ input, output: actualOutput, reasons, score });
|
|
34
|
+
const result = await this.agent.generate(prompt, {
|
|
35
|
+
output: z.object({
|
|
36
|
+
reason: z.string(),
|
|
37
|
+
}),
|
|
38
|
+
});
|
|
39
|
+
return result.object.reason;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
 * System instructions for the prompt-alignment judge agent: a strict
 * evaluator that marks each instruction "yes" only on complete compliance.
 */
export const PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = `You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.

Key Principles:
1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.
2. Only give a "yes" verdict if an instruction is COMPLETELY followed
3. Any partial compliance should be marked as "no"
4. Provide clear, specific reasons for any "no" verdicts
5. Focus solely on instruction compliance, not output quality

Remember:
- Each instruction must be evaluated independently
- Verdicts must be either "yes" or "no" - no in-between
- Reasons are required only for "no" verdicts
- The number of verdicts must match the number of instructions exactly`;

/**
 * Builds the evaluation prompt: asks the judge for one strict yes/no
 * verdict per instruction, with a reason for each verdict.
 *
 * @param instructions - The instructions the output must follow.
 * @param input - The original user input.
 * @param output - The LLM output to evaluate.
 * @returns The fully interpolated prompt string.
 */
export function generateEvaluatePrompt({
  instructions,
  input,
  output,
}: {
  instructions: string[];
  input: string;
  output: string;
}) {
  return `For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.
Make sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.
Generate a list of verdicts in JSON format, where each verdict must have:
- "verdict": Strictly "yes" or "no"
- "reason": Give a reason for the verdict

Be EXTRA STRICT in your evaluation. Only give "yes" if the instruction is followed COMPLETELY.
Evaluate the output EXACTLY as written - consider every character, space, and case

Example:
Input: "describe the sky"
Output: "the sky is Blue today"
Instructions: ["Start sentences with capital letters", "Use proper English"]

{
    "verdicts": [
        {
            "verdict": "no",
            "reason": "The sentence 'the sky is Blue' starts with lowercase 't'"
        },
        {
            "verdict": "no",
            "reason": "Improper capitalization: 'Blue' is capitalized mid-sentence"
        }
    ]
}

Prompt Instructions:
${instructions.join('\n')}

Input:
${input}

LLM Actual Output:
${output}

JSON:`;
}

/**
 * Builds the prompt asking the judge to explain a 0-10 alignment score,
 * given the reasons attached to failing verdicts.
 *
 * @param input - The original user input.
 * @param output - The LLM output that was evaluated.
 * @param score - The already-computed alignment score.
 * @param reasons - Reasons from the failing ("no") verdicts.
 * @returns The fully interpolated prompt string.
 */
export function generateReasonPrompt({
  input,
  output,
  score,
  reasons,
}: {
  input: string;
  output: string;
  score: number;
  reasons: string[];
}) {
  return `Explain the instruction following score (0-10) for the LLM's response using this context:
Context:
Input: ${input}
Output: ${output}
Score: ${score}
Failure Reasons: ${reasons.join('\n')}

Rules (follow these rules exactly. do not deviate):
- Keep your response concise and to the point.
- Do not change score from what is given.
- Do not make judgements on inputs or outputs (factual correctness, quality, etc).
- If there are failure reasons given, explain why the score is not higher.


Output format:
{
  "reason": "The score is {score} because {explanation of instruction following}"
}

Example Responses:
{
  "reason": "The score is 10 because the output follows the instructions exactly"
}
{
  "reason": "The score is 0 because the output does not follow the instructions"
}
`;
}