@mastra/evals 0.0.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/LICENSE +44 -0
- package/dist/evals.cjs.development.js +1510 -0
- package/dist/evals.cjs.development.js.map +1 -0
- package/dist/evals.cjs.production.min.js +2 -0
- package/dist/evals.cjs.production.min.js.map +1 -0
- package/dist/evals.esm.js +1497 -0
- package/dist/evals.esm.js.map +1 -0
- package/dist/evaluation.d.ts +3 -0
- package/dist/evaluation.d.ts.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/judge/index.d.ts +6 -0
- package/dist/judge/index.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/index.d.ts +17 -0
- package/dist/metrics/answer-relevancy/index.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/metricJudge.d.ts +11 -0
- package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -0
- package/dist/metrics/answer-relevancy/prompts.d.ts +15 -0
- package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -0
- package/dist/metrics/completeness/index.d.ts +12 -0
- package/dist/metrics/completeness/index.d.ts.map +1 -0
- package/dist/metrics/content-similarity/index.d.ts +11 -0
- package/dist/metrics/content-similarity/index.d.ts.map +1 -0
- package/dist/metrics/context-position/index.d.ts +15 -0
- package/dist/metrics/context-position/index.d.ts.map +1 -0
- package/dist/metrics/context-position/metricJudge.d.ts +14 -0
- package/dist/metrics/context-position/metricJudge.d.ts.map +1 -0
- package/dist/metrics/context-position/prompts.d.ts +16 -0
- package/dist/metrics/context-position/prompts.d.ts.map +1 -0
- package/dist/metrics/context-precision/index.d.ts +15 -0
- package/dist/metrics/context-precision/index.d.ts.map +1 -0
- package/dist/metrics/context-precision/metricJudge.d.ts +15 -0
- package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -0
- package/dist/metrics/context-precision/prompts.d.ts +16 -0
- package/dist/metrics/context-precision/prompts.d.ts.map +1 -0
- package/dist/metrics/difference/index.d.ts +9 -0
- package/dist/metrics/difference/index.d.ts.map +1 -0
- package/dist/metrics/index.d.ts +10 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/keyword-coverage/index.d.ts +9 -0
- package/dist/metrics/keyword-coverage/index.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/index.d.ts +17 -0
- package/dist/metrics/prompt-alignment/index.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/metricJudge.d.ts +11 -0
- package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -0
- package/dist/metrics/prompt-alignment/prompts.d.ts +13 -0
- package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -0
- package/dist/metrics/tone/index.d.ts +10 -0
- package/dist/metrics/tone/index.d.ts.map +1 -0
- package/dist/metrics/types.d.ts +12 -0
- package/dist/metrics/types.d.ts.map +1 -0
- package/jest.config.ts +19 -0
- package/package.json +51 -0
- package/src/evaluation.test.ts +32 -0
- package/src/evaluation.ts +20 -0
- package/src/index.ts +2 -0
- package/src/judge/index.ts +13 -0
- package/src/metrics/answer-relevancy/index.test.ts +193 -0
- package/src/metrics/answer-relevancy/index.ts +80 -0
- package/src/metrics/answer-relevancy/metricJudge.ts +49 -0
- package/src/metrics/answer-relevancy/prompts.ts +179 -0
- package/src/metrics/completeness/index.test.ts +96 -0
- package/src/metrics/completeness/index.ts +112 -0
- package/src/metrics/content-similarity/index.test.ts +107 -0
- package/src/metrics/content-similarity/index.ts +41 -0
- package/src/metrics/context-position/index.test.ts +292 -0
- package/src/metrics/context-position/index.ts +63 -0
- package/src/metrics/context-position/metricJudge.ts +54 -0
- package/src/metrics/context-position/prompts.ts +123 -0
- package/src/metrics/context-precision/index.test.ts +249 -0
- package/src/metrics/context-precision/index.ts +62 -0
- package/src/metrics/context-precision/metricJudge.ts +55 -0
- package/src/metrics/context-precision/prompts.ts +111 -0
- package/src/metrics/difference/index.test.ts +116 -0
- package/src/metrics/difference/index.ts +31 -0
- package/src/metrics/index.ts +9 -0
- package/src/metrics/keyword-coverage/index.test.ts +114 -0
- package/src/metrics/keyword-coverage/index.ts +47 -0
- package/src/metrics/prompt-alignment/index.test.ts +46 -0
- package/src/metrics/prompt-alignment/index.ts +66 -0
- package/src/metrics/prompt-alignment/metricJudge.ts +41 -0
- package/src/metrics/prompt-alignment/prompts.ts +102 -0
- package/src/metrics/tone/index.test.ts +123 -0
- package/src/metrics/tone/index.ts +47 -0
- package/src/metrics/types.ts +13 -0
- package/tsconfig.json +10 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { it, expect } from '@jest/globals';
|
|
2
|
+
|
|
3
|
+
import { ToneConsistencyMetric } from './index';
|
|
4
|
+
|
|
5
|
+
/**
 * Behavioural tests for ToneConsistencyMetric.
 *
 * A non-empty `output` exercises the comparison branch; an empty `output`
 * exercises the per-sentence stability branch of `measure`.
 */
describe('ToneConsistencyMetric', () => {
  const metric = new ToneConsistencyMetric();

  // Runs the metric once and exposes its `metrics` bag as plain numbers.
  const compareTone = async (input: string, output: string) => {
    const result = await metric.measure({ input, output });
    const metrics = result.metrics as { [key: string]: number };
    return { result, metrics };
  };

  // Stability mode is selected by passing an empty `output`.
  const toneStability = (input: string) => compareTone(input, '');

  describe('tone consistency (with reference)', () => {
    it('should return perfect score for identical sentiment', async () => {
      const { result, metrics } = await compareTone(
        'I love this amazing product!',
        'This product is wonderful and fantastic!',
      );

      expect(result.score).toBeGreaterThan(0.9);
      expect(result.confidence).toBe(0.75);
      expect(metrics.responseSentiment!).toBeGreaterThan(0);
      expect(metrics.referenceSentiment!).toBeGreaterThan(0);
      expect(metrics.difference!).toBeLessThan(0.1);
    });

    it('should handle opposite sentiments', async () => {
      const { result, metrics } = await compareTone(
        'This is terrible and disappointing.',
        'This is excellent and amazing!',
      );

      expect(result.score).toBeLessThan(0.5);
      expect(metrics.responseSentiment!).toBeLessThan(0);
      expect(metrics.referenceSentiment!).toBeGreaterThan(0);
      expect(metrics.difference!).toBeGreaterThan(0.5);
    });

    it('should handle neutral text', async () => {
      const { result, metrics } = await compareTone(
        'The sky is blue. The grass is green.',
        'Trees are tall. Water is wet.',
      );

      expect(result.score).toBeGreaterThan(0.9);
      expect(Math.abs(metrics.responseSentiment!)).toBeLessThan(0.2);
      expect(Math.abs(metrics.referenceSentiment!)).toBeLessThan(0.2);
      expect(metrics.difference!).toBeLessThan(0.1);
    });

    it('should handle mixed sentiment comparison', async () => {
      const { result, metrics } = await compareTone(
        'The product has great features but some annoying bugs.',
        'While the interface is beautiful, performance is poor.',
      );

      expect(result.score).toBeGreaterThan(0.7);
      expect(Math.abs(metrics.difference!)).toBeLessThan(0.3);
    });
  });

  describe('tone stability (single input)', () => {
    it('should handle consistent positive tone', async () => {
      const { result, metrics } = await toneStability(
        'I love this product! It works amazingly well. The features are fantastic.',
      );

      expect(result.score).toBeGreaterThan(0.8);
      expect(result.confidence).toBe(0.7);
      expect(metrics.avgSentiment!).toBeGreaterThan(0);
      expect(metrics.sentimentVariance!).toBeLessThan(0.2);
    });

    it('should handle consistent negative tone', async () => {
      const { result, metrics } = await toneStability(
        'This is terrible. It never works properly. The support is awful.',
      );

      expect(result.score).toBeGreaterThan(0.8);
      expect(metrics.avgSentiment!).toBeLessThan(0);
      expect(metrics.sentimentVariance!).toBeLessThan(0.2);
    });

    it('should detect inconsistent tone', async () => {
      const { result, metrics } = await toneStability(
        'This is amazing! But it has terrible flaws. Yet somehow I love it. Though it frustrates me.',
      );

      expect(result.score).toBeLessThan(0.7);
      expect(metrics.sentimentVariance!).toBeGreaterThan(0.2);
    });

    it('should handle single sentence', async () => {
      const { result, metrics } = await toneStability('This is a great product.');

      expect(result.score).toBe(1);
      expect(metrics.sentimentVariance!).toBe(0);
    });

    it('should handle empty input', async () => {
      const { result, metrics } = await toneStability('');

      expect(result.score).toBe(1);
      expect(metrics.avgSentiment!).toBe(0);
      expect(metrics.sentimentVariance!).toBe(0);
    });

    it('should handle neutral consistent tone', async () => {
      const { result, metrics } = await toneStability(
        'The sky is blue. The grass is green. The tree is tall.',
      );

      expect(result.score).toBeGreaterThan(0.9);
      expect(Math.abs(metrics.avgSentiment!)).toBeLessThan(0.2);
      expect(metrics.sentimentVariance!).toBeLessThan(0.1);
    });
  });
});
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { Metric } from '@mastra/core';
|
|
2
|
+
import Sentiment from 'sentiment';
|
|
3
|
+
|
|
4
|
+
import { MetricScoringResult } from '../types';
|
|
5
|
+
|
|
6
|
+
/**
 * Scores tone using the `comparative` value reported by the `sentiment` analyser.
 *
 * `measure` has two modes, selected by whether `output` is non-empty:
 * - comparison: whole-text sentiment of `input` versus `output`;
 * - stability: how much sentiment varies between the sentences of `input`.
 */
export class ToneConsistencyMetric extends Metric {
  private sentiment = new Sentiment();

  /**
   * Computes a tone score in the range 0..1.
   *
   * @param input - Text whose sentiment is reported as `responseSentiment` in
   *   comparison mode, and the text that is split into sentences in stability mode.
   * @param output - Text whose sentiment is reported as `referenceSentiment`.
   *   An empty string selects stability mode.
   * @returns In comparison mode: `1 - |difference|` clamped at 0, confidence 0.75,
   *   and the two sentiments plus their absolute difference in `metrics`.
   *   In stability mode: `1 - variance` clamped at 0, confidence 0.7, and the
   *   mean sentiment plus its population variance in `metrics`.
   */
  async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
    if (output) {
      // Compare sentiment with reference. The whole-text analysis of `input`
      // is only needed on this path, so it is not computed for stability mode.
      const responseSentiment = this.sentiment.analyze(input).comparative;
      const referenceSentiment = this.sentiment.analyze(output).comparative;
      const sentimentDiff = Math.abs(responseSentiment - referenceSentiment);
      const normalizedScore = Math.max(0, 1 - sentimentDiff);

      return {
        score: normalizedScore,
        details: `Tone consistency: ${(normalizedScore * 100).toFixed(1)}%`,
        confidence: 0.75,
        metrics: {
          responseSentiment,
          referenceSentiment,
          difference: sentimentDiff,
        },
      };
    }

    // Evaluate sentiment stability across the sentences of `input`.
    const sentiments = this.splitSentences(input).map(s => this.sentiment.analyze(s).comparative);
    const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
    // Population variance (divides by N, not N - 1).
    const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
    const stability = Math.max(0, 1 - variance);

    return {
      score: stability,
      details: `Tone stability: ${(stability * 100).toFixed(1)}%`,
      confidence: 0.7,
      metrics: {
        avgSentiment,
        sentimentVariance: variance,
      },
    };
  }

  /**
   * Splits text into sentence-like segments on `.`, `!` and `?`.
   *
   * The terminator is optional so that a trailing fragment without closing
   * punctuation (e.g. "Great. awful") is kept as its own segment instead of
   * being silently dropped. Whitespace-only segments are discarded. The result
   * is never empty: when nothing usable is found the whole text is returned as
   * a single segment, which keeps the mean/variance divisions well-defined.
   */
  private splitSentences(text: string): string[] {
    const segments = (text.match(/[^.!?]+[.!?]*/g) || []).filter(segment => segment.trim().length > 0);
    return segments.length > 0 ? segments : [text];
  }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/** Result returned by a metric's scoring call. */
export interface MetricScoringResult {
  /** Normalized score between 0 and 1. */
  score: number;
  /** Optional weight for this result; how it is consumed is not shown in this file. */
  weight?: number;
  /** Human-readable explanation of the score. */
  details: string;
  /** Confidence level between 0 and 1. */
  confidence: number;
  /** Optional extra measurements keyed by name: numbers, booleans, or nested records. */
  metrics?: Record<string, number | boolean | Record<string, any>>;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Optional switches for metrics.
 *
 * NOTE(review): no consumer of these flags is visible here; the descriptions
 * below are inferred from the names and should be confirmed against the metrics
 * that read them.
 */
export interface MetricOptions {
  /** Presumably makes comparisons case-insensitive when true. */
  ignoreCase?: boolean;
  /** Presumably makes comparisons whitespace-insensitive when true. */
  ignoreWhitespace?: boolean;
  // Add more options as needed
}
|