@mastra/evals 0.0.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/CHANGELOG.md +10 -0
  2. package/LICENSE +44 -0
  3. package/dist/evals.cjs.development.js +1510 -0
  4. package/dist/evals.cjs.development.js.map +1 -0
  5. package/dist/evals.cjs.production.min.js +2 -0
  6. package/dist/evals.cjs.production.min.js.map +1 -0
  7. package/dist/evals.esm.js +1497 -0
  8. package/dist/evals.esm.js.map +1 -0
  9. package/dist/evaluation.d.ts +3 -0
  10. package/dist/evaluation.d.ts.map +1 -0
  11. package/dist/index.d.ts +3 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +8 -0
  14. package/dist/judge/index.d.ts +6 -0
  15. package/dist/judge/index.d.ts.map +1 -0
  16. package/dist/metrics/answer-relevancy/index.d.ts +17 -0
  17. package/dist/metrics/answer-relevancy/index.d.ts.map +1 -0
  18. package/dist/metrics/answer-relevancy/metricJudge.d.ts +11 -0
  19. package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -0
  20. package/dist/metrics/answer-relevancy/prompts.d.ts +15 -0
  21. package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -0
  22. package/dist/metrics/completeness/index.d.ts +12 -0
  23. package/dist/metrics/completeness/index.d.ts.map +1 -0
  24. package/dist/metrics/content-similarity/index.d.ts +11 -0
  25. package/dist/metrics/content-similarity/index.d.ts.map +1 -0
  26. package/dist/metrics/context-position/index.d.ts +15 -0
  27. package/dist/metrics/context-position/index.d.ts.map +1 -0
  28. package/dist/metrics/context-position/metricJudge.d.ts +14 -0
  29. package/dist/metrics/context-position/metricJudge.d.ts.map +1 -0
  30. package/dist/metrics/context-position/prompts.d.ts +16 -0
  31. package/dist/metrics/context-position/prompts.d.ts.map +1 -0
  32. package/dist/metrics/context-precision/index.d.ts +15 -0
  33. package/dist/metrics/context-precision/index.d.ts.map +1 -0
  34. package/dist/metrics/context-precision/metricJudge.d.ts +15 -0
  35. package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -0
  36. package/dist/metrics/context-precision/prompts.d.ts +16 -0
  37. package/dist/metrics/context-precision/prompts.d.ts.map +1 -0
  38. package/dist/metrics/difference/index.d.ts +9 -0
  39. package/dist/metrics/difference/index.d.ts.map +1 -0
  40. package/dist/metrics/index.d.ts +10 -0
  41. package/dist/metrics/index.d.ts.map +1 -0
  42. package/dist/metrics/keyword-coverage/index.d.ts +9 -0
  43. package/dist/metrics/keyword-coverage/index.d.ts.map +1 -0
  44. package/dist/metrics/prompt-alignment/index.d.ts +17 -0
  45. package/dist/metrics/prompt-alignment/index.d.ts.map +1 -0
  46. package/dist/metrics/prompt-alignment/metricJudge.d.ts +11 -0
  47. package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -0
  48. package/dist/metrics/prompt-alignment/prompts.d.ts +13 -0
  49. package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -0
  50. package/dist/metrics/tone/index.d.ts +10 -0
  51. package/dist/metrics/tone/index.d.ts.map +1 -0
  52. package/dist/metrics/types.d.ts +12 -0
  53. package/dist/metrics/types.d.ts.map +1 -0
  54. package/jest.config.ts +19 -0
  55. package/package.json +51 -0
  56. package/src/evaluation.test.ts +32 -0
  57. package/src/evaluation.ts +20 -0
  58. package/src/index.ts +2 -0
  59. package/src/judge/index.ts +13 -0
  60. package/src/metrics/answer-relevancy/index.test.ts +193 -0
  61. package/src/metrics/answer-relevancy/index.ts +80 -0
  62. package/src/metrics/answer-relevancy/metricJudge.ts +49 -0
  63. package/src/metrics/answer-relevancy/prompts.ts +179 -0
  64. package/src/metrics/completeness/index.test.ts +96 -0
  65. package/src/metrics/completeness/index.ts +112 -0
  66. package/src/metrics/content-similarity/index.test.ts +107 -0
  67. package/src/metrics/content-similarity/index.ts +41 -0
  68. package/src/metrics/context-position/index.test.ts +292 -0
  69. package/src/metrics/context-position/index.ts +63 -0
  70. package/src/metrics/context-position/metricJudge.ts +54 -0
  71. package/src/metrics/context-position/prompts.ts +123 -0
  72. package/src/metrics/context-precision/index.test.ts +249 -0
  73. package/src/metrics/context-precision/index.ts +62 -0
  74. package/src/metrics/context-precision/metricJudge.ts +55 -0
  75. package/src/metrics/context-precision/prompts.ts +111 -0
  76. package/src/metrics/difference/index.test.ts +116 -0
  77. package/src/metrics/difference/index.ts +31 -0
  78. package/src/metrics/index.ts +9 -0
  79. package/src/metrics/keyword-coverage/index.test.ts +114 -0
  80. package/src/metrics/keyword-coverage/index.ts +47 -0
  81. package/src/metrics/prompt-alignment/index.test.ts +46 -0
  82. package/src/metrics/prompt-alignment/index.ts +66 -0
  83. package/src/metrics/prompt-alignment/metricJudge.ts +41 -0
  84. package/src/metrics/prompt-alignment/prompts.ts +102 -0
  85. package/src/metrics/tone/index.test.ts +123 -0
  86. package/src/metrics/tone/index.ts +47 -0
  87. package/src/metrics/types.ts +13 -0
  88. package/tsconfig.json +10 -0
@@ -0,0 +1,96 @@
1
+ import { CompletenessMetric } from './index';
2
+
3
+ describe('CompletenessMetric', () => {
4
+ let metric: CompletenessMetric;
5
+
6
+ beforeEach(() => {
7
+ metric = new CompletenessMetric();
8
+ });
9
+
10
+ describe('basic functionality', () => {
11
+ it('should return high score for identical text', async () => {
12
+ const text = 'The quick brown fox jumps over the lazy dog';
13
+ const result = await metric.measure({ input: text, output: text });
14
+
15
+ expect(result.score).toBeCloseTo(1.0);
16
+ expect(result.confidence).toBe(0.8);
17
+ });
18
+
19
+ it('should return lower score for simplified text missing elements', async () => {
20
+ const original = 'The quick brown fox jumps over the lazy dog';
21
+ const simplified = 'The fox jumps over the dog';
22
+ const result = await metric.measure({ input: original, output: simplified });
23
+
24
+ expect(result.score).toBeLessThan(1.0);
25
+ expect(result.score).toBeGreaterThan(0.5);
26
+ expect(result.metrics?.missingElements).toContain('brown');
27
+ expect(result.metrics?.missingElements).toContain('lazy');
28
+ });
29
+
30
+ it('should handle completely different texts', async () => {
31
+ const original = 'The weather is sunny today';
32
+ const simplified = 'I like to eat pizza';
33
+ const result = await metric.measure({ input: original, output: simplified });
34
+
35
+ expect(result.score).toBeLessThan(0.3);
36
+ const { input, output } = result.metrics?.elementCounts as { input: number; output: number };
37
+ expect(input).toBeGreaterThan(0);
38
+ expect(output).toBeGreaterThan(0);
39
+ });
40
+ });
41
+
42
+ describe('edge cases', () => {
43
+ it('should handle both empty strings', async () => {
44
+ const result = await metric.measure({ input: '', output: '' });
45
+ expect(result.score).toBe(1);
46
+ const { input, output } = result.metrics?.elementCounts as { input: number; output: number };
47
+ expect(input).toBe(0);
48
+ expect(output).toBe(0);
49
+ });
50
+
51
+ it('should handle empty original string', async () => {
52
+ const result = await metric.measure({ input: '', output: 'some text' });
53
+ expect(result.score).toBe(0);
54
+ });
55
+
56
+ it('should handle whitespace-only strings', async () => {
57
+ const result = await metric.measure({ input: ' \n ', output: ' \n ' });
58
+ expect(result.score).toBe(1);
59
+ const { input, output } = result.metrics?.elementCounts as { input: number; output: number };
60
+ expect(input).toBe(0);
61
+ expect(output).toBe(0);
62
+ });
63
+
64
+ it('should handle null and undefined inputs', async () => {
65
+ // @ts-expect-error Testing invalid input
66
+ await expect(metric.measure({ input: null, output: '' })).rejects.toThrow();
67
+ // @ts-expect-error Testing invalid input
68
+ await expect(metric.measure({ input: '', output: undefined })).rejects.toThrow();
69
+ });
70
+ });
71
+
72
+ describe('special cases', () => {
73
+ it('should handle lists and enumerations', async () => {
74
+ const result = await metric.measure({ input: 'apples, oranges, and bananas', output: 'apples and bananas' });
75
+ expect(result.score).toBeLessThan(0.8);
76
+ expect(result.metrics?.missingElements).toContain('oranges');
77
+ });
78
+
79
+ it('should handle repeated elements', async () => {
80
+ const result = await metric.measure({ input: 'cat cat cat cats', output: 'cat cats' });
81
+ expect(result.score).toBeGreaterThan(0.7);
82
+ });
83
+
84
+ it('should handle long and multi-paragraph text', async () => {
85
+ const original = `First paragraph about AI.
86
+ Second paragraph about ML.
87
+ Third paragraph about DL.`;
88
+ const simplified = `First para about AI.
89
+ Second para about ML.`;
90
+ const result = await metric.measure({ input: original, output: simplified });
91
+
92
+ expect(result.score).toBeGreaterThan(0.5);
93
+ expect(result.metrics?.missingElements).toBeDefined();
94
+ });
95
+ });
96
+ });
@@ -0,0 +1,112 @@
1
+ import { Metric } from '@mastra/core';
2
+ import nlp from 'compromise';
3
+
4
+ import { MetricScoringResult } from '../types';
5
+
6
+ export class CompletenessMetric extends Metric {
7
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
8
+ // Handle null/undefined inputs
9
+ if (input === null || input === undefined || output === null || output === undefined) {
10
+ throw new Error('Inputs cannot be null or undefined');
11
+ }
12
+
13
+ // Trim both inputs
14
+ input = input.trim();
15
+ output = output.trim();
16
+
17
+ const inputDoc = nlp(input);
18
+ const outputDoc = nlp(output);
19
+
20
+ // Extract and log elements
21
+ const inputElements = this.extractElements(inputDoc);
22
+ const outputElements = this.extractElements(outputDoc);
23
+ // Maybe we need a more sophisticated matching approach
24
+ const coverage = this.calculateCoverage(inputElements, outputElements);
25
+
26
+ return {
27
+ score: coverage,
28
+ details: `Completeness score: ${(coverage * 100).toFixed(1)}%`,
29
+ confidence: 0.8,
30
+ metrics: {
31
+ inputElements,
32
+ outputElements,
33
+ missingElements: inputElements.filter(e => !outputElements.includes(e)),
34
+ elementCounts: {
35
+ input: inputElements.length,
36
+ output: outputElements.length,
37
+ },
38
+ },
39
+ };
40
+ }
41
+
42
+ private extractElements(doc: any): string[] {
43
+ // Get more specific elements and ensure they're arrays
44
+ const nouns = doc.nouns().out('array') || [];
45
+ const verbs = doc.verbs().toInfinitive().out('array') || [];
46
+ const topics = doc.topics().out('array') || [];
47
+ const terms = doc.terms().out('array') || [];
48
+
49
+ // Helper function to clean and split terms
50
+ const cleanAndSplitTerm = (term: string): string[] => {
51
+ // First normalize the string
52
+ const normalized = this.normalizeString(term);
53
+
54
+ // Split on word boundaries and filter out empty strings
55
+ return normalized
56
+ .replace(/([a-z])([A-Z])/g, '$1 $2') // Split camelCase
57
+ .replace(/[^a-z0-9]+/g, ' ') // Replace non-alphanumeric with spaces
58
+ .trim()
59
+ .split(/\s+/)
60
+ .filter(word => word.length > 0);
61
+ };
62
+
63
+ // Process all elements
64
+ const processedTerms = [
65
+ ...nouns.flatMap(cleanAndSplitTerm),
66
+ ...verbs.flatMap(cleanAndSplitTerm),
67
+ ...topics.flatMap(cleanAndSplitTerm),
68
+ ...terms.flatMap(cleanAndSplitTerm),
69
+ ];
70
+
71
+ // Remove duplicates
72
+ return [...new Set(processedTerms)];
73
+ }
74
+
75
+ private normalizeString(str: string): string {
76
+ // Remove diacritics and convert to lowercase
77
+ return str
78
+ .normalize('NFD')
79
+ .replace(/[\u0300-\u036f]/g, '')
80
+ .toLowerCase();
81
+ }
82
+
83
+ private calculateCoverage(original: string[], simplified: string[]): number {
84
+ if (original.length === 0) {
85
+ return simplified.length === 0 ? 1 : 0;
86
+ }
87
+
88
+ // Exact matching for short words (3 chars or less), substring matching for longer words
89
+ const covered = original.filter(element =>
90
+ simplified.some(s => {
91
+ const elem = this.normalizeString(element);
92
+ const simp = this.normalizeString(s);
93
+
94
+ // For short words (3 chars or less), require exact match
95
+ if (elem.length <= 3) {
96
+ return elem === simp;
97
+ }
98
+
99
+ // For longer words, require substantial overlap (more than 60% of the longer word)
100
+ const longer = elem.length > simp.length ? elem : simp;
101
+ const shorter = elem.length > simp.length ? simp : elem;
102
+
103
+ if (longer.includes(shorter)) {
104
+ return shorter.length / longer.length > 0.6;
105
+ }
106
+
107
+ return false;
108
+ }),
109
+ );
110
+ return covered.length / original.length;
111
+ }
112
+ }
@@ -0,0 +1,107 @@
1
+ import { it, expect } from '@jest/globals';
2
+
3
+ import { ContentSimilarityMetric } from './index';
4
+
5
+ describe('ContentSimilarityMetric', () => {
6
+ const metric = new ContentSimilarityMetric();
7
+
8
+ it('should return perfect similarity for identical strings', async () => {
9
+ const result = await metric.measure({
10
+ input: 'The quick brown fox',
11
+ output: 'The quick brown fox',
12
+ });
13
+ expect(result.score).toBe(1);
14
+ expect(result.confidence).toBe(0.9);
15
+ });
16
+
17
+ it('should handle case differences with default options', async () => {
18
+ const result = await metric.measure({
19
+ input: 'The Quick Brown Fox',
20
+ output: 'the quick brown fox',
21
+ });
22
+ expect(result.score).toBe(1);
23
+ });
24
+
25
+ it('should handle whitespace differences with default options', async () => {
26
+ const result = await metric.measure({
27
+ input: 'The quick\nbrown fox',
28
+ output: 'The quick brown fox',
29
+ });
30
+ expect(result.score).toBe(1);
31
+ });
32
+
33
+ it('should be case sensitive when ignoreCase is false', async () => {
34
+ const caseSensitiveMetric = new ContentSimilarityMetric({ ignoreCase: false });
35
+ const result = await caseSensitiveMetric.measure({
36
+ input: 'The Quick Brown FOX',
37
+ output: 'the quick brown fox',
38
+ });
39
+ expect(result.score).toBeLessThan(0.8);
40
+ });
41
+
42
+ it('should preserve whitespace differences when ignoreWhitespace is true', async () => {
43
+ const whitespaceMetric = new ContentSimilarityMetric({
44
+ ignoreCase: true,
45
+ ignoreWhitespace: true,
46
+ });
47
+ const result = await whitespaceMetric.measure({
48
+ input: 'The\tquick brown\n\nfox',
49
+ output: 'The quick brown fox',
50
+ });
51
+ expect(result.score).toBe(1);
52
+ });
53
+
54
+ it('should handle both case and whitespace sensitivity', async () => {
55
+ const sensitiveMetric = new ContentSimilarityMetric({
56
+ ignoreCase: false,
57
+ ignoreWhitespace: true,
58
+ });
59
+ const result = await sensitiveMetric.measure({
60
+ input: 'The\tQuick Brown\n\nFOX',
61
+ output: 'the quick brown fox',
62
+ });
63
+ expect(result.score).toBeLessThan(0.8);
64
+ });
65
+
66
+ it('should handle partial similarity', async () => {
67
+ const result = await metric.measure({
68
+ input: 'The quick brown fox jumps over the lazy dog',
69
+ output: 'The quick brown fox runs past the lazy dog',
70
+ });
71
+ expect(result.score).toBeGreaterThan(0.7);
72
+ expect(result.score).toBeLessThan(0.8);
73
+ });
74
+
75
+ it('should handle completely different strings', async () => {
76
+ const result = await metric.measure({
77
+ input: 'The quick brown fox',
78
+ output: 'Lorem ipsum dolor sit amet',
79
+ });
80
+ expect(result.score).toBeLessThan(0.3);
81
+ });
82
+
83
+ it('should handle empty strings', async () => {
84
+ const result = await metric.measure({
85
+ input: '',
86
+ output: '',
87
+ });
88
+ expect(result.score).toBe(1);
89
+ });
90
+
91
+ it('should handle one empty string', async () => {
92
+ const result = await metric.measure({
93
+ input: 'The quick brown fox',
94
+ output: '',
95
+ });
96
+ expect(result.score).toBe(0);
97
+ });
98
+
99
+ it('should include similarity details in result', async () => {
100
+ const result = await metric.measure({
101
+ input: 'The quick brown fox',
102
+ output: 'The quick brown fox',
103
+ });
104
+ expect(result.details).toBe('Content similarity: 100.0%');
105
+ expect(result.metrics).toEqual({ similarity: 1 });
106
+ });
107
+ });
@@ -0,0 +1,41 @@
1
+ import { Metric } from '@mastra/core';
2
+ import stringSimilarity from 'string-similarity';
3
+
4
+ import { MetricOptions, MetricScoringResult } from '../types';
5
+
6
+ export class ContentSimilarityMetric extends Metric {
7
+ private options: MetricOptions;
8
+
9
+ constructor(options: MetricOptions = {}) {
10
+ super();
11
+ this.options = {
12
+ ignoreCase: true,
13
+ ignoreWhitespace: true,
14
+ ...options,
15
+ };
16
+ }
17
+
18
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricScoringResult> {
19
+ let processedInput = input;
20
+ let processedOutput = output;
21
+
22
+ if (this.options.ignoreCase) {
23
+ processedInput = processedInput.toLowerCase();
24
+ processedOutput = processedOutput.toLowerCase();
25
+ }
26
+
27
+ if (this.options.ignoreWhitespace) {
28
+ processedInput = processedInput.replace(/\s+/g, ' ').trim();
29
+ processedOutput = processedOutput.replace(/\s+/g, ' ').trim();
30
+ }
31
+
32
+ const similarity = stringSimilarity.compareTwoStrings(processedInput, processedOutput);
33
+
34
+ return {
35
+ score: similarity,
36
+ details: `Content similarity: ${(similarity * 100).toFixed(1)}%`,
37
+ confidence: 0.9,
38
+ metrics: { similarity },
39
+ };
40
+ }
41
+ }
@@ -0,0 +1,292 @@
1
+ import { it, expect, jest } from '@jest/globals';
2
+ import { type ModelConfig } from '@mastra/core';
3
+
4
+ import { ContextPositionMetric } from './index';
5
+
6
+ type TestCase = {
7
+ input: string;
8
+ output: string;
9
+ context: string[];
10
+ expectedResult: {
11
+ score: number;
12
+ reason: string;
13
+ };
14
+ };
15
+
16
+ const testCases: TestCase[] = [
17
+ {
18
+ // Perfect ordering with all relevant pieces [1,1,1]
19
+ input: 'What is the Sun?',
20
+ output: 'The Sun is a star that produces its own light.',
21
+ context: [
22
+ 'The Sun is a star at the center of our solar system.',
23
+ 'Stars like the Sun produce their own light through nuclear fusion.',
24
+ 'The Sun provides light and heat to Earth.',
25
+ ],
26
+ expectedResult: {
27
+ score: 10,
28
+ reason: 'All context pieces are relevant and optimally ordered, with the most important definition first.',
29
+ },
30
+ },
31
+ {
32
+ // Mixed relevance case [1,0,1]
33
+ input: 'What is photosynthesis?',
34
+ output: 'Photosynthesis is how plants make food using sunlight.',
35
+ context: [
36
+ 'Plants use sunlight to create food through photosynthesis.',
37
+ 'Chlorophyll gives plants their green color.',
38
+ 'Plants need water and CO2 for photosynthesis.',
39
+ ],
40
+ expectedResult: {
41
+ score: 7.27,
42
+ reason:
43
+ 'First and third pieces are relevant, with an irrelevant piece in between, demonstrating proper handling of mixed relevance.',
44
+ },
45
+ },
46
+ {
47
+ // Domain knowledge relevance [1,1,1]
48
+ input: 'How does a car engine work?',
49
+ output: 'A car engine converts gasoline into mechanical energy through combustion.',
50
+ context: [
51
+ 'Car engines burn gasoline for power.',
52
+ 'Pistons move up and down in the engine.',
53
+ 'Spark plugs create electrical sparks.',
54
+ ],
55
+ expectedResult: {
56
+ score: 10,
57
+ reason: 'All pieces contribute domain knowledge: direct explanation, mechanical process, and enabling mechanism.',
58
+ },
59
+ },
60
+ {
61
+ // Mixed relevance with good ordering [1,1,0]
62
+ input: 'What is JavaScript?',
63
+ output: 'JavaScript is a programming language used for web development.',
64
+ context: [
65
+ 'JavaScript is a popular programming language.',
66
+ 'Web browsers can run JavaScript code.',
67
+ 'Computers use binary code.',
68
+ ],
69
+ expectedResult: {
70
+ score: 8.18,
71
+ reason: 'Two relevant pieces at the start, followed by an irrelevant piece.',
72
+ },
73
+ },
74
+ {
75
+ // Single relevant at start [1,0,0]
76
+ input: 'What is precipitation?',
77
+ output: 'Precipitation is water falling from clouds as rain or snow.',
78
+ context: ['Precipitation is water falling from clouds.', 'The Earth is round.', 'Plants are green.'],
79
+ expectedResult: {
80
+ score: 5.45,
81
+ reason: 'Single relevant piece at the start, followed by irrelevant pieces.',
82
+ },
83
+ },
84
+ {
85
+ // Single relevant in middle [0,1,0]
86
+ input: 'What are clouds?',
87
+ output: 'Clouds are visible masses of water droplets in the sky.',
88
+ context: ['The sky is blue.', 'Clouds are made of water droplets.', 'Birds have feathers.'],
89
+ expectedResult: {
90
+ score: 2.73,
91
+ reason: 'Single relevant piece in middle position, with irrelevant pieces before and after.',
92
+ },
93
+ },
94
+ {
95
+ // Single relevant at end [0,0,1]
96
+ input: 'What is the moon?',
97
+ output: "The Moon is Earth's natural satellite.",
98
+ context: ['Stars twinkle at night.', 'The sky appears blue.', 'The Moon orbits around Earth.'],
99
+ expectedResult: {
100
+ score: 1.82,
101
+ reason: 'Single relevant piece at the end, with irrelevant pieces before it.',
102
+ },
103
+ },
104
+ {
105
+ // Empty context (edge case)
106
+ input: 'What is gravity?',
107
+ output: 'Gravity is a force that attracts objects together.',
108
+ context: [],
109
+ expectedResult: {
110
+ score: 0,
111
+ reason: 'No context provided to evaluate.',
112
+ },
113
+ },
114
+ {
115
+ // All irrelevant [0,0,0]
116
+ input: 'What is electricity?',
117
+ output: 'Electricity is the flow of electrical charge.',
118
+ context: ['The weather is sunny today.', 'Birds can fly.', 'Trees grow tall.'],
119
+ expectedResult: {
120
+ score: 0,
121
+ reason: 'No relevant context pieces found.',
122
+ },
123
+ },
124
+ {
125
+ // Complex interdependent context [1,1,1]
126
+ input: 'How do plants grow?',
127
+ output: 'Plants grow through photosynthesis, using water and nutrients from soil.',
128
+ context: [
129
+ 'Plants need nutrients from soil to grow.',
130
+ 'Photosynthesis converts sunlight to energy.',
131
+ 'Roots absorb water from the soil.',
132
+ ],
133
+ expectedResult: {
134
+ score: 10,
135
+ reason: 'All pieces are relevant and work together to explain the concept, with logical ordering.',
136
+ },
137
+ },
138
+ {
139
+ // Single relevant piece [1]
140
+ input: 'What is DNA?',
141
+ output: 'DNA contains genetic information.',
142
+ context: ['DNA stores genetic information in cells.'],
143
+ expectedResult: {
144
+ score: 10,
145
+ reason: 'Single relevant piece in optimal first position.',
146
+ },
147
+ },
148
+ {
149
+ // Two relevant at end [0,1,1]
150
+ input: 'What is a volcano?',
151
+ output: 'A volcano is a mountain that erupts hot lava.',
152
+ context: ['Mountains can be found worldwide.', 'Volcanoes erupt molten rock.', 'Lava flows from volcanic vents.'],
153
+ expectedResult: {
154
+ score: 4.55,
155
+ reason: 'Two relevant pieces appear at the end, after an irrelevant general statement.',
156
+ },
157
+ },
158
+ ];
159
+
160
+ const SECONDS = 10000;
161
+ jest.setTimeout(15 * SECONDS);
162
+
163
+ const modelConfig: ModelConfig = {
164
+ provider: 'OPEN_AI',
165
+ name: 'gpt-4o',
166
+ toolChoice: 'auto',
167
+ apiKey: process.env.OPENAI_API_KEY,
168
+ };
169
+
170
+ describe('ContextPositionMetric', () => {
171
+ const metric = new ContextPositionMetric(modelConfig);
172
+
173
+ it('should handle perfect ordering with all relevant pieces', async () => {
174
+ const testCase = testCases[0]!;
175
+ const result = await metric.measure({
176
+ input: testCase.input,
177
+ output: testCase.output,
178
+ context: testCase.context,
179
+ });
180
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
181
+ });
182
+
183
+ it('should handle mixed relevance case', async () => {
184
+ const testCase = testCases[1]!;
185
+ const result = await metric.measure({
186
+ input: testCase.input,
187
+ output: testCase.output,
188
+ context: testCase.context,
189
+ });
190
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
191
+ });
192
+
193
+ it('should handle domain knowledge relevance', async () => {
194
+ const testCase = testCases[2]!;
195
+ const result = await metric.measure({
196
+ input: testCase.input,
197
+ output: testCase.output,
198
+ context: testCase.context,
199
+ });
200
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
201
+ });
202
+
203
+ it('should handle mixed relevance with good ordering', async () => {
204
+ const testCase = testCases[3]!;
205
+ const result = await metric.measure({
206
+ input: testCase.input,
207
+ output: testCase.output,
208
+ context: testCase.context,
209
+ });
210
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
211
+ });
212
+
213
+ it('should handle single relevant piece at start', async () => {
214
+ const testCase = testCases[4]!;
215
+ const result = await metric.measure({
216
+ input: testCase.input,
217
+ output: testCase.output,
218
+ context: testCase.context,
219
+ });
220
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
221
+ });
222
+
223
+ it('should handle single relevant piece in middle', async () => {
224
+ const testCase = testCases[5]!;
225
+ const result = await metric.measure({
226
+ input: testCase.input,
227
+ output: testCase.output,
228
+ context: testCase.context,
229
+ });
230
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
231
+ });
232
+
233
+ it('should handle single relevant piece at end', async () => {
234
+ const testCase = testCases[6]!;
235
+ const result = await metric.measure({
236
+ input: testCase.input,
237
+ output: testCase.output,
238
+ context: testCase.context,
239
+ });
240
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
241
+ });
242
+
243
+ it('should handle empty context', async () => {
244
+ const testCase = testCases[7]!;
245
+ const result = await metric.measure({
246
+ input: testCase.input,
247
+ output: testCase.output,
248
+ context: testCase.context,
249
+ });
250
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
251
+ });
252
+
253
+ it('should handle all irrelevant context', async () => {
254
+ const testCase = testCases[8]!;
255
+ const result = await metric.measure({
256
+ input: testCase.input,
257
+ output: testCase.output,
258
+ context: testCase.context,
259
+ });
260
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
261
+ });
262
+
263
+ it('should handle complex interdependent context', async () => {
264
+ const testCase = testCases[9]!;
265
+ const result = await metric.measure({
266
+ input: testCase.input,
267
+ output: testCase.output,
268
+ context: testCase.context,
269
+ });
270
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
271
+ });
272
+
273
+ it('should handle single piece context', async () => {
274
+ const testCase = testCases[10]!;
275
+ const result = await metric.measure({
276
+ input: testCase.input,
277
+ output: testCase.output,
278
+ context: testCase.context,
279
+ });
280
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
281
+ });
282
+
283
+ it('should handle two relevant pieces at end', async () => {
284
+ const testCase = testCases[11]!;
285
+ const result = await metric.measure({
286
+ input: testCase.input,
287
+ output: testCase.output,
288
+ context: testCase.context,
289
+ });
290
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
291
+ });
292
+ });