npm - @mastra/evals - Versions diffs - 0.1.0-alpha.17 → 0.1.0-alpha.21 - Mend

@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.21

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (30)

package/CHANGELOG.md +29 -0
package/README.md +186 -0
package/dist/evals.cjs.development.js +1 -0
package/dist/evals.cjs.development.js.map +1 -1
package/dist/evals.cjs.production.min.js.map +1 -1
package/dist/evals.esm.js +1 -0
package/dist/evals.esm.js.map +1 -1
package/dist/evaluation.d.ts +2 -2
package/dist/evaluation.d.ts.map +1 -1
package/package.json +4 -7
package/src/evaluation.test.ts +1 -1
package/src/evaluation.ts +2 -0
package/src/metrics/llm/answer-relevancy/index.test.ts +49 -44
package/src/metrics/llm/bias/index.test.ts +13 -12
package/src/metrics/llm/context-position/index.test.ts +92 -87
package/src/metrics/llm/context-precision/index.test.ts +69 -64
package/src/metrics/llm/context-relevancy/index.test.ts +27 -22
package/src/metrics/llm/contextual-recall/index.test.ts +28 -23
package/src/metrics/llm/faithfulness/index.test.ts +81 -76
package/src/metrics/llm/hallucination/index.test.ts +85 -80
package/src/metrics/llm/prompt-alignment/index.test.ts +53 -48
package/src/metrics/llm/summarization/index.test.ts +85 -80
package/src/metrics/llm/toxicity/index.test.ts +22 -17
package/src/metrics/nlp/completeness/index.test.ts +1 -1
package/src/metrics/nlp/content-similarity/index.test.ts +1 -1
package/src/metrics/nlp/keyword-coverage/index.test.ts +1 -1
package/src/metrics/nlp/textual-difference/index.test.ts +1 -1
package/src/metrics/nlp/tone/index.test.ts +1 -1
package/vitest.config.ts +9 -0
package/jest.config.ts +0 -21

package/src/metrics/llm/faithfulness/index.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCaseWithContext } from '../utils';
@@ -146,7 +146,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -155,100 +154,106 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('FaithfulnessMetric', () => {
-  it('should handle perfect faithfulness', async () => {
-    const testCase = testCases[0]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+describe(
+  'FaithfulnessMetric',
+  () => {
+    it('should handle perfect faithfulness', async () => {
+      const testCase = testCases[0]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle mixed faithfulness with contradictions', async () => {
-    const testCase = testCases[1]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle mixed faithfulness with contradictions', async () => {
+      const testCase = testCases[1]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle claims with speculative language', async () => {
-    const testCase = testCases[2]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle claims with speculative language', async () => {
+      const testCase = testCases[2]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle empty output', async () => {
-    const testCase = testCases[3]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle empty output', async () => {
+      const testCase = testCases[3]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
-  it('should handle empty context', async () => {
-    const testCase = testCases[4]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle empty context', async () => {
+      const testCase = testCases[4]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
-  it('should handle subjective claims', async () => {
-    const testCase = testCases[5]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle subjective claims', async () => {
+      const testCase = testCases[5]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
-  it('should handle claims with speculative language appropriately', async () => {
-    const testCase = testCases[6]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle claims with speculative language appropriately', async () => {
+      const testCase = testCases[6]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle compound statements correctly', async () => {
-    const testCase = testCases[7]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle compound statements correctly', async () => {
+      const testCase = testCases[7]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle precise numerical claims', async () => {
-    const testCase = testCases[8]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle precise numerical claims', async () => {
+      const testCase = testCases[8]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
-  it('should handle partially supported claims', async () => {
-    const testCase = testCases[9]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle partially supported claims', async () => {
+      const testCase = testCases[9]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle mixed factual and speculative claims', async () => {
-    const testCase = testCases[10]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle mixed factual and speculative claims', async () => {
+      const testCase = testCases[10]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle implicit information appropriately', async () => {
-    const testCase = testCases[11]!;
-    const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle implicit information appropriately', async () => {
+      const testCase = testCases[11]!;
+      const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/hallucination/index.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCaseWithContext } from '../utils';
@@ -128,7 +128,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -137,81 +136,87 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('HallucinationMetric', () => {
-  it('should handle perfect alignment', async () => {
-    const testCase = testCases[0]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle complete hallucination', async () => {
-    const testCase = testCases[1]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle partial hallucination', async () => {
-    const testCase = testCases[2]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle empty output', async () => {
-    const testCase = testCases[3]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
-  it('should handle speculative language', async () => {
-    const testCase = testCases[4]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle empty context', async () => {
-    const testCase = testCases[5]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
-  it('should handle implicit contradictions', async () => {
-    const testCase = testCases[6]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle numerical approximations', async () => {
-    const testCase = testCases[7]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle out of scope additions', async () => {
-    const testCase = testCases[8]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle temporal contradictions', async () => {
-    const testCase = testCases[9]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-  it('should handle numerical contradiction despite approximation', async () => {
-    const testCase = testCases[10]!;
-    const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
-});
+describe(
+  'HallucinationMetric',
+  () => {
+    it('should handle perfect alignment', async () => {
+      const testCase = testCases[0]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle complete hallucination', async () => {
+      const testCase = testCases[1]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle partial hallucination', async () => {
+      const testCase = testCases[2]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle empty output', async () => {
+      const testCase = testCases[3]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
+    it('should handle speculative language', async () => {
+      const testCase = testCases[4]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle empty context', async () => {
+      const testCase = testCases[5]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
+    it('should handle implicit contradictions', async () => {
+      const testCase = testCases[6]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle numerical approximations', async () => {
+      const testCase = testCases[7]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle out of scope additions', async () => {
+      const testCase = testCases[8]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle temporal contradictions', async () => {
+      const testCase = testCases[9]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+    it('should handle numerical contradiction despite approximation', async () => {
+      const testCase = testCases[10]!;
+      const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/prompt-alignment/index.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCaseWithInstructions } from '../utils';
@@ -69,7 +69,6 @@ const testCases: TestCaseWithInstructions[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -78,69 +77,75 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('PromptAlignmentMetric', () => {
-  it('should measure perfect alignment with single instruction', async () => {
-    const testCase = testCases[0]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+describe(
+  'PromptAlignmentMetric',
+  () => {
+    it('should measure perfect alignment with single instruction', async () => {
+      const testCase = testCases[0]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+    it('should measure zero alignment with single instruction', async () => {
+      const testCase = testCases[1]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
-  it('should measure zero alignment with single instruction', async () => {
-    const testCase = testCases[1]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should measure perfect alignment with multiple instructions', async () => {
+      const testCase = testCases[2]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      const result = await metric.measure(testCase.input, testCase.output);
-  it('should measure perfect alignment with multiple instructions', async () => {
-    const testCase = testCases[2]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should measure partial alignment with multiple instructions', async () => {
+      const testCase = testCases[3]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      const result = await metric.measure(testCase.input, testCase.output);
-  it('should measure partial alignment with multiple instructions', async () => {
-    const testCase = testCases[3]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should measure alignment with complex formatting instructions', async () => {
+      const testCase = testCases[4]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      const result = await metric.measure(testCase.input, testCase.output);
-  it('should measure alignment with complex formatting instructions', async () => {
-    const testCase = testCases[4]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should handle empty output', async () => {
+      const testCase = testCases[5]!;
+      const metric = new PromptAlignmentMetric(modelConfig, {
+        instructions: testCase.instructions,
+      });
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+      const result = await metric.measure(testCase.input, testCase.output);
-  it('should handle empty output', async () => {
-    const testCase = testCases[5]!;
-    const metric = new PromptAlignmentMetric(modelConfig, {
-      instructions: testCase.instructions,
+      expect(result.score).toBe(testCase.expectedResult.score);
     });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
-});
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);