npm - @mastra/evals - Versions diffs - 0.1.0-alpha.17 → 0.1.0-alpha.21 - Mend

@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/CHANGELOG.md +29 -0
package/README.md +186 -0
package/dist/evals.cjs.development.js +1 -0
package/dist/evals.cjs.development.js.map +1 -1
package/dist/evals.cjs.production.min.js.map +1 -1
package/dist/evals.esm.js +1 -0
package/dist/evals.esm.js.map +1 -1
package/dist/evaluation.d.ts +2 -2
package/dist/evaluation.d.ts.map +1 -1
package/package.json +4 -7
package/src/evaluation.test.ts +1 -1
package/src/evaluation.ts +2 -0
package/src/metrics/llm/answer-relevancy/index.test.ts +49 -44
package/src/metrics/llm/bias/index.test.ts +13 -12
package/src/metrics/llm/context-position/index.test.ts +92 -87
package/src/metrics/llm/context-precision/index.test.ts +69 -64
package/src/metrics/llm/context-relevancy/index.test.ts +27 -22
package/src/metrics/llm/contextual-recall/index.test.ts +28 -23
package/src/metrics/llm/faithfulness/index.test.ts +81 -76
package/src/metrics/llm/hallucination/index.test.ts +85 -80
package/src/metrics/llm/prompt-alignment/index.test.ts +53 -48
package/src/metrics/llm/summarization/index.test.ts +85 -80
package/src/metrics/llm/toxicity/index.test.ts +22 -17
package/src/metrics/nlp/completeness/index.test.ts +1 -1
package/src/metrics/nlp/content-similarity/index.test.ts +1 -1
package/src/metrics/nlp/keyword-coverage/index.test.ts +1 -1
package/src/metrics/nlp/textual-difference/index.test.ts +1 -1
package/src/metrics/nlp/tone/index.test.ts +1 -1
package/vitest.config.ts +9 -0
package/jest.config.ts +0 -21

package/src/metrics/llm/summarization/index.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCase } from '../utils';
@@ -166,7 +166,6 @@ const testCases: TestCase[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -175,96 +174,102 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('SummarizationMetric', () => {
-  const metric = new SummarizationMetric(modelConfig);
+describe(
+  'SummarizationMetric',
+  () => {
+    const metric = new SummarizationMetric(modelConfig);
-  it('should handle perfect summarization', async () => {
-    const testCase = testCases[0]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle perfect summarization', async () => {
+      const testCase = testCases[0]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle mixed accuracy with contradictions', async () => {
-    const testCase = testCases[1]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle mixed accuracy with contradictions', async () => {
+      const testCase = testCases[1]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle missing key information', async () => {
-    const testCase = testCases[2]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle missing key information', async () => {
+      const testCase = testCases[2]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle empty output', async () => {
-    const testCase = testCases[3]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBe(testCase.expectedResult.score);
-  });
+    it('should handle empty output', async () => {
+      const testCase = testCases[3]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBe(testCase.expectedResult.score);
+    });
-  it('should handle speculative additions', async () => {
-    const testCase = testCases[4]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle speculative additions', async () => {
+      const testCase = testCases[4]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle incorrect emphasis', async () => {
-    const testCase = testCases[5]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle incorrect emphasis', async () => {
+      const testCase = testCases[5]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle technical accuracy with missing context', async () => {
-    const testCase = testCases[6]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle technical accuracy with missing context', async () => {
+      const testCase = testCases[6]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle numerical approximation', async () => {
-    const testCase = testCases[7]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle numerical approximation', async () => {
+      const testCase = testCases[7]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle mixed tenses', async () => {
-    const testCase = testCases[8]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle mixed tenses', async () => {
+      const testCase = testCases[8]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle subjective interpretation', async () => {
-    const testCase = testCases[9]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle subjective interpretation', async () => {
+      const testCase = testCases[9]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle high alignment with low coverage', async () => {
-    const testCase = testCases[10]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle high alignment with low coverage', async () => {
+      const testCase = testCases[10]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle low alignment with high coverage', async () => {
-    const testCase = testCases[11]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle low alignment with high coverage', async () => {
+      const testCase = testCases[11]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle single word summary', async () => {
-    const testCase = testCases[12]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle single word summary', async () => {
+      const testCase = testCases[12]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle repetitive summary', async () => {
-    const testCase = testCases[13]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle repetitive summary', async () => {
+      const testCase = testCases[13]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle overly verbose summary', async () => {
-    const testCase = testCases[14]!;
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+    it('should handle overly verbose summary', async () => {
+      const testCase = testCases[14]!;
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/toxicity/index.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCase } from '../utils';
@@ -35,7 +35,6 @@ const testCases: TestCase[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -44,24 +43,30 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('ToxicityMetric', () => {
-  const metric = new ToxicityMetric(modelConfig);
+describe(
+  'ToxicityMetric',
+  () => {
+    const metric = new ToxicityMetric(modelConfig);
-  it('should be able to measure a prompt that is toxic', async () => {
-    const result = await metric.measure(testCases[0].input, testCases[0].output);
+    it('should be able to measure a prompt that is toxic', async () => {
+      const result = await metric.measure(testCases[0].input, testCases[0].output);
-    expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
+    });
-  it('should be able to measure a prompt that is not toxic', async () => {
-    const result = await metric.measure(testCases[1].input, testCases[1].output);
+    it('should be able to measure a prompt that is not toxic', async () => {
+      const result = await metric.measure(testCases[1].input, testCases[1].output);
-    expect(result.score).toBeCloseTo(testCases[1].expectedResult.score, 1);
-  });
+      expect(result.score).toBeCloseTo(testCases[1].expectedResult.score, 1);
+    });
-  it('should be able to measure a prompt that is midly toxic', async () => {
-    const result = await metric.measure(testCases[2].input, testCases[2].output);
+    it('should be able to measure a prompt that is midly toxic', async () => {
+      const result = await metric.measure(testCases[2].input, testCases[2].output);
-    expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
-  });
-});
+      expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/nlp/completeness/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { describe, it, expect, beforeEach } from '@jest/globals';
+import { describe, it, expect, beforeEach } from 'vitest';
 import { CompletenessMetric } from './index';

package/src/metrics/nlp/content-similarity/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { describe, it, expect } from '@jest/globals';
+import { describe, it, expect } from 'vitest';
 import { ContentSimilarityMetric } from './index';

package/src/metrics/nlp/keyword-coverage/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { describe, it, expect } from '@jest/globals';
+import { describe, it, expect } from 'vitest';
 import { KeywordCoverageMetric } from './index';

package/src/metrics/nlp/textual-difference/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { describe, it, expect } from '@jest/globals';
+import { describe, it, expect } from 'vitest';
 import { TextualDifferenceMetric } from './index';

package/src/metrics/nlp/tone/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { describe, it, expect } from '@jest/globals';
+import { describe, it, expect } from 'vitest';
 import { ToneConsistencyMetric } from './index';

package/vitest.config.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import { defineConfig } from 'vitest/config';
+export default defineConfig({
+  test: {
+    environment: 'node',
+    include: ['src/**/*.test.ts'],
+    exclude: ['**/node_modules/**', '**/dist/**'],
+  },
+});

package/jest.config.ts DELETED Viewed

@@ -1,21 +0,0 @@
-import { config } from 'dotenv';
-config();
-export default {
-  maxWorkers: 1,
-  preset: 'ts-jest',
-  extensionsToTreatAsEsm: ['.ts'],
-  moduleNameMapper: {
-    '^(\\.{1,2}/.*)\\.js$': '$1',
-  },
-  transform: {
-    '^.+\\.tsx?$': [
-      'ts-jest',
-      {
-        useESM: true,
-        isolatedModules: true,
-      },
-    ],
-  },
-};