npm - @mastra/evals - Versions diffs - 0.1.0-alpha.17 → 0.1.0-alpha.19 - Mend

@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/CHANGELOG.md +15 -0
package/README.md +186 -0
package/dist/evals.cjs.development.js +1 -0
package/dist/evals.cjs.development.js.map +1 -1
package/dist/evals.cjs.production.min.js.map +1 -1
package/dist/evals.esm.js +1 -0
package/dist/evals.esm.js.map +1 -1
package/dist/evaluation.d.ts +2 -2
package/dist/evaluation.d.ts.map +1 -1
package/package.json +4 -7
package/src/evaluation.test.ts +1 -1
package/src/evaluation.ts +2 -0
package/src/metrics/llm/answer-relevancy/index.test.ts +49 -44
package/src/metrics/llm/bias/index.test.ts +13 -12
package/src/metrics/llm/context-position/index.test.ts +92 -87
package/src/metrics/llm/context-precision/index.test.ts +69 -64
package/src/metrics/llm/context-relevancy/index.test.ts +27 -22
package/src/metrics/llm/contextual-recall/index.test.ts +28 -23
package/src/metrics/llm/faithfulness/index.test.ts +81 -76
package/src/metrics/llm/hallucination/index.test.ts +85 -80
package/src/metrics/llm/prompt-alignment/index.test.ts +53 -48
package/src/metrics/llm/summarization/index.test.ts +85 -80
package/src/metrics/llm/toxicity/index.test.ts +22 -17
package/src/metrics/nlp/completeness/index.test.ts +1 -1
package/src/metrics/nlp/content-similarity/index.test.ts +1 -1
package/src/metrics/nlp/keyword-coverage/index.test.ts +1 -1
package/src/metrics/nlp/textual-difference/index.test.ts +1 -1
package/src/metrics/nlp/tone/index.test.ts +1 -1
package/vitest.config.ts +9 -0
package/jest.config.ts +0 -21

package/src/metrics/llm/context-position/index.test.ts (changed)

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCaseWithContext } from '../utils';
@@ -150,7 +150,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -159,88 +158,94 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('ContextPositionMetric', () => {
-  it('should handle perfect ordering with all relevant pieces', async () => {
-    const testCase = testCases[0]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle mixed relevance case', async () => {
-    const testCase = testCases[1]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle domain knowledge relevance', async () => {
-    const testCase = testCases[2]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle mixed relevance with good ordering', async () => {
-    const testCase = testCases[3]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle single relevant piece at start', async () => {
-    const testCase = testCases[4]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle single relevant piece in middle', async () => {
-    const testCase = testCases[5]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle single relevant piece at end', async () => {
-    const testCase = testCases[6]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle empty context', async () => {
-    const testCase = testCases[7]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle all irrelevant context', async () => {
-    const testCase = testCases[8]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle complex interdependent context', async () => {
-    const testCase = testCases[9]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle single piece context', async () => {
-    const testCase = testCases[10]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-  it('should handle two relevant pieces at end', async () => {
-    const testCase = testCases[11]!;
-    const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+describe(
+  'ContextPositionMetric',
+  () => {
+    it('should handle perfect ordering with all relevant pieces', async () => {
+      const testCase = testCases[0]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle mixed relevance case', async () => {
+      const testCase = testCases[1]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle domain knowledge relevance', async () => {
+      const testCase = testCases[2]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle mixed relevance with good ordering', async () => {
+      const testCase = testCases[3]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle single relevant piece at start', async () => {
+      const testCase = testCases[4]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle single relevant piece in middle', async () => {
+      const testCase = testCases[5]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle single relevant piece at end', async () => {
+      const testCase = testCases[6]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle empty context', async () => {
+      const testCase = testCases[7]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle all irrelevant context', async () => {
+      const testCase = testCases[8]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle complex interdependent context', async () => {
+      const testCase = testCases[9]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle single piece context', async () => {
+      const testCase = testCases[10]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+    it('should handle two relevant pieces at end', async () => {
+      const testCase = testCases[11]!;
+      const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/context-precision/index.test.ts (changed)

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { TestCaseWithContext } from '../utils';
@@ -127,7 +127,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -136,74 +135,80 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('ContextPrecisionMetric', () => {
-  it('should measure perfect context precision with all relevant items', async () => {
-    const testCase = testCases[0]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+describe(
+  'ContextPrecisionMetric',
+  () => {
+    it('should measure perfect context precision with all relevant items', async () => {
+      const testCase = testCases[0]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure high precision with irrelevant item at end', async () => {
-    const testCase = testCases[1]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should measure high precision with irrelevant item at end', async () => {
+      const testCase = testCases[1]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure precision with two relevant items after irrelevant start', async () => {
-    const testCase = testCases[2]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should measure precision with two relevant items after irrelevant start', async () => {
+      const testCase = testCases[2]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure precision with alternating relevant items', async () => {
-    const testCase = testCases[3]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should measure precision with alternating relevant items', async () => {
+      const testCase = testCases[3]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure precision with single relevant item at start', async () => {
-    const testCase = testCases[4]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should measure precision with single relevant item at start', async () => {
+      const testCase = testCases[4]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle completely irrelevant context', async () => {
-    const testCase = testCases[5]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle completely irrelevant context', async () => {
+      const testCase = testCases[5]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle single relevant context perfectly', async () => {
-    const testCase = testCases[6]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle single relevant context perfectly', async () => {
+      const testCase = testCases[6]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure precision with single relevant item at end', async () => {
-    const testCase = testCases[7]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should measure precision with single relevant item at end', async () => {
+      const testCase = testCases[7]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle empty context', async () => {
-    const testCase = testCases[8]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+    it('should handle empty context', async () => {
+      const testCase = testCases[8]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should handle single irrelevant context', async () => {
-    const testCase = testCases[9]!;
-    const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+    it('should handle single irrelevant context', async () => {
+      const testCase = testCases[9]!;
+      const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/context-relevancy/index.test.ts (changed)

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { isCloserTo } from '../utils';
 import { TestCaseWithContext } from '../utils';
@@ -55,7 +55,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -64,25 +63,31 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('ContextPrecisionMetric', () => {
-  it('should measure perfect context relevancy with all relevant items', async () => {
-    const testCase = testCases[0]!;
-    const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
+describe(
+  'ContextPrecisionMetric',
+  () => {
+    it('should measure perfect context relevancy with all relevant items', async () => {
+      const testCase = testCases[0]!;
+      const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
-  it('should measure mixed relevancy where only some contexts are relevant', async () => {
-    const testCase = testCases[1]!;
-    const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
-  });
+    it('should measure mixed relevancy where only some contexts are relevant', async () => {
+      const testCase = testCases[1]!;
+      const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
+    });
-  it('should measure no relevancy where contexts are completely unrelated', async () => {
-    const testCase = testCases[2]!;
-    const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+    it('should measure no relevancy where contexts are completely unrelated', async () => {
+      const testCase = testCases[2]!;
+      const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);

package/src/metrics/llm/contextual-recall/index.test.ts (changed)

@@ -1,5 +1,5 @@
-import { describe, it, expect, jest } from '@jest/globals';
 import { type ModelConfig } from '@mastra/core';
+import { describe, it, expect } from 'vitest';
 import { isCloserTo } from '../utils';
 import { TestCaseWithContext } from '../utils';
@@ -51,7 +51,6 @@ const testCases: TestCaseWithContext[] = [
 ];
 const SECONDS = 10000;
-jest.setTimeout(15 * SECONDS);
 const modelConfig: ModelConfig = {
   provider: 'OPEN_AI',
@@ -60,27 +59,33 @@ const modelConfig: ModelConfig = {
   apiKey: process.env.OPENAI_API_KEY,
 };
-describe('ContextualRecallMetric', () => {
-  it('should succeed when context is relevant', async () => {
-    const testCase = testCases[0]!;
-    const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-  });
+describe(
+  'ContextualRecallMetric',
+  () => {
+    it('should succeed when context is relevant', async () => {
+      const testCase = testCases[0]!;
+      const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+    });
-  it('should be mixed', async () => {
-    const testCase = testCases[1]!;
-    const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
+    it('should be mixed', async () => {
+      const testCase = testCases[1]!;
+      const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
-    expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
-    expect(result.score - testCase.expectedResult.score).toBeGreaterThan(0);
-  });
+      expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
+      expect(result.score - testCase.expectedResult.score).toBeGreaterThan(0);
+    });
-  it('should be none', async () => {
-    const testCase = testCases[2]!;
-    const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
-    const result = await metric.measure(testCase.input, testCase.output);
-    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
-  });
-});
+    it('should be none', async () => {
+      const testCase = testCases[2]!;
+      const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
+      const result = await metric.measure(testCase.input, testCase.output);
+      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
+    });
+  },
+  {
+    timeout: 15 * SECONDS,
+  },
+);