@mastra/evals 0.1.0-alpha.23 → 0.1.0-alpha.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 0.1.0-alpha.25
4
+
5
+ ### Patch Changes
6
+
7
+ - 9625602: Use mastra core split bundles in other packages
8
+ - 8769a62: Split core into separate entry files
9
+ - Updated dependencies [30322ce]
10
+ - Updated dependencies [78eec7c]
11
+ - Updated dependencies [9625602]
12
+ - Updated dependencies [8769a62]
13
+ - @mastra/core@0.2.0-alpha.83
14
+
15
+ ## 0.1.0-alpha.24
16
+
17
+ ### Patch Changes
18
+
19
+ - Updated dependencies [73d112c]
20
+ - @mastra/core@0.1.27-alpha.82
21
+
3
22
  ## 0.1.0-alpha.23
4
23
 
5
24
  ### Patch Changes
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import './chunk-4VNS5WPM.js';
2
- import { evaluate as evaluate$1, registerHook, AvailableHooks } from '@mastra/core';
2
+ import { evaluate as evaluate$1 } from '@mastra/core';
3
+ import { registerHook, AvailableHooks } from '@mastra/core/hooks';
3
4
  import { mkdirSync, appendFile } from 'fs';
4
5
  import { join } from 'path';
5
6
 
@@ -1,4 +1,4 @@
1
- import { Metric, MetricResult } from '@mastra/core';
1
+ import { Metric, MetricResult } from '@mastra/core/eval';
2
2
 
3
3
  interface CompletenessMetricResult extends MetricResult {
4
4
  info: {
@@ -1,5 +1,5 @@
1
1
  import '../../chunk-4VNS5WPM.js';
2
- import { Metric } from '@mastra/core';
2
+ import { Metric } from '@mastra/core/eval';
3
3
  import nlp from 'compromise';
4
4
  import stringSimilarity from 'string-similarity';
5
5
  import { SequenceMatcher } from 'difflib';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "0.1.0-alpha.23",
3
+ "version": "0.1.0-alpha.25",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -38,11 +38,9 @@
38
38
  "sentiment": "^5.0.2",
39
39
  "string-similarity": "^4.0.4",
40
40
  "zod": "^3.24.1",
41
- "@mastra/core": "0.1.27-alpha.81"
41
+ "@mastra/core": "0.2.0-alpha.83"
42
42
  },
43
43
  "devDependencies": {
44
- "@babel/preset-env": "^7.26.0",
45
- "@babel/preset-typescript": "^7.26.0",
46
44
  "@tsconfig/recommended": "^1.0.7",
47
45
  "@types/difflib": "^0.2.7",
48
46
  "@types/fs-extra": "^11.0.4",
@@ -1,4 +1,4 @@
1
- import { AvailableHooks, registerHook } from '@mastra/core';
1
+ import { AvailableHooks, registerHook } from '@mastra/core/hooks';
2
2
  import { mkdirSync, appendFile } from 'fs';
3
3
  import { join } from 'path';
4
4
 
@@ -1,6 +1,6 @@
1
- import { Agent } from '@mastra/core';
2
- import { ModelConfig } from '@mastra/core';
3
- import { Metric } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
+ import { Agent } from '@mastra/core/agent';
3
+ import { Metric } from '@mastra/core/eval';
4
4
  import { describe, expect, it } from 'vitest';
5
5
 
6
6
  import { evaluate } from './evaluation';
@@ -1,4 +1,4 @@
1
- import { Agent, ModelConfig } from '@mastra/core';
1
+ import { Agent, type ModelConfig } from '@mastra/core';
2
2
 
3
3
  export abstract class MastraAgentJudge {
4
4
  protected readonly agent: Agent;
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,7 +1,6 @@
1
1
  import { type ModelConfig } from '@mastra/core';
2
- import { describe, it, expect } from 'vitest';
2
+ import { describe, it, expect, vi } from 'vitest';
3
3
 
4
- import { isCloserTo } from '../utils';
5
4
  import { TestCase } from '../utils';
6
5
 
7
6
  import { BiasMetric } from './index';
@@ -41,7 +40,11 @@ const testCases: TestCase[] = [
41
40
  },
42
41
  ];
43
42
 
44
- const SECONDS = 10000;
43
+ const SECONDS = 1000;
44
+
45
+ vi.setConfig({
46
+ testTimeout: 20 * SECONDS,
47
+ });
45
48
 
46
49
  const modelConfig: ModelConfig = {
47
50
  provider: 'OPEN_AI',
@@ -56,20 +59,20 @@ describe('BiasMetric', () => {
56
59
  it('should be able to measure a prompt that is biased', async () => {
57
60
  const result = await metric.measure(testCases[0].input, testCases[0].output);
58
61
  expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
59
- }, 10000);
62
+ });
60
63
 
61
64
  it('should be able to measure a prompt that is almost not biased', async () => {
62
65
  const result = await metric.measure(testCases[1].input, testCases[1].output);
63
66
  expect(result.score).toBeLessThan(0.5);
64
- }, 10000);
67
+ });
65
68
 
66
69
  it('should be able to measure a prompt that is mildly biased but actually not', async () => {
67
70
  const result = await metric.measure(testCases[2].input, testCases[2].output);
68
71
  expect(result.score).toBe(0);
69
- }, 10000);
72
+ });
70
73
 
71
74
  it('should be able to measure a prompt that is mildly biased', async () => {
72
75
  const result = await metric.measure(testCases[3].input, testCases[3].output);
73
76
  expect(result.score).toBeLessThan(0.8);
74
- }, 10000);
77
+ });
75
78
  });
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,5 +1,5 @@
1
1
  import { type ModelConfig } from '@mastra/core';
2
- import { describe, it, expect } from 'vitest';
2
+ import { describe, it, expect, vi } from 'vitest';
3
3
 
4
4
  import { TestCase } from '../utils';
5
5
 
@@ -165,7 +165,7 @@ const testCases: TestCase[] = [
165
165
  },
166
166
  ];
167
167
 
168
- const SECONDS = 10000;
168
+ const SECONDS = 1000;
169
169
 
170
170
  const modelConfig: ModelConfig = {
171
171
  provider: 'OPEN_AI',
@@ -174,102 +174,100 @@ const modelConfig: ModelConfig = {
174
174
  apiKey: process.env.OPENAI_API_KEY,
175
175
  };
176
176
 
177
- describe(
178
- 'SummarizationMetric',
179
- () => {
180
- const metric = new SummarizationMetric(modelConfig);
177
+ vi.setConfig({
178
+ testTimeout: 20 * SECONDS,
179
+ });
181
180
 
182
- it('should handle perfect summarization', async () => {
183
- const testCase = testCases[0]!;
184
- const result = await metric.measure(testCase.input, testCase.output);
185
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
186
- });
181
+ describe('SummarizationMetric', () => {
182
+ const metric = new SummarizationMetric(modelConfig);
187
183
 
188
- it('should handle mixed accuracy with contradictions', async () => {
189
- const testCase = testCases[1]!;
190
- const result = await metric.measure(testCase.input, testCase.output);
191
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
192
- });
184
+ it('should handle perfect summarization', async () => {
185
+ const testCase = testCases[0]!;
186
+ const result = await metric.measure(testCase.input, testCase.output);
187
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
188
+ });
193
189
 
194
- it('should handle missing key information', async () => {
195
- const testCase = testCases[2]!;
196
- const result = await metric.measure(testCase.input, testCase.output);
197
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
198
- });
190
+ it('should handle mixed accuracy with contradictions', async () => {
191
+ const testCase = testCases[1]!;
192
+ const result = await metric.measure(testCase.input, testCase.output);
193
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
194
+ });
199
195
 
200
- it('should handle empty output', async () => {
201
- const testCase = testCases[3]!;
202
- const result = await metric.measure(testCase.input, testCase.output);
203
- expect(result.score).toBe(testCase.expectedResult.score);
204
- });
196
+ it('should handle missing key information', async () => {
197
+ const testCase = testCases[2]!;
198
+ const result = await metric.measure(testCase.input, testCase.output);
199
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
200
+ });
205
201
 
206
- it('should handle speculative additions', async () => {
207
- const testCase = testCases[4]!;
208
- const result = await metric.measure(testCase.input, testCase.output);
209
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
210
- });
202
+ it('should handle empty output', async () => {
203
+ const testCase = testCases[3]!;
204
+ const result = await metric.measure(testCase.input, testCase.output);
205
+ expect(result.score).toBe(testCase.expectedResult.score);
206
+ });
211
207
 
212
- it('should handle incorrect emphasis', async () => {
213
- const testCase = testCases[5]!;
214
- const result = await metric.measure(testCase.input, testCase.output);
215
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
216
- });
208
+ it('should handle speculative additions', async () => {
209
+ const testCase = testCases[4]!;
210
+ const result = await metric.measure(testCase.input, testCase.output);
211
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
212
+ });
217
213
 
218
- it('should handle technical accuracy with missing context', async () => {
219
- const testCase = testCases[6]!;
220
- const result = await metric.measure(testCase.input, testCase.output);
221
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
222
- });
214
+ it('should handle incorrect emphasis', async () => {
215
+ const testCase = testCases[5]!;
216
+ const result = await metric.measure(testCase.input, testCase.output);
217
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
218
+ });
223
219
 
224
- it('should handle numerical approximation', async () => {
225
- const testCase = testCases[7]!;
226
- const result = await metric.measure(testCase.input, testCase.output);
227
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
228
- });
220
+ it('should handle technical accuracy with missing context', async () => {
221
+ const testCase = testCases[6]!;
222
+ const result = await metric.measure(testCase.input, testCase.output);
223
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
224
+ });
229
225
 
230
- it('should handle mixed tenses', async () => {
231
- const testCase = testCases[8]!;
232
- const result = await metric.measure(testCase.input, testCase.output);
233
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
234
- });
226
+ it('should handle numerical approximation', async () => {
227
+ const testCase = testCases[7]!;
228
+ const result = await metric.measure(testCase.input, testCase.output);
229
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
230
+ });
235
231
 
236
- it('should handle subjective interpretation', async () => {
237
- const testCase = testCases[9]!;
238
- const result = await metric.measure(testCase.input, testCase.output);
239
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
240
- });
232
+ it('should handle mixed tenses', async () => {
233
+ const testCase = testCases[8]!;
234
+ const result = await metric.measure(testCase.input, testCase.output);
235
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
236
+ });
241
237
 
242
- it('should handle high alignment with low coverage', async () => {
243
- const testCase = testCases[10]!;
244
- const result = await metric.measure(testCase.input, testCase.output);
245
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
246
- });
238
+ it('should handle subjective interpretation', async () => {
239
+ const testCase = testCases[9]!;
240
+ const result = await metric.measure(testCase.input, testCase.output);
241
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
242
+ });
247
243
 
248
- it('should handle low alignment with high coverage', async () => {
249
- const testCase = testCases[11]!;
250
- const result = await metric.measure(testCase.input, testCase.output);
251
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
252
- });
244
+ it('should handle high alignment with low coverage', async () => {
245
+ const testCase = testCases[10]!;
246
+ const result = await metric.measure(testCase.input, testCase.output);
247
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
248
+ });
253
249
 
254
- it('should handle single word summary', async () => {
255
- const testCase = testCases[12]!;
256
- const result = await metric.measure(testCase.input, testCase.output);
257
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
258
- });
250
+ it('should handle low alignment with high coverage', async () => {
251
+ const testCase = testCases[11]!;
252
+ const result = await metric.measure(testCase.input, testCase.output);
253
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
254
+ });
259
255
 
260
- it('should handle repetitive summary', async () => {
261
- const testCase = testCases[13]!;
262
- const result = await metric.measure(testCase.input, testCase.output);
263
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
264
- });
256
+ it('should handle single word summary', async () => {
257
+ const testCase = testCases[12]!;
258
+ const result = await metric.measure(testCase.input, testCase.output);
259
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
260
+ });
265
261
 
266
- it('should handle overly verbose summary', async () => {
267
- const testCase = testCases[14]!;
268
- const result = await metric.measure(testCase.input, testCase.output);
269
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
270
- });
271
- },
272
- {
273
- timeout: 15 * SECONDS,
274
- },
275
- );
262
+ it('should handle repetitive summary', async () => {
263
+ const testCase = testCases[13]!;
264
+ const result = await metric.measure(testCase.input, testCase.output);
265
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
266
+ });
267
+
268
+ it('should handle overly verbose summary', async () => {
269
+ const testCase = testCases[14]!;
270
+ const result = await metric.measure(testCase.input, testCase.output);
271
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
272
+ });
273
+ });
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { ModelConfig } from '@mastra/core';
1
+ import { type ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -1,4 +1,4 @@
1
- import { Metric, type MetricResult } from '@mastra/core';
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
2
  import nlp from 'compromise';
3
3
 
4
4
  interface CompletenessMetricResult extends MetricResult {
@@ -1,4 +1,4 @@
1
- import { Metric, type MetricResult } from '@mastra/core';
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
2
  import stringSimilarity from 'string-similarity';
3
3
 
4
4
  interface ContentSimilarityResult extends MetricResult {
@@ -1,4 +1,4 @@
1
- import { Metric, type MetricResult } from '@mastra/core';
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
2
  import keyword_extractor from 'keyword-extractor';
3
3
 
4
4
  interface KeywordCoverageResult extends MetricResult {
@@ -1,4 +1,4 @@
1
- import { Metric, type MetricResult } from '@mastra/core';
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
2
  import { SequenceMatcher } from 'difflib';
3
3
 
4
4
  interface TextualDifferenceResult extends MetricResult {
@@ -1,4 +1,4 @@
1
- import { Metric, type MetricResult } from '@mastra/core';
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
2
  import Sentiment from 'sentiment';
3
3
 
4
4
  interface ToneConsitencyResult extends MetricResult {
package/vitest.config.ts CHANGED
@@ -5,5 +5,7 @@ export default defineConfig({
5
5
  environment: 'node',
6
6
  include: ['src/**/*.test.ts'],
7
7
  exclude: ['**/node_modules/**', '**/dist/**'],
8
+ maxConcurrency: 1,
9
+ fileParallelism: false,
8
10
  },
9
11
  });