@mastra/evals 0.1.0-alpha.5 → 0.1.0-alpha.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +372 -0
  2. package/README.md +185 -0
  3. package/dist/_tsup-dts-rollup.d.ts +723 -0
  4. package/dist/chunk-4VNS5WPM.js +37 -0
  5. package/dist/dist-56AYDN4X.js +17575 -0
  6. package/dist/index.d.ts +3 -0
  7. package/dist/index.js +87 -0
  8. package/dist/magic-string.es-5UDOWOAZ.js +1296 -0
  9. package/dist/metrics/llm/index.d.ts +10 -0
  10. package/dist/metrics/llm/index.js +2121 -0
  11. package/dist/metrics/nlp/index.d.ts +5 -0
  12. package/dist/metrics/nlp/index.js +189 -0
  13. package/package.json +16 -28
  14. package/src/attachListeners.ts +40 -0
  15. package/src/constants.ts +1 -0
  16. package/src/evaluation.test.ts +15 -18
  17. package/src/evaluation.ts +48 -11
  18. package/src/index.ts +1 -0
  19. package/src/metrics/judge/index.ts +4 -3
  20. package/src/metrics/llm/answer-relevancy/index.test.ts +44 -74
  21. package/src/metrics/llm/answer-relevancy/index.ts +8 -5
  22. package/src/metrics/llm/answer-relevancy/metricJudge.ts +4 -3
  23. package/src/metrics/llm/answer-relevancy/prompts.ts +26 -28
  24. package/src/metrics/llm/bias/index.test.ts +19 -34
  25. package/src/metrics/llm/bias/index.ts +13 -4
  26. package/src/metrics/llm/bias/metricJudge.ts +20 -4
  27. package/src/metrics/llm/bias/prompts.ts +27 -0
  28. package/src/metrics/llm/context-position/index.test.ts +98 -108
  29. package/src/metrics/llm/context-position/index.ts +13 -13
  30. package/src/metrics/llm/context-position/metricJudge.ts +2 -2
  31. package/src/metrics/llm/context-position/prompts.ts +31 -36
  32. package/src/metrics/llm/context-precision/index.test.ts +72 -100
  33. package/src/metrics/llm/context-precision/index.ts +13 -13
  34. package/src/metrics/llm/context-precision/metricJudge.ts +2 -2
  35. package/src/metrics/llm/context-relevancy/index.test.ts +28 -36
  36. package/src/metrics/llm/context-relevancy/index.ts +22 -12
  37. package/src/metrics/llm/context-relevancy/metricJudge.ts +20 -6
  38. package/src/metrics/llm/context-relevancy/prompts.ts +37 -0
  39. package/src/metrics/llm/contextual-recall/index.test.ts +30 -37
  40. package/src/metrics/llm/contextual-recall/index.ts +19 -12
  41. package/src/metrics/llm/contextual-recall/metricJudge.ts +19 -4
  42. package/src/metrics/llm/contextual-recall/prompts.ts +42 -1
  43. package/src/metrics/llm/faithfulness/index.test.ts +71 -109
  44. package/src/metrics/llm/faithfulness/index.ts +21 -14
  45. package/src/metrics/llm/faithfulness/metricJudge.ts +12 -12
  46. package/src/metrics/llm/hallucination/index.test.ts +66 -104
  47. package/src/metrics/llm/hallucination/index.ts +21 -14
  48. package/src/metrics/llm/hallucination/metricJudge.ts +13 -15
  49. package/src/metrics/llm/hallucination/prompts.ts +28 -35
  50. package/src/metrics/llm/index.ts +1 -0
  51. package/src/metrics/llm/prompt-alignment/index.test.ts +59 -74
  52. package/src/metrics/llm/prompt-alignment/index.ts +15 -6
  53. package/src/metrics/llm/prompt-alignment/metricJudge.ts +12 -16
  54. package/src/metrics/llm/summarization/index.test.ts +33 -75
  55. package/src/metrics/llm/summarization/index.ts +18 -9
  56. package/src/metrics/llm/summarization/metricJudge.ts +14 -27
  57. package/src/metrics/llm/summarization/prompts.ts +52 -14
  58. package/src/metrics/llm/toxicity/index.test.ts +22 -31
  59. package/src/metrics/llm/toxicity/index.ts +10 -7
  60. package/src/metrics/llm/toxicity/metricJudge.ts +7 -6
  61. package/src/metrics/llm/toxicity/prompts.ts +5 -12
  62. package/src/metrics/llm/types.ts +7 -0
  63. package/src/metrics/nlp/completeness/index.test.ts +20 -20
  64. package/src/metrics/nlp/completeness/index.ts +14 -6
  65. package/src/metrics/nlp/content-similarity/index.test.ts +17 -48
  66. package/src/metrics/nlp/content-similarity/index.ts +15 -8
  67. package/src/metrics/nlp/keyword-coverage/index.test.ts +31 -60
  68. package/src/metrics/nlp/keyword-coverage/index.ts +10 -9
  69. package/src/metrics/nlp/textual-difference/index.test.ts +34 -62
  70. package/src/metrics/nlp/textual-difference/index.ts +12 -6
  71. package/src/metrics/nlp/tone/index.test.ts +49 -72
  72. package/src/metrics/nlp/tone/index.ts +16 -9
  73. package/tsconfig.json +1 -10
  74. package/vitest.config.ts +11 -0
  75. package/jest.config.ts +0 -21
  76. package/src/metrics/nlp/types.ts +0 -13
package/CHANGELOG.md CHANGED
@@ -1,5 +1,377 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 0.1.0-alpha.50
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [5ee67d3]
8
+ - Updated dependencies [95a4697]
9
+ - @mastra/core@0.2.0-alpha.108
10
+
11
+ ## 0.1.0-alpha.49
12
+
13
+ ### Patch Changes
14
+
15
+ - Updated dependencies [66a5392]
16
+ - @mastra/core@0.2.0-alpha.107
17
+
18
+ ## 0.1.0-alpha.48
19
+
20
+ ### Patch Changes
21
+
22
+ - a8a459a: Updated Evals table UI
23
+ - Updated dependencies [6f2c0f5]
24
+ - Updated dependencies [a8a459a]
25
+ - @mastra/core@0.2.0-alpha.106
26
+
27
+ ## 0.1.0-alpha.47
28
+
29
+ ### Patch Changes
30
+
31
+ - Updated dependencies [1420ae2]
32
+ - Updated dependencies [99f1847]
33
+ - @mastra/core@0.2.0-alpha.105
34
+
35
+ ## 0.1.0-alpha.46
36
+
37
+ ### Patch Changes
38
+
39
+ - 5fdc87c: Update evals storage in attachListeners
40
+ - b97ca96: Tracing into default storage
41
+ - 72d1990: Updated evals table schema
42
+ - Updated dependencies [5fdc87c]
43
+ - Updated dependencies [b97ca96]
44
+ - Updated dependencies [72d1990]
45
+ - Updated dependencies [cf6d825]
46
+ - Updated dependencies [10870bc]
47
+ - @mastra/core@0.2.0-alpha.104
48
+
49
+ ## 0.1.0-alpha.45
50
+
51
+ ### Patch Changes
52
+
53
+ - Updated dependencies [4534e77]
54
+ - @mastra/core@0.2.0-alpha.103
55
+
56
+ ## 0.1.0-alpha.44
57
+
58
+ ### Patch Changes
59
+
60
+ - Updated dependencies [a9345f9]
61
+ - @mastra/core@0.2.0-alpha.102
62
+
63
+ ## 0.1.0-alpha.43
64
+
65
+ ### Patch Changes
66
+
67
+ - 4f1d1a1: Enforce types and clean up package.json
68
+ - Updated dependencies [66a03ec]
69
+ - Updated dependencies [4f1d1a1]
70
+ - @mastra/core@0.2.0-alpha.101
71
+
72
+ ## 0.1.0-alpha.42
73
+
74
+ ### Patch Changes
75
+
76
+ - Updated dependencies [9d1796d]
77
+ - @mastra/core@0.2.0-alpha.100
78
+
79
+ ## 0.1.0-alpha.41
80
+
81
+ ### Patch Changes
82
+
83
+ - Updated dependencies [7d83b92]
84
+ - @mastra/core@0.2.0-alpha.99
85
+
86
+ ## 0.1.0-alpha.40
87
+
88
+ ### Patch Changes
89
+
90
+ - 70dabd9: Fix broken publish
91
+ - 202d404: Added instructions when generating evals
92
+ - Updated dependencies [70dabd9]
93
+ - Updated dependencies [202d404]
94
+ - @mastra/core@0.2.0-alpha.98
95
+
96
+ ## 0.1.0-alpha.39
97
+
98
+ ### Patch Changes
99
+
100
+ - 7892533: Updated test evals to use Mastra Storage
101
+ - d641d91: Fix exports for @mastra/evals
102
+ - Updated dependencies [07c069d]
103
+ - Updated dependencies [7892533]
104
+ - Updated dependencies [e6d8055]
105
+ - Updated dependencies [5950de5]
106
+ - Updated dependencies [df843d3]
107
+ - Updated dependencies [a870123]
108
+ - @mastra/core@0.2.0-alpha.97
109
+
110
+ ## 0.1.0-alpha.38
111
+
112
+ ### Patch Changes
113
+
114
+ - Updated dependencies [74b3078]
115
+ - @mastra/core@0.2.0-alpha.96
116
+
117
+ ## 0.1.0-alpha.37
118
+
119
+ ### Patch Changes
120
+
121
+ - Updated dependencies [9fb59d6]
122
+ - @mastra/core@0.2.0-alpha.95
123
+
124
+ ## 0.1.0-alpha.36
125
+
126
+ ### Minor Changes
127
+
128
+ - 8b416d9: Breaking changes
129
+
130
+ ### Patch Changes
131
+
132
+ - 9c10484: update all packages
133
+ - Updated dependencies [9c10484]
134
+ - Updated dependencies [8b416d9]
135
+ - @mastra/core@0.2.0-alpha.94
136
+
137
+ ## 0.1.0-alpha.35
138
+
139
+ ### Patch Changes
140
+
141
+ - Updated dependencies [5285356]
142
+ - @mastra/core@0.2.0-alpha.93
143
+
144
+ ## 0.1.0-alpha.34
145
+
146
+ ### Patch Changes
147
+
148
+ - Updated dependencies [4d4f6b6]
149
+ - @mastra/core@0.2.0-alpha.92
150
+
151
+ ## 0.1.0-alpha.33
152
+
153
+ ### Patch Changes
154
+
155
+ - Updated dependencies [d7d465a]
156
+ - Updated dependencies [d7d465a]
157
+ - Updated dependencies [2017553]
158
+ - Updated dependencies [a10b7a3]
159
+ - Updated dependencies [16e5b04]
160
+ - @mastra/core@0.2.0-alpha.91
161
+
162
+ ## 0.1.0-alpha.32
163
+
164
+ ### Patch Changes
165
+
166
+ - Updated dependencies [8151f44]
167
+ - Updated dependencies [e897f1c]
168
+ - Updated dependencies [3700be1]
169
+ - @mastra/core@0.2.0-alpha.90
170
+
171
+ ## 0.1.0-alpha.31
172
+
173
+ ### Patch Changes
174
+
175
+ - Updated dependencies [27275c9]
176
+ - @mastra/core@0.2.0-alpha.89
177
+
178
+ ## 0.1.0-alpha.30
179
+
180
+ ### Patch Changes
181
+
182
+ - Updated dependencies [ccbc581]
183
+ - @mastra/core@0.2.0-alpha.88
184
+
185
+ ## 0.1.0-alpha.29
186
+
187
+ ### Patch Changes
188
+
189
+ - Updated dependencies [7365b6c]
190
+ - @mastra/core@0.2.0-alpha.87
191
+
192
+ ## 0.1.0-alpha.28
193
+
194
+ ### Minor Changes
195
+
196
+ - 5916f9d: Update deps from fixed to ^
197
+
198
+ ### Patch Changes
199
+
200
+ - Updated dependencies [6fa4bd2]
201
+ - Updated dependencies [e2e76de]
202
+ - Updated dependencies [7f24c29]
203
+ - Updated dependencies [67637ba]
204
+ - Updated dependencies [04f3171]
205
+ - @mastra/core@0.2.0-alpha.86
206
+
207
+ ## 0.1.0-alpha.27
208
+
209
+ ### Patch Changes
210
+
211
+ - Updated dependencies [e9d1b47]
212
+ - @mastra/core@0.2.0-alpha.85
213
+
214
+ ## 0.1.0-alpha.26
215
+
216
+ ### Patch Changes
217
+
218
+ - Updated dependencies [2f17a5f]
219
+ - Updated dependencies [cb290ee]
220
+ - Updated dependencies [b4d7416]
221
+ - Updated dependencies [38b7f66]
222
+ - @mastra/core@0.2.0-alpha.84
223
+
224
+ ## 0.1.0-alpha.25
225
+
226
+ ### Patch Changes
227
+
228
+ - 9625602: Use mastra core split bundles in other packages
229
+ - 8769a62: Split core into separate entry files
230
+ - Updated dependencies [30322ce]
231
+ - Updated dependencies [78eec7c]
232
+ - Updated dependencies [9625602]
233
+ - Updated dependencies [8769a62]
234
+ - @mastra/core@0.2.0-alpha.83
235
+
236
+ ## 0.1.0-alpha.24
237
+
238
+ ### Patch Changes
239
+
240
+ - Updated dependencies [73d112c]
241
+ - @mastra/core@0.1.27-alpha.82
242
+
243
+ ## 0.1.0-alpha.23
244
+
245
+ ### Patch Changes
246
+
247
+ - Updated dependencies [9fb3039]
248
+ - @mastra/core@0.1.27-alpha.81
249
+
250
+ ## 0.1.0-alpha.22
251
+
252
+ ### Patch Changes
253
+
254
+ - cb2e997: Bundle evals package with tsup
255
+
256
+ ## 0.1.0-alpha.21
257
+
258
+ ### Patch Changes
259
+
260
+ - Updated dependencies [327ece7]
261
+ - @mastra/core@0.1.27-alpha.80
262
+
263
+ ## 0.1.0-alpha.20
264
+
265
+ ### Patch Changes
266
+
267
+ - Updated dependencies [21fe536]
268
+ - @mastra/core@0.1.27-alpha.79
269
+
270
+ ## 0.1.0-alpha.19
271
+
272
+ ### Patch Changes
273
+
274
+ - Updated dependencies [685108a]
275
+ - Updated dependencies [685108a]
276
+ - @mastra/core@0.1.27-alpha.78
277
+
278
+ ## 0.1.0-alpha.18
279
+
280
+ ### Patch Changes
281
+
282
+ - Updated dependencies [8105fae]
283
+ - @mastra/core@0.1.27-alpha.77
284
+
285
+ ## 0.1.0-alpha.17
286
+
287
+ ### Patch Changes
288
+
289
+ - Updated dependencies [ae7bf94]
290
+ - Updated dependencies [ae7bf94]
291
+ - @mastra/core@0.1.27-alpha.76
292
+
293
+ ## 0.1.0-alpha.16
294
+
295
+ ### Patch Changes
296
+
297
+ - Updated dependencies [23dcb23]
298
+ - @mastra/core@0.1.27-alpha.75
299
+
300
+ ## 0.1.0-alpha.15
301
+
302
+ ### Patch Changes
303
+
304
+ - Updated dependencies [7b87567]
305
+ - @mastra/core@0.1.27-alpha.74
306
+
307
+ ## 0.1.0-alpha.14
308
+
309
+ ### Patch Changes
310
+
311
+ - Updated dependencies [3427b95]
312
+ - @mastra/core@0.1.27-alpha.73
313
+
314
+ ## 0.1.0-alpha.13
315
+
316
+ ### Patch Changes
317
+
318
+ - 06b2c0a: Update summarization prompt and fix eval input
319
+ - Updated dependencies [e4d4ede]
320
+ - Updated dependencies [06b2c0a]
321
+ - @mastra/core@0.1.27-alpha.72
322
+
323
+ ## 0.1.0-alpha.12
324
+
325
+ ### Patch Changes
326
+
327
+ - Updated dependencies [d9c8dd0]
328
+ - @mastra/core@0.1.27-alpha.71
329
+
330
+ ## 0.1.0-alpha.11
331
+
332
+ ### Patch Changes
333
+
334
+ - bdaf834: publish packages
335
+
336
+ ## 0.1.0-alpha.10
337
+
338
+ ### Patch Changes
339
+
340
+ - Updated dependencies [dd6d87f]
341
+ - Updated dependencies [04434b6]
342
+ - @mastra/core@0.1.27-alpha.70
343
+
344
+ ## 0.1.0-alpha.9
345
+
346
+ ### Patch Changes
347
+
348
+ - 1944807: Unified logger and major step in better logs
349
+ - 9ade36e: Changed measure for evals, added endpoints, attached metrics to agent, added UI for evals in playground, and updated docs
350
+ - Updated dependencies [1944807]
351
+ - Updated dependencies [9ade36e]
352
+ - @mastra/core@0.1.27-alpha.69
353
+
354
+ ## 0.1.0-alpha.8
355
+
356
+ ### Patch Changes
357
+
358
+ - Updated dependencies [0be7181]
359
+ - Updated dependencies [0be7181]
360
+ - @mastra/core@0.1.27-alpha.68
361
+
362
+ ## 0.1.0-alpha.7
363
+
364
+ ### Patch Changes
365
+
366
+ - Updated dependencies [c8ff2f5]
367
+ - @mastra/core@0.1.27-alpha.67
368
+
369
+ ## 0.1.0-alpha.6
370
+
371
+ ### Patch Changes
372
+
373
+ - aea3c13: Fix evals export for llm and nlp
374
+
3
375
  ## 0.1.0-alpha.5
4
376
 
5
377
  ### Minor Changes
package/README.md ADDED
@@ -0,0 +1,185 @@
1
+ # @mastra/evals
2
+
3
+ A comprehensive evaluation framework for assessing AI model outputs across multiple dimensions.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @mastra/evals
9
+ ```
10
+
11
+ ## Overview
12
+
13
+ `@mastra/evals` provides a suite of evaluation metrics for assessing AI model outputs. The package includes both LLM-based and NLP-based metrics, enabling both automated and model-assisted evaluation of AI responses.
14
+
15
+ ## Features
16
+
17
+ ### LLM-Based Metrics
18
+
19
+ 1. **Answer Relevancy**
20
+
21
+ - Evaluates how well an answer addresses the input question
22
+ - Considers uncertainty weighting for more nuanced scoring
23
+ - Returns detailed reasoning for scores
24
+
25
+ 2. **Bias Detection**
26
+
27
+ - Identifies potential biases in model outputs
28
+ - Analyzes opinions and statements for bias indicators
29
+ - Provides explanations for detected biases
30
+ - Configurable scoring scale
31
+
32
+ 3. **Context Precision & Relevancy**
33
+
34
+ - Assesses how well responses use provided context
35
+ - Evaluates accuracy of context usage
36
+ - Measures relevance of context to the response
37
+ - Analyzes context positioning in responses
38
+
39
+ 4. **Faithfulness**
40
+
41
+ - Verifies that responses are faithful to provided context
42
+ - Detects hallucinations or fabricated information
43
+ - Evaluates claims against provided context
44
+ - Provides detailed analysis of faithfulness breaches
45
+
46
+ 5. **Prompt Alignment**
47
+
48
+ - Measures how well responses follow given instructions
49
+ - Evaluates adherence to multiple instruction criteria
50
+ - Provides per-instruction scoring
51
+ - Supports custom instruction sets
52
+
53
+ 6. **Toxicity**
54
+ - Detects toxic or harmful content in responses
55
+ - Provides detailed reasoning for toxicity verdicts
56
+ - Configurable scoring thresholds
57
+ - Considers both input and output context
58
+
59
+ ### NLP-Based Metrics
60
+
61
+ 1. **Completeness**
62
+
63
+ - Analyzes structural completeness of responses
64
+ - Identifies missing elements from input requirements
65
+ - Provides detailed element coverage analysis
66
+ - Tracks input-output element ratios
67
+
68
+ 2. **Content Similarity**
69
+
70
+ - Measures text similarity between inputs and outputs
71
+ - Configurable for case and whitespace sensitivity
72
+ - Returns normalized similarity scores
73
+ - Uses string comparison algorithms for accuracy
74
+
75
+ 3. **Keyword Coverage**
76
+ - Tracks presence of key terms from input in output
77
+ - Provides detailed keyword matching statistics
78
+ - Calculates coverage ratios
79
+ - Useful for ensuring comprehensive responses
80
+
81
+ ## Usage
82
+
83
+ ### Basic Example
84
+
85
+ ```typescript
86
+ import { ContentSimilarityMetric, ToxicityMetric } from '@mastra/evals';
87
+
88
+ // Initialize metrics
89
+ const similarityMetric = new ContentSimilarityMetric({
90
+ ignoreCase: true,
91
+ ignoreWhitespace: true,
92
+ });
93
+
94
+ const toxicityMetric = new ToxicityMetric({
95
+ model: openai('gpt-4'),
96
+ scale: 1, // Optional: adjust scoring scale
97
+ });
98
+
99
+ // Evaluate outputs
100
+ const input = 'What is the capital of France?';
101
+ const output = 'Paris is the capital of France.';
102
+
103
+ const similarityResult = await similarityMetric.measure(input, output);
104
+ const toxicityResult = await toxicityMetric.measure(input, output);
105
+
106
+ console.log('Similarity Score:', similarityResult.score);
107
+ console.log('Toxicity Score:', toxicityResult.score);
108
+ ```
109
+
110
+ ### Context-Aware Evaluation
111
+
112
+ ```typescript
113
+ import { FaithfulnessMetric } from '@mastra/evals';
114
+
115
+ // Initialize with context
116
+ const faithfulnessMetric = new FaithfulnessMetric({
117
+ model: openai('gpt-4'),
118
+ context: ['Paris is the capital of France', 'Paris has a population of 2.2 million'],
119
+ scale: 1,
120
+ });
121
+
122
+ // Evaluate response against context
123
+ const result = await faithfulnessMetric.measure(
124
+ 'Tell me about Paris',
125
+ 'Paris is the capital of France with 2.2 million residents',
126
+ );
127
+
128
+ console.log('Faithfulness Score:', result.score);
129
+ console.log('Reasoning:', result.reason);
130
+ ```
131
+
132
+ ## Metric Results
133
+
134
+ Each metric returns a standardized result object containing:
135
+
136
+ - `score`: Normalized score (typically 0-1)
137
+ - `info`: Detailed information about the evaluation
138
+ - Additional metric-specific data (e.g., matched keywords, missing elements)
139
+
140
+ Some metrics also provide:
141
+
142
+ - `reason`: Detailed explanation of the score
143
+ - `verdicts`: Individual judgments that contributed to the final score
144
+
145
+ ## Telemetry and Logging
146
+
147
+ The package includes built-in telemetry and logging capabilities:
148
+
149
+ - Automatic evaluation tracking through Mastra Storage
150
+ - Integration with OpenTelemetry for performance monitoring
151
+ - Detailed evaluation traces for debugging
152
+
153
+ ```typescript
154
+ import { attachListeners } from '@mastra/evals';
155
+
156
+ // Enable basic evaluation tracking
157
+ await attachListeners();
158
+
159
+ // Store evals in Mastra Storage (if storage is enabled)
160
+ await attachListeners(mastra);
161
+ // Note: When using in-memory storage, evaluations are isolated to the test process.
162
+ // When using file storage, evaluations are persisted and can be queried later.
163
+ ```
164
+
165
+ ## Environment Variables
166
+
167
+ Required for LLM-based metrics:
168
+
169
+ - `OPENAI_API_KEY`: For OpenAI model access
170
+ - Additional provider keys as needed (Cohere, Anthropic, etc.)
171
+
172
+ ## Package Exports
173
+
174
+ ```typescript
175
+ // Main package exports
176
+ import { evaluate } from '@mastra/evals';
177
+ // NLP-specific metrics
178
+ import { ContentSimilarityMetric } from '@mastra/evals/nlp';
179
+ ```
180
+
181
+ ## Related Packages
182
+
183
+ - `@mastra/core`: Core framework functionality
184
+ - `@mastra/engine`: LLM execution engine
185
+ - `@mastra/mcp`: Model Context Protocol integration