@mastra/evals 1.2.4-alpha.0 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 1.2.4
4
+
5
+ ### Patch Changes
6
+
7
+ - Fixed the hallucination and tool-usage scorers returning incorrect scores when observable memory is enabled. These scorers now detect tool calls in every message format, so responses are no longer wrongly scored as fully hallucinated or as using zero tools. ([#17321](https://github.com/mastra-ai/mastra/pull/17321))
8
+
9
+ - Updated dependencies [[`fa63872`](https://github.com/mastra-ai/mastra/commit/fa6387280954e6b667bec5714b55ba082bc627ff), [`d779de3`](https://github.com/mastra-ai/mastra/commit/d779de3cd9d2e7ed8110547190e2f15e786a0e41), [`1750c97`](https://github.com/mastra-ai/mastra/commit/1750c975d6179fbf6db2813b15229d4f8f23fc55), [`9283971`](https://github.com/mastra-ai/mastra/commit/928397157009b4aef4d5fdf3a0a273cb371beb55), [`f07b646`](https://github.com/mastra-ai/mastra/commit/f07b64604ab7d25391179790b7fd4823df9e2dff), [`d8838ae`](https://github.com/mastra-ai/mastra/commit/d8838ae80b69780361693d27098f7f6684af12fe), [`40f9297`](https://github.com/mastra-ai/mastra/commit/40f9297003b921c62373d3e8d3a4bda76c9f6de3), [`19a8658`](https://github.com/mastra-ai/mastra/commit/19a86589c788ef48bb6c1b0612cc82a201857379), [`850af77`](https://github.com/mastra-ai/mastra/commit/850af7779cb87c350804488734544a5b1843de25), [`0f0d1ba`](https://github.com/mastra-ai/mastra/commit/0f0d1ba67bfcb2204e571401662f1eceefc03357), [`a18775a`](https://github.com/mastra-ai/mastra/commit/a18775a693172546ee2378d39b67d4e32895b251), [`1baf2d1`](https://github.com/mastra-ai/mastra/commit/1baf2d152c6881338ff8f114633d5316fe13dd15), [`8c31bcd`](https://github.com/mastra-ai/mastra/commit/8c31bcdb00e597880d5939b1b7d7566fbe5dacae), [`0e32507`](https://github.com/mastra-ai/mastra/commit/0e32507962cdfa5569b7bda5bc6fb3dd34e40b03), [`95b14cd`](https://github.com/mastra-ai/mastra/commit/95b14cdd820e86d97ac05fe568424c513a252e31), [`07c3de7`](https://github.com/mastra-ai/mastra/commit/07c3de7f7bc418beccaea3b5e6b7f7cdda79d492), [`0bf2d93`](https://github.com/mastra-ai/mastra/commit/0bf2d932d20e2936f2d9abb8c0a86e24fbc97ec6), [`7b0d34c`](https://github.com/mastra-ai/mastra/commit/7b0d34cfe4a2fce22ac86ae17404685ff67a2ddb), [`a659a77`](https://github.com/mastra-ai/mastra/commit/a659a779bdebe3a52a518c56d2260592d0240fe0), [`aa36be2`](https://github.com/mastra-ai/mastra/commit/aa36be23aa513b7dc53cb8ca16b7fab8f20e43ad), [`3332be9`](https://github.com/mastra-ai/mastra/commit/3332be9701ecd77aba840959d9a1d1ce7aef02d3), [`212c635`](https://github.com/mastra-ai/mastra/commit/212c635203e61d036ab41db8ff86c3893dc795b3), [`d8838ae`](https://github.com/mastra-ai/mastra/commit/d8838ae80b69780361693d27098f7f6684af12fe), [`9aa5a73`](https://github.com/mastra-ai/mastra/commit/9aa5a73e7e110f6e9365eec69364a33d5f03bb56), [`f73c789`](https://github.com/mastra-ai/mastra/commit/f73c789e8ef21561580395d2c410119cab5848c8), [`8bd16da`](https://github.com/mastra-ai/mastra/commit/8bd16da73a4cb874d739373643dbd6a6e7f88684), [`c8630f8`](https://github.com/mastra-ai/mastra/commit/c8630f80d4f40cb5d22e60ab162b618b1907167a), [`94dfef6`](https://github.com/mastra-ai/mastra/commit/94dfef6e2bf19a88467ea3940afcbce88a433f0f), [`47f71dc`](https://github.com/mastra-ai/mastra/commit/47f71dc6fbcbd12d71e21a979e676e20a02bd77d), [`50ceae2`](https://github.com/mastra-ai/mastra/commit/50ceae270878e2f8fb2b2c6c2faab09df0007c8a), [`a122f79`](https://github.com/mastra-ai/mastra/commit/a122f79427ae225ec79c7b2ed46278da48d04b17), [`8cdde58`](https://github.com/mastra-ai/mastra/commit/8cdde5875bbba6702d9df226f2b20232b8d75d6c), [`3a081c1`](https://github.com/mastra-ai/mastra/commit/3a081c1255c5ae8c99f6dad91cc612934ef6f2bd), [`49f8abc`](https://github.com/mastra-ai/mastra/commit/49f8abce8258e4f2f87bd326acfbdb641264a47c), [`847ff1e`](https://github.com/mastra-ai/mastra/commit/847ff1e0d94368d94b2e173e4e0908e115568ef3), [`0c1ed1d`](https://github.com/mastra-ai/mastra/commit/0c1ed1d00c7d87b5ac99ca95896211a2fa9189fa), [`259d409`](https://github.com/mastra-ai/mastra/commit/259d409a514174299dbde1ff5e1121209b3ba850), [`9e16c68`](https://github.com/mastra-ai/mastra/commit/9e16c6818b6485ccb43df28aba6f3a2219d28662), [`cefca33`](https://github.com/mastra-ai/mastra/commit/cefca33ae666e69810c935fedf95a929c173d1d7), [`d00e8c5`](https://github.com/mastra-ai/mastra/commit/d00e8c50daebe5bce5bf2f48bde39c86fc3d2fe4), [`36fa7e2`](https://github.com/mastra-ai/mastra/commit/36fa7e24d14e58a1eb46147097b32f583e5b8775), [`87e9774`](https://github.com/mastra-ai/mastra/commit/87e97741c1e493cd6d62f478eb810b49bda4d57c), [`65a72e7`](https://github.com/mastra-ai/mastra/commit/65a72e70c25eedea8ff985a6624b96be2850236b), [`fe9eacd`](https://github.com/mastra-ai/mastra/commit/fe9eacd9545a0a9d64aad31c9fa90294a425289e), [`4c02027`](https://github.com/mastra-ai/mastra/commit/4c020277235eaa6b1dc957c90ad0639eef213992), [`0f77241`](https://github.com/mastra-ai/mastra/commit/0f7724108806703799a8ba80ad0f09414afd5066), [`849efb9`](https://github.com/mastra-ai/mastra/commit/849efb9fca6dc976589c1f90a303fea618769109), [`92ff509`](https://github.com/mastra-ai/mastra/commit/92ff5098ef8a990438ca038077021a5f7541ec1d), [`3fce5e7`](https://github.com/mastra-ai/mastra/commit/3fce5e70d011d289043e75003ef3336ed4aa43c3), [`a763592`](https://github.com/mastra-ai/mastra/commit/a763592c3db46963ef1011cfe16fe372816e775e), [`db79c86`](https://github.com/mastra-ai/mastra/commit/db79c86c60723d57e02f9636ca2611bd4515f194), [`6855012`](https://github.com/mastra-ai/mastra/commit/685501247cc4717506f3e89beed03509d63a5370), [`80c7737`](https://github.com/mastra-ai/mastra/commit/80c7737e32d7917b5f356957d67c169d01744fd3), [`7fef31c`](https://github.com/mastra-ai/mastra/commit/7fef31c0d2a6d362a43a647a8a4f6ab893758a23), [`7fef31c`](https://github.com/mastra-ai/mastra/commit/7fef31c0d2a6d362a43a647a8a4f6ab893758a23), [`3f1cf47`](https://github.com/mastra-ai/mastra/commit/3f1cf476f74c1e4cc2df908837e05853a5347e31)]:
10
+ - @mastra/core@1.38.0
11
+
3
12
  ## 1.2.4-alpha.0
4
13
 
5
14
  ### Patch Changes
@@ -3,7 +3,7 @@ name: mastra-evals
3
3
  description: Documentation for @mastra/evals. Use when working with @mastra/evals APIs, configuration, or implementation.
4
4
  metadata:
5
5
  package: "@mastra/evals"
6
- version: "1.2.4-alpha.0"
6
+ version: "1.2.4"
7
7
  ---
8
8
 
9
9
  ## When to use
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "1.2.4-alpha.0",
2
+ "version": "1.2.4",
3
3
  "package": "@mastra/evals",
4
4
  "exports": {},
5
5
  "modules": {}
@@ -57,11 +57,11 @@ import { createAnswerRelevancyScorer, createToxicityScorer } from '@mastra/evals
57
57
  export const evaluatedAgent = new Agent({
58
58
  scorers: {
59
59
  relevancy: {
60
- scorer: createAnswerRelevancyScorer({ model: '__OPENAI_MODEL_MINI__' }),
60
+ scorer: createAnswerRelevancyScorer({ model: 'openai/gpt-5-mini' }),
61
61
  sampling: { type: 'ratio', rate: 0.5 },
62
62
  },
63
63
  safety: {
64
- scorer: createToxicityScorer({ model: '__OPENAI_MODEL_MINI__' }),
64
+ scorer: createToxicityScorer({ model: 'openai/gpt-5-mini' }),
65
65
  sampling: { type: 'ratio', rate: 1 },
66
66
  },
67
67
  },
@@ -69,7 +69,7 @@ import { runEvals } from '@mastra/core/evals'
69
69
  import { createAnswerRelevancyScorer } from '@mastra/evals/scorers/prebuilt'
70
70
  import { myAgent } from './agent'
71
71
 
72
- const scorer = createAnswerRelevancyScorer({ model: 'openai/gpt-5.4' })
72
+ const scorer = createAnswerRelevancyScorer({ model: 'openai/gpt-5.5' })
73
73
 
74
74
  const result = await runEvals({
75
75
  data: [
@@ -62,7 +62,7 @@ import { runEvals } from '@mastra/core/evals'
62
62
  import { createAnswerSimilarityScorer } from '@mastra/evals/scorers/prebuilt'
63
63
  import { myAgent } from './agent'
64
64
 
65
- const scorer = createAnswerSimilarityScorer({ model: 'openai/gpt-5.4' })
65
+ const scorer = createAnswerSimilarityScorer({ model: 'openai/gpt-5.5' })
66
66
 
67
67
  const result = await runEvals({
68
68
  data: [
@@ -82,7 +82,7 @@ import { runEvals } from '@mastra/core/evals'
82
82
  import { createBiasScorer } from '@mastra/evals/scorers/prebuilt'
83
83
  import { myAgent } from './agent'
84
84
 
85
- const scorer = createBiasScorer({ model: 'openai/gpt-5.4' })
85
+ const scorer = createBiasScorer({ model: 'openai/gpt-5.5' })
86
86
 
87
87
  const result = await runEvals({
88
88
  data: [
@@ -98,7 +98,7 @@ MAP = (1.0 + 0.67) / 2 = 0.835 ≈ **0.83**
98
98
 
99
99
  ```typescript
100
100
  const scorer = createContextPrecisionScorer({
101
- model: 'openai/gpt-5.4',
101
+ model: 'openai/gpt-5.5',
102
102
  options: {
103
103
  contextExtractor: (input, output) => {
104
104
  // Extract context dynamically based on the query
@@ -117,7 +117,7 @@ const scorer = createContextPrecisionScorer({
117
117
 
118
118
  ```typescript
119
119
  const scorer = createContextPrecisionScorer({
120
- model: 'openai/gpt-5.4',
120
+ model: 'openai/gpt-5.5',
121
121
  options: {
122
122
  context: [
123
123
  // Simulate retrieved documents from vector database
@@ -142,7 +142,7 @@ import { createContextPrecisionScorer } from '@mastra/evals/scorers/prebuilt'
142
142
  import { myAgent } from './agent'
143
143
 
144
144
  const scorer = createContextPrecisionScorer({
145
- model: 'openai/gpt-5.4',
145
+ model: 'openai/gpt-5.5',
146
146
  options: {
147
147
  contextExtractor: (input, output) => {
148
148
  // Extract context from agent's retrieved documents
@@ -119,7 +119,7 @@ import { createContextRelevanceScorerLLM } from '@mastra/evals'
119
119
 
120
120
  // Stricter penalty configuration
121
121
  const strictScorer = createContextRelevanceScorerLLM({
122
- model: 'openai/gpt-5.4',
122
+ model: 'openai/gpt-5.5',
123
123
  options: {
124
124
  context: [
125
125
  'Einstein won the Nobel Prize for photoelectric effect',
@@ -137,7 +137,7 @@ const strictScorer = createContextRelevanceScorerLLM({
137
137
 
138
138
  // Lenient penalty configuration
139
139
  const lenientScorer = createContextRelevanceScorerLLM({
140
- model: 'openai/gpt-5.4',
140
+ model: 'openai/gpt-5.5',
141
141
  options: {
142
142
  context: [
143
143
  'Einstein won the Nobel Prize for photoelectric effect',
@@ -183,7 +183,7 @@ console.log('Lenient penalties:', lenientResult.score) // Higher score, less pen
183
183
 
184
184
  ```typescript
185
185
  const scorer = createContextRelevanceScorerLLM({
186
- model: 'openai/gpt-5.4',
186
+ model: 'openai/gpt-5.5',
187
187
  options: {
188
188
  contextExtractor: (input, output) => {
189
189
  // Extract context based on the query
@@ -207,7 +207,7 @@ const scorer = createContextRelevanceScorerLLM({
207
207
 
208
208
  ```typescript
209
209
  const scorer = createContextRelevanceScorerLLM({
210
- model: 'openai/gpt-5.4',
210
+ model: 'openai/gpt-5.5',
211
211
  options: {
212
212
  context: ['Relevant information...', 'Supporting details...'],
213
213
  scale: 100, // Scale scores from 0-100 instead of 0-1
@@ -221,7 +221,7 @@ const scorer = createContextRelevanceScorerLLM({
221
221
 
222
222
  ```typescript
223
223
  const scorer = createContextRelevanceScorerLLM({
224
- model: 'openai/gpt-5.4',
224
+ model: 'openai/gpt-5.5',
225
225
  options: {
226
226
  contextExtractor: (input, output) => {
227
227
  const query = input?.inputMessages?.[0]?.content || ''
@@ -248,7 +248,7 @@ This example shows excellent context relevance where all context directly suppor
248
248
  import { createContextRelevanceScorerLLM } from '@mastra/evals'
249
249
 
250
250
  const scorer = createContextRelevanceScorerLLM({
251
- model: 'openai/gpt-5.4',
251
+ model: 'openai/gpt-5.5',
252
252
  options: {
253
253
  context: [
254
254
  'Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.',
@@ -295,7 +295,7 @@ This example shows moderate relevance with some context being irrelevant or unus
295
295
  import { createContextRelevanceScorerLLM } from '@mastra/evals'
296
296
 
297
297
  const scorer = createContextRelevanceScorerLLM({
298
- model: 'openai/gpt-5.4',
298
+ model: 'openai/gpt-5.5',
299
299
  options: {
300
300
  context: [
301
301
  'Solar eclipses occur when the Moon blocks the Sun.',
@@ -337,7 +337,7 @@ console.log(result)
337
337
 
338
338
  // With custom penalty configuration
339
339
  const customScorer = createContextRelevanceScorerLLM({
340
- model: 'openai/gpt-5.4',
340
+ model: 'openai/gpt-5.5',
341
341
  options: {
342
342
  context: [
343
343
  'Solar eclipses occur when the Moon blocks the Sun.',
@@ -384,7 +384,7 @@ This example shows poor context relevance with mostly irrelevant information:
384
384
  import { createContextRelevanceScorerLLM } from '@mastra/evals'
385
385
 
386
386
  const scorer = createContextRelevanceScorerLLM({
387
- model: 'openai/gpt-5.4',
387
+ model: 'openai/gpt-5.5',
388
388
  options: {
389
389
  context: [
390
390
  'The Great Barrier Reef is located in Australia.',
@@ -432,7 +432,7 @@ Extract context dynamically based on the run input:
432
432
  import { createContextRelevanceScorerLLM } from '@mastra/evals'
433
433
 
434
434
  const scorer = createContextRelevanceScorerLLM({
435
- model: 'openai/gpt-5.4',
435
+ model: 'openai/gpt-5.5',
436
436
  options: {
437
437
  contextExtractor: (input, output) => {
438
438
  // Extract query from input
@@ -475,7 +475,7 @@ Integrate with RAG pipelines to evaluate retrieved context:
475
475
  import { createContextRelevanceScorerLLM } from '@mastra/evals'
476
476
 
477
477
  const scorer = createContextRelevanceScorerLLM({
478
- model: 'openai/gpt-5.4',
478
+ model: 'openai/gpt-5.5',
479
479
  options: {
480
480
  contextExtractor: (input, output) => {
481
481
  // Extract from RAG retrieval results
@@ -79,7 +79,7 @@ import { myAgent } from './agent'
79
79
 
80
80
  // Context is typically populated from agent tool calls or RAG retrieval
81
81
  const scorer = createFaithfulnessScorer({
82
- model: 'openai/gpt-5.4',
82
+ model: 'openai/gpt-5.5',
83
83
  })
84
84
 
85
85
  const result = await runEvals({
@@ -104,7 +104,7 @@ Use static context when you have known ground truth to compare against:
104
104
  import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
105
105
 
106
106
  const scorer = createHallucinationScorer({
107
- model: 'openai/gpt-5.4',
107
+ model: 'openai/gpt-5.5',
108
108
  options: {
109
109
  context: [
110
110
  'The first iPhone was announced on January 9, 2007.',
@@ -124,7 +124,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
124
124
  import { extractToolResults } from '@mastra/evals/scorers'
125
125
 
126
126
  const scorer = createHallucinationScorer({
127
- model: 'openai/gpt-5.4',
127
+ model: 'openai/gpt-5.5',
128
128
  options: {
129
129
  getContext: ({ run, step }) => {
130
130
  // Extract tool results as context
@@ -145,7 +145,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
145
145
  import { extractToolResults } from '@mastra/evals/scorers'
146
146
 
147
147
  const hallucinationScorer = createHallucinationScorer({
148
- model: 'openai/gpt-5.4',
148
+ model: 'openai/gpt-5.5',
149
149
  options: {
150
150
  getContext: ({ run }) => {
151
151
  const toolResults = extractToolResults(run.output)
@@ -156,7 +156,7 @@ const hallucinationScorer = createHallucinationScorer({
156
156
 
157
157
  const agent = new Agent({
158
158
  name: 'my-agent',
159
- model: 'openai/gpt-5.4',
159
+ model: 'openai/gpt-5.5',
160
160
  instructions: 'You are a helpful assistant.',
161
161
  evals: {
162
162
  scorers: [hallucinationScorer],
@@ -172,7 +172,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
172
172
  import { myAgent } from './agent'
173
173
 
174
174
  const scorer = createHallucinationScorer({
175
- model: 'openai/gpt-5.4',
175
+ model: 'openai/gpt-5.5',
176
176
  options: {
177
177
  context: ['Known fact 1', 'Known fact 2'],
178
178
  },
@@ -61,7 +61,7 @@ describe('Agent Noise Resistance Tests', () => {
61
61
 
62
62
  // Step 4: Evaluate using noise sensitivity scorer
63
63
  const scorer = createNoiseSensitivityScorerLLM({
64
- model: 'openai/gpt-5.4',
64
+ model: 'openai/gpt-5.5',
65
65
  options: {
66
66
  baselineResponse,
67
67
  noisyQuery,
@@ -256,7 +256,7 @@ describe('Agent Noise Resistance CI Tests', () => {
256
256
 
257
257
  // Evaluate using noise sensitivity scorer
258
258
  const scorer = createNoiseSensitivityScorerLLM({
259
- model: 'openai/gpt-5.4',
259
+ model: 'openai/gpt-5.5',
260
260
  options: {
261
261
  baselineResponse: testCase.baselineResponse,
262
262
  noisyQuery: testCase.noisyQuery,
@@ -291,7 +291,7 @@ This example shows an agent that completely resists misinformation in a test sce
291
291
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
292
292
 
293
293
  const scorer = createNoiseSensitivityScorerLLM({
294
- model: 'openai/gpt-5.4',
294
+ model: 'openai/gpt-5.5',
295
295
  options: {
296
296
  baselineResponse:
297
297
  'Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing.',
@@ -337,7 +337,7 @@ This example shows an agent partially distracted by irrelevant requests:
337
337
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'
338
338
 
339
339
  const scorer = createNoiseSensitivityScorerLLM({
340
- model: 'openai/gpt-5.4',
340
+ model: 'openai/gpt-5.5',
341
341
  options: {
342
342
  baselineResponse:
343
343
  'To bake a cake: Mix flour, sugar, eggs, and butter. Bake at 350°F for 30 minutes.',
@@ -382,7 +382,7 @@ This example shows an agent that incorporates misinformation:
382
382
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
383
383
 
384
384
  const scorer = createNoiseSensitivityScorerLLM({
385
- model: 'openai/gpt-5.4',
385
+ model: 'openai/gpt-5.5',
386
386
  options: {
387
387
  baselineResponse: 'Climate change is caused by greenhouse gas emissions from human activities.',
388
388
  noisyQuery:
@@ -428,7 +428,7 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
428
428
 
429
429
  // Lenient scoring - more forgiving of minor issues
430
430
  const lenientScorer = createNoiseSensitivityScorerLLM({
431
- model: 'openai/gpt-5.4',
431
+ model: 'openai/gpt-5.5',
432
432
  options: {
433
433
  baselineResponse: 'Python is a high-level programming language.',
434
434
  noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -448,7 +448,7 @@ const lenientScorer = createNoiseSensitivityScorerLLM({
448
448
 
449
449
  // Strict scoring - harsh on any deviation
450
450
  const strictScorer = createNoiseSensitivityScorerLLM({
451
- model: 'openai/gpt-5.4',
451
+ model: 'openai/gpt-5.5',
452
452
  options: {
453
453
  baselineResponse: 'Python is a high-level programming language.',
454
454
  noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -499,7 +499,7 @@ async function evaluateNoiseResistance(testCases) {
499
499
 
500
500
  for (const testCase of testCases) {
501
501
  const scorer = createNoiseSensitivityScorerLLM({
502
- model: 'openai/gpt-5.4',
502
+ model: 'openai/gpt-5.5',
503
503
  options: {
504
504
  baselineResponse: testCase.baseline,
505
505
  noisyQuery: testCase.noisyQuery,
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
546
546
 
547
547
  async function compareModelRobustness() {
548
548
  const models = [
549
- { name: 'GPT-5.4', model: 'openai/gpt-5.4' },
549
+ { name: 'GPT-5.4', model: 'openai/gpt-5.5' },
550
550
  { name: 'GPT-5.4-mini', model: 'openai/gpt-5-mini' },
551
- { name: 'Claude', model: 'anthropic/claude-opus-4-6' },
551
+ { name: 'Claude', model: 'anthropic/claude-opus-4-7' },
552
552
  ]
553
553
 
554
554
  const testScenario = {
@@ -598,7 +598,7 @@ Include noise sensitivity tests in your security test suite to validate prompt i
598
598
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
599
599
 
600
600
  const scorer = createNoiseSensitivityScorerLLM({
601
- model: 'openai/gpt-5.4',
601
+ model: 'openai/gpt-5.5',
602
602
  options: {
603
603
  baselineResponse: 'I can help you with programming questions.',
604
604
  noisyQuery:
@@ -60,7 +60,7 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
60
60
 
61
61
  ```typescript
62
62
  const scorer = createPromptAlignmentScorerLLM({
63
- model: 'openai/gpt-5.4',
63
+ model: 'openai/gpt-5.5',
64
64
  options: {
65
65
  scale: 10, // Score from 0-10 instead of 0-1
66
66
  evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -221,24 +221,24 @@ Measure how well your AI agents follow user instructions:
221
221
  const agent = new Agent({
222
222
  name: 'CodingAssistant',
223
223
  instructions: 'You are a helpful coding assistant. Always provide working code examples.',
224
- model: 'openai/gpt-5.4',
224
+ model: 'openai/gpt-5.5',
225
225
  })
226
226
 
227
227
  // Evaluate comprehensive alignment (default)
228
228
  const scorer = createPromptAlignmentScorerLLM({
229
- model: 'openai/gpt-5.4',
229
+ model: 'openai/gpt-5.5',
230
230
  options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
231
231
  })
232
232
 
233
233
  // Evaluate just user satisfaction
234
234
  const userScorer = createPromptAlignmentScorerLLM({
235
- model: 'openai/gpt-5.4',
235
+ model: 'openai/gpt-5.5',
236
236
  options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
237
237
  })
238
238
 
239
239
  // Evaluate system compliance
240
240
  const systemScorer = createPromptAlignmentScorerLLM({
241
- model: 'openai/gpt-5.4',
241
+ model: 'openai/gpt-5.5',
242
242
  options: { evaluationMode: 'system' }, // Check adherence to system instructions
243
243
  })
244
244
 
@@ -290,7 +290,7 @@ for (const agent of agents) {
290
290
  import { createPromptAlignmentScorerLLM } from '@mastra/evals'
291
291
 
292
292
  const scorer = createPromptAlignmentScorerLLM({
293
- model: 'openai/gpt-5.4',
293
+ model: 'openai/gpt-5.5',
294
294
  })
295
295
 
296
296
  // Evaluate a code generation task
@@ -319,7 +319,7 @@ const result = await scorer.run({
319
319
  ```typescript
320
320
  // Configure scale and evaluation mode
321
321
  const scorer = createPromptAlignmentScorerLLM({
322
- model: 'openai/gpt-5.4',
322
+ model: 'openai/gpt-5.5',
323
323
  options: {
324
324
  scale: 10, // Score from 0-10 instead of 0-1
325
325
  evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -328,13 +328,13 @@ const scorer = createPromptAlignmentScorerLLM({
328
328
 
329
329
  // User-only evaluation - focus on user satisfaction
330
330
  const userScorer = createPromptAlignmentScorerLLM({
331
- model: 'openai/gpt-5.4',
331
+ model: 'openai/gpt-5.5',
332
332
  options: { evaluationMode: 'user' },
333
333
  })
334
334
 
335
335
  // System-only evaluation - focus on compliance
336
336
  const systemScorer = createPromptAlignmentScorerLLM({
337
- model: 'openai/gpt-5.4',
337
+ model: 'openai/gpt-5.5',
338
338
  options: { evaluationMode: 'system' },
339
339
  })
340
340
 
@@ -369,7 +369,7 @@ In this example, the response fully addresses the user's prompt with all require
369
369
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
370
370
 
371
371
  const scorer = createPromptAlignmentScorerLLM({
372
- model: 'openai/gpt-5.4',
372
+ model: 'openai/gpt-5.5',
373
373
  })
374
374
 
375
375
  const inputMessages = [
@@ -417,7 +417,7 @@ In this example, the response addresses the core intent but misses some requirem
417
417
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
418
418
 
419
419
  const scorer = createPromptAlignmentScorerLLM({
420
- model: 'openai/gpt-5.4',
420
+ model: 'openai/gpt-5.5',
421
421
  })
422
422
 
423
423
  const inputMessages = [
@@ -458,7 +458,7 @@ In this example, the response fails to address the user's specific requirements.
458
458
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
459
459
 
460
460
  const scorer = createPromptAlignmentScorerLLM({
461
- model: 'openai/gpt-5.4',
461
+ model: 'openai/gpt-5.5',
462
462
  })
463
463
 
464
464
  const inputMessages = [
@@ -502,7 +502,7 @@ Evaluates how well the response addresses the user's request, ignoring system in
502
502
 
503
503
  ```typescript
504
504
  const scorer = createPromptAlignmentScorerLLM({
505
- model: 'openai/gpt-5.4',
505
+ model: 'openai/gpt-5.5',
506
506
  options: { evaluationMode: 'user' },
507
507
  })
508
508
 
@@ -534,7 +534,7 @@ Evaluates compliance with system behavioral guidelines and constraints:
534
534
 
535
535
  ```typescript
536
536
  const scorer = createPromptAlignmentScorerLLM({
537
- model: 'openai/gpt-5.4',
537
+ model: 'openai/gpt-5.5',
538
538
  options: { evaluationMode: 'system' },
539
539
  })
540
540
 
@@ -566,7 +566,7 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
566
566
 
567
567
  ```typescript
568
568
  const scorer = createPromptAlignmentScorerLLM({
569
- model: 'openai/gpt-5.4',
569
+ model: 'openai/gpt-5.5',
570
570
  options: { evaluationMode: 'both' }, // This is the default
571
571
  })
572
572
 
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
309
309
  ```typescript
310
310
  // Basic configuration
311
311
  const basicLLMScorer = createLLMScorer({
312
- model: 'openai/gpt-5.4',
312
+ model: 'openai/gpt-5.5',
313
313
  availableTools: [
314
314
  { name: 'tool1', description: 'Description 1' },
315
315
  { name: 'tool2', description: 'Description 2' }
@@ -349,7 +349,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
349
349
 
350
350
  ```typescript
351
351
  const llmScorer = createToolCallAccuracyScorerLLM({
352
- model: 'openai/gpt-5.4',
352
+ model: 'openai/gpt-5.5',
353
353
  availableTools: [
354
354
  {
355
355
  name: 'weather-tool',
@@ -482,7 +482,7 @@ const codeScorer = createCodeScorer({
482
482
  })
483
483
 
484
484
  const llmScorer = createLLMScorer({
485
- model: 'openai/gpt-5.4',
485
+ model: 'openai/gpt-5.5',
486
486
  availableTools: [
487
487
  { name: 'weather-tool', description: 'Get weather information' },
488
488
  { name: 'search-tool', description: 'Search the web' },
@@ -86,7 +86,7 @@ import { runEvals } from '@mastra/core/evals'
86
86
  import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
87
87
  import { myAgent } from './agent'
88
88
 
89
- const scorer = createToxicityScorer({ model: 'openai/gpt-5.4' })
89
+ const scorer = createToxicityScorer({ model: 'openai/gpt-5.5' })
90
90
 
91
91
  const result = await runEvals({
92
92
  data: [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "1.2.4-alpha.0",
3
+ "version": "1.2.4",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "files": [
@@ -75,12 +75,12 @@
75
75
  "typescript": "^6.0.3",
76
76
  "vitest": "4.1.5",
77
77
  "zod": "^4.4.3",
78
- "@internal/ai-sdk-v5": "0.0.46",
79
- "@internal/lint": "0.0.99",
80
- "@internal/llm-recorder": "0.0.35",
81
- "@internal/types-builder": "0.0.74",
82
- "@internal/test-utils": "0.0.35",
83
- "@mastra/core": "1.38.0-alpha.3"
78
+ "@internal/ai-sdk-v5": "0.0.47",
79
+ "@internal/llm-recorder": "0.0.36",
80
+ "@internal/test-utils": "0.0.36",
81
+ "@internal/lint": "0.0.100",
82
+ "@internal/types-builder": "0.0.75",
83
+ "@mastra/core": "1.38.0"
84
84
  },
85
85
  "engines": {
86
86
  "node": ">=22.13.0"