@mastra/evals 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/{chunk-33T2SZZ2.cjs → chunk-BULMCHKJ.cjs} +20 -16
- package/dist/chunk-BULMCHKJ.cjs.map +1 -0
- package/dist/{chunk-ZRHCSFKL.js → chunk-XOXUFZEG.js} +20 -16
- package/dist/chunk-XOXUFZEG.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-overview.md +2 -2
- package/dist/docs/references/reference-evals-answer-relevancy.md +1 -1
- package/dist/docs/references/reference-evals-answer-similarity.md +1 -1
- package/dist/docs/references/reference-evals-bias.md +1 -1
- package/dist/docs/references/reference-evals-context-precision.md +3 -3
- package/dist/docs/references/reference-evals-context-relevance.md +11 -11
- package/dist/docs/references/reference-evals-faithfulness.md +1 -1
- package/dist/docs/references/reference-evals-hallucination.md +5 -5
- package/dist/docs/references/reference-evals-noise-sensitivity.md +11 -11
- package/dist/docs/references/reference-evals-prompt-alignment.md +15 -15
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +3 -3
- package/dist/docs/references/reference-evals-toxicity.md +1 -1
- package/dist/scorers/prebuilt/index.cjs +74 -74
- package/dist/scorers/prebuilt/index.js +1 -1
- package/dist/scorers/utils.cjs +25 -25
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +7 -7
- package/dist/chunk-33T2SZZ2.cjs.map +0 -1
- package/dist/chunk-ZRHCSFKL.js.map +0 -1
|
@@ -98,7 +98,7 @@ MAP = (1.0 + 0.67) / 2 = 0.835 ≈ **0.83**
|
|
|
98
98
|
|
|
99
99
|
```typescript
|
|
100
100
|
const scorer = createContextPrecisionScorer({
|
|
101
|
-
model: 'openai/gpt-5.
|
|
101
|
+
model: 'openai/gpt-5.5',
|
|
102
102
|
options: {
|
|
103
103
|
contextExtractor: (input, output) => {
|
|
104
104
|
// Extract context dynamically based on the query
|
|
@@ -117,7 +117,7 @@ const scorer = createContextPrecisionScorer({
|
|
|
117
117
|
|
|
118
118
|
```typescript
|
|
119
119
|
const scorer = createContextPrecisionScorer({
|
|
120
|
-
model: 'openai/gpt-5.
|
|
120
|
+
model: 'openai/gpt-5.5',
|
|
121
121
|
options: {
|
|
122
122
|
context: [
|
|
123
123
|
// Simulate retrieved documents from vector database
|
|
@@ -142,7 +142,7 @@ import { createContextPrecisionScorer } from '@mastra/evals/scorers/prebuilt'
|
|
|
142
142
|
import { myAgent } from './agent'
|
|
143
143
|
|
|
144
144
|
const scorer = createContextPrecisionScorer({
|
|
145
|
-
model: 'openai/gpt-5.
|
|
145
|
+
model: 'openai/gpt-5.5',
|
|
146
146
|
options: {
|
|
147
147
|
contextExtractor: (input, output) => {
|
|
148
148
|
// Extract context from agent's retrieved documents
|
|
@@ -119,7 +119,7 @@ import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
|
119
119
|
|
|
120
120
|
// Stricter penalty configuration
|
|
121
121
|
const strictScorer = createContextRelevanceScorerLLM({
|
|
122
|
-
model: 'openai/gpt-5.
|
|
122
|
+
model: 'openai/gpt-5.5',
|
|
123
123
|
options: {
|
|
124
124
|
context: [
|
|
125
125
|
'Einstein won the Nobel Prize for photoelectric effect',
|
|
@@ -137,7 +137,7 @@ const strictScorer = createContextRelevanceScorerLLM({
|
|
|
137
137
|
|
|
138
138
|
// Lenient penalty configuration
|
|
139
139
|
const lenientScorer = createContextRelevanceScorerLLM({
|
|
140
|
-
model: 'openai/gpt-5.
|
|
140
|
+
model: 'openai/gpt-5.5',
|
|
141
141
|
options: {
|
|
142
142
|
context: [
|
|
143
143
|
'Einstein won the Nobel Prize for photoelectric effect',
|
|
@@ -183,7 +183,7 @@ console.log('Lenient penalties:', lenientResult.score) // Higher score, less pen
|
|
|
183
183
|
|
|
184
184
|
```typescript
|
|
185
185
|
const scorer = createContextRelevanceScorerLLM({
|
|
186
|
-
model: 'openai/gpt-5.
|
|
186
|
+
model: 'openai/gpt-5.5',
|
|
187
187
|
options: {
|
|
188
188
|
contextExtractor: (input, output) => {
|
|
189
189
|
// Extract context based on the query
|
|
@@ -207,7 +207,7 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
207
207
|
|
|
208
208
|
```typescript
|
|
209
209
|
const scorer = createContextRelevanceScorerLLM({
|
|
210
|
-
model: 'openai/gpt-5.
|
|
210
|
+
model: 'openai/gpt-5.5',
|
|
211
211
|
options: {
|
|
212
212
|
context: ['Relevant information...', 'Supporting details...'],
|
|
213
213
|
scale: 100, // Scale scores from 0-100 instead of 0-1
|
|
@@ -221,7 +221,7 @@ const scorer = createContextRelevanceScorerLLM({
|
|
|
221
221
|
|
|
222
222
|
```typescript
|
|
223
223
|
const scorer = createContextRelevanceScorerLLM({
|
|
224
|
-
model: 'openai/gpt-5.
|
|
224
|
+
model: 'openai/gpt-5.5',
|
|
225
225
|
options: {
|
|
226
226
|
contextExtractor: (input, output) => {
|
|
227
227
|
const query = input?.inputMessages?.[0]?.content || ''
|
|
@@ -248,7 +248,7 @@ This example shows excellent context relevance where all context directly suppor
|
|
|
248
248
|
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
249
249
|
|
|
250
250
|
const scorer = createContextRelevanceScorerLLM({
|
|
251
|
-
model: 'openai/gpt-5.
|
|
251
|
+
model: 'openai/gpt-5.5',
|
|
252
252
|
options: {
|
|
253
253
|
context: [
|
|
254
254
|
'Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1921.',
|
|
@@ -295,7 +295,7 @@ This example shows moderate relevance with some context being irrelevant or unus
|
|
|
295
295
|
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
296
296
|
|
|
297
297
|
const scorer = createContextRelevanceScorerLLM({
|
|
298
|
-
model: 'openai/gpt-5.
|
|
298
|
+
model: 'openai/gpt-5.5',
|
|
299
299
|
options: {
|
|
300
300
|
context: [
|
|
301
301
|
'Solar eclipses occur when the Moon blocks the Sun.',
|
|
@@ -337,7 +337,7 @@ console.log(result)
|
|
|
337
337
|
|
|
338
338
|
// With custom penalty configuration
|
|
339
339
|
const customScorer = createContextRelevanceScorerLLM({
|
|
340
|
-
model: 'openai/gpt-5.
|
|
340
|
+
model: 'openai/gpt-5.5',
|
|
341
341
|
options: {
|
|
342
342
|
context: [
|
|
343
343
|
'Solar eclipses occur when the Moon blocks the Sun.',
|
|
@@ -384,7 +384,7 @@ This example shows poor context relevance with mostly irrelevant information:
|
|
|
384
384
|
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
385
385
|
|
|
386
386
|
const scorer = createContextRelevanceScorerLLM({
|
|
387
|
-
model: 'openai/gpt-5.
|
|
387
|
+
model: 'openai/gpt-5.5',
|
|
388
388
|
options: {
|
|
389
389
|
context: [
|
|
390
390
|
'The Great Barrier Reef is located in Australia.',
|
|
@@ -432,7 +432,7 @@ Extract context dynamically based on the run input:
|
|
|
432
432
|
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
433
433
|
|
|
434
434
|
const scorer = createContextRelevanceScorerLLM({
|
|
435
|
-
model: 'openai/gpt-5.
|
|
435
|
+
model: 'openai/gpt-5.5',
|
|
436
436
|
options: {
|
|
437
437
|
contextExtractor: (input, output) => {
|
|
438
438
|
// Extract query from input
|
|
@@ -475,7 +475,7 @@ Integrate with RAG pipelines to evaluate retrieved context:
|
|
|
475
475
|
import { createContextRelevanceScorerLLM } from '@mastra/evals'
|
|
476
476
|
|
|
477
477
|
const scorer = createContextRelevanceScorerLLM({
|
|
478
|
-
model: 'openai/gpt-5.
|
|
478
|
+
model: 'openai/gpt-5.5',
|
|
479
479
|
options: {
|
|
480
480
|
contextExtractor: (input, output) => {
|
|
481
481
|
// Extract from RAG retrieval results
|
|
@@ -104,7 +104,7 @@ Use static context when you have known ground truth to compare against:
|
|
|
104
104
|
import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
|
|
105
105
|
|
|
106
106
|
const scorer = createHallucinationScorer({
|
|
107
|
-
model: 'openai/gpt-5.
|
|
107
|
+
model: 'openai/gpt-5.5',
|
|
108
108
|
options: {
|
|
109
109
|
context: [
|
|
110
110
|
'The first iPhone was announced on January 9, 2007.',
|
|
@@ -124,7 +124,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
|
|
|
124
124
|
import { extractToolResults } from '@mastra/evals/scorers'
|
|
125
125
|
|
|
126
126
|
const scorer = createHallucinationScorer({
|
|
127
|
-
model: 'openai/gpt-5.
|
|
127
|
+
model: 'openai/gpt-5.5',
|
|
128
128
|
options: {
|
|
129
129
|
getContext: ({ run, step }) => {
|
|
130
130
|
// Extract tool results as context
|
|
@@ -145,7 +145,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
|
|
|
145
145
|
import { extractToolResults } from '@mastra/evals/scorers'
|
|
146
146
|
|
|
147
147
|
const hallucinationScorer = createHallucinationScorer({
|
|
148
|
-
model: 'openai/gpt-5.
|
|
148
|
+
model: 'openai/gpt-5.5',
|
|
149
149
|
options: {
|
|
150
150
|
getContext: ({ run }) => {
|
|
151
151
|
const toolResults = extractToolResults(run.output)
|
|
@@ -156,7 +156,7 @@ const hallucinationScorer = createHallucinationScorer({
|
|
|
156
156
|
|
|
157
157
|
const agent = new Agent({
|
|
158
158
|
name: 'my-agent',
|
|
159
|
-
model: 'openai/gpt-5.
|
|
159
|
+
model: 'openai/gpt-5.5',
|
|
160
160
|
instructions: 'You are a helpful assistant.',
|
|
161
161
|
evals: {
|
|
162
162
|
scorers: [hallucinationScorer],
|
|
@@ -172,7 +172,7 @@ import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
|
|
|
172
172
|
import { myAgent } from './agent'
|
|
173
173
|
|
|
174
174
|
const scorer = createHallucinationScorer({
|
|
175
|
-
model: 'openai/gpt-5.
|
|
175
|
+
model: 'openai/gpt-5.5',
|
|
176
176
|
options: {
|
|
177
177
|
context: ['Known fact 1', 'Known fact 2'],
|
|
178
178
|
},
|
|
@@ -61,7 +61,7 @@ describe('Agent Noise Resistance Tests', () => {
|
|
|
61
61
|
|
|
62
62
|
// Step 4: Evaluate using noise sensitivity scorer
|
|
63
63
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
64
|
-
model: 'openai/gpt-5.
|
|
64
|
+
model: 'openai/gpt-5.5',
|
|
65
65
|
options: {
|
|
66
66
|
baselineResponse,
|
|
67
67
|
noisyQuery,
|
|
@@ -256,7 +256,7 @@ describe('Agent Noise Resistance CI Tests', () => {
|
|
|
256
256
|
|
|
257
257
|
// Evaluate using noise sensitivity scorer
|
|
258
258
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
259
|
-
model: 'openai/gpt-5.
|
|
259
|
+
model: 'openai/gpt-5.5',
|
|
260
260
|
options: {
|
|
261
261
|
baselineResponse: testCase.baselineResponse,
|
|
262
262
|
noisyQuery: testCase.noisyQuery,
|
|
@@ -291,7 +291,7 @@ This example shows an agent that completely resists misinformation in a test sce
|
|
|
291
291
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
292
292
|
|
|
293
293
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
294
|
-
model: 'openai/gpt-5.
|
|
294
|
+
model: 'openai/gpt-5.5',
|
|
295
295
|
options: {
|
|
296
296
|
baselineResponse:
|
|
297
297
|
'Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing.',
|
|
@@ -337,7 +337,7 @@ This example shows an agent partially distracted by irrelevant requests:
|
|
|
337
337
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
338
338
|
|
|
339
339
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
340
|
-
model: 'openai/gpt-5.
|
|
340
|
+
model: 'openai/gpt-5.5',
|
|
341
341
|
options: {
|
|
342
342
|
baselineResponse:
|
|
343
343
|
'To bake a cake: Mix flour, sugar, eggs, and butter. Bake at 350°F for 30 minutes.',
|
|
@@ -382,7 +382,7 @@ This example shows an agent that incorporates misinformation:
|
|
|
382
382
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
383
383
|
|
|
384
384
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
385
|
-
model: 'openai/gpt-5.
|
|
385
|
+
model: 'openai/gpt-5.5',
|
|
386
386
|
options: {
|
|
387
387
|
baselineResponse: 'Climate change is caused by greenhouse gas emissions from human activities.',
|
|
388
388
|
noisyQuery:
|
|
@@ -428,7 +428,7 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
|
428
428
|
|
|
429
429
|
// Lenient scoring - more forgiving of minor issues
|
|
430
430
|
const lenientScorer = createNoiseSensitivityScorerLLM({
|
|
431
|
-
model: 'openai/gpt-5.
|
|
431
|
+
model: 'openai/gpt-5.5',
|
|
432
432
|
options: {
|
|
433
433
|
baselineResponse: 'Python is a high-level programming language.',
|
|
434
434
|
noisyQuery: 'What is Python? Also, snakes are dangerous!',
|
|
@@ -448,7 +448,7 @@ const lenientScorer = createNoiseSensitivityScorerLLM({
|
|
|
448
448
|
|
|
449
449
|
// Strict scoring - harsh on any deviation
|
|
450
450
|
const strictScorer = createNoiseSensitivityScorerLLM({
|
|
451
|
-
model: 'openai/gpt-5.
|
|
451
|
+
model: 'openai/gpt-5.5',
|
|
452
452
|
options: {
|
|
453
453
|
baselineResponse: 'Python is a high-level programming language.',
|
|
454
454
|
noisyQuery: 'What is Python? Also, snakes are dangerous!',
|
|
@@ -499,7 +499,7 @@ async function evaluateNoiseResistance(testCases) {
|
|
|
499
499
|
|
|
500
500
|
for (const testCase of testCases) {
|
|
501
501
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
502
|
-
model: 'openai/gpt-5.
|
|
502
|
+
model: 'openai/gpt-5.5',
|
|
503
503
|
options: {
|
|
504
504
|
baselineResponse: testCase.baseline,
|
|
505
505
|
noisyQuery: testCase.noisyQuery,
|
|
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
|
546
546
|
|
|
547
547
|
async function compareModelRobustness() {
|
|
548
548
|
const models = [
|
|
549
|
-
{ name: 'GPT-5.4', model: 'openai/gpt-5.4' },
|
|
549
|
+
{ name: 'GPT-5.4', model: 'openai/gpt-5.5' },
|
|
550
550
|
{ name: 'GPT-5.4-mini', model: 'openai/gpt-5-mini' },
|
|
551
|
-
{ name: 'Claude', model: 'anthropic/claude-opus-4-
|
|
551
|
+
{ name: 'Claude', model: 'anthropic/claude-opus-4-7' },
|
|
552
552
|
]
|
|
553
553
|
|
|
554
554
|
const testScenario = {
|
|
@@ -598,7 +598,7 @@ Include noise sensitivity tests in your security test suite to validate prompt i
|
|
|
598
598
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
599
599
|
|
|
600
600
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
601
|
-
model: 'openai/gpt-5.
|
|
601
|
+
model: 'openai/gpt-5.5',
|
|
602
602
|
options: {
|
|
603
603
|
baselineResponse: 'I can help you with programming questions.',
|
|
604
604
|
noisyQuery:
|
|
@@ -60,7 +60,7 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
|
|
|
60
60
|
|
|
61
61
|
```typescript
|
|
62
62
|
const scorer = createPromptAlignmentScorerLLM({
|
|
63
|
-
model: 'openai/gpt-5.
|
|
63
|
+
model: 'openai/gpt-5.5',
|
|
64
64
|
options: {
|
|
65
65
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
66
66
|
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
@@ -221,24 +221,24 @@ Measure how well your AI agents follow user instructions:
|
|
|
221
221
|
const agent = new Agent({
|
|
222
222
|
name: 'CodingAssistant',
|
|
223
223
|
instructions: 'You are a helpful coding assistant. Always provide working code examples.',
|
|
224
|
-
model: 'openai/gpt-5.
|
|
224
|
+
model: 'openai/gpt-5.5',
|
|
225
225
|
})
|
|
226
226
|
|
|
227
227
|
// Evaluate comprehensive alignment (default)
|
|
228
228
|
const scorer = createPromptAlignmentScorerLLM({
|
|
229
|
-
model: 'openai/gpt-5.
|
|
229
|
+
model: 'openai/gpt-5.5',
|
|
230
230
|
options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
|
|
231
231
|
})
|
|
232
232
|
|
|
233
233
|
// Evaluate just user satisfaction
|
|
234
234
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
235
|
-
model: 'openai/gpt-5.
|
|
235
|
+
model: 'openai/gpt-5.5',
|
|
236
236
|
options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
|
|
237
237
|
})
|
|
238
238
|
|
|
239
239
|
// Evaluate system compliance
|
|
240
240
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
241
|
-
model: 'openai/gpt-5.
|
|
241
|
+
model: 'openai/gpt-5.5',
|
|
242
242
|
options: { evaluationMode: 'system' }, // Check adherence to system instructions
|
|
243
243
|
})
|
|
244
244
|
|
|
@@ -290,7 +290,7 @@ for (const agent of agents) {
|
|
|
290
290
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals'
|
|
291
291
|
|
|
292
292
|
const scorer = createPromptAlignmentScorerLLM({
|
|
293
|
-
model: 'openai/gpt-5.
|
|
293
|
+
model: 'openai/gpt-5.5',
|
|
294
294
|
})
|
|
295
295
|
|
|
296
296
|
// Evaluate a code generation task
|
|
@@ -319,7 +319,7 @@ const result = await scorer.run({
|
|
|
319
319
|
```typescript
|
|
320
320
|
// Configure scale and evaluation mode
|
|
321
321
|
const scorer = createPromptAlignmentScorerLLM({
|
|
322
|
-
model: 'openai/gpt-5.
|
|
322
|
+
model: 'openai/gpt-5.5',
|
|
323
323
|
options: {
|
|
324
324
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
325
325
|
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
@@ -328,13 +328,13 @@ const scorer = createPromptAlignmentScorerLLM({
|
|
|
328
328
|
|
|
329
329
|
// User-only evaluation - focus on user satisfaction
|
|
330
330
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
331
|
-
model: 'openai/gpt-5.
|
|
331
|
+
model: 'openai/gpt-5.5',
|
|
332
332
|
options: { evaluationMode: 'user' },
|
|
333
333
|
})
|
|
334
334
|
|
|
335
335
|
// System-only evaluation - focus on compliance
|
|
336
336
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
337
|
-
model: 'openai/gpt-5.
|
|
337
|
+
model: 'openai/gpt-5.5',
|
|
338
338
|
options: { evaluationMode: 'system' },
|
|
339
339
|
})
|
|
340
340
|
|
|
@@ -369,7 +369,7 @@ In this example, the response fully addresses the user's prompt with all require
|
|
|
369
369
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
370
370
|
|
|
371
371
|
const scorer = createPromptAlignmentScorerLLM({
|
|
372
|
-
model: 'openai/gpt-5.
|
|
372
|
+
model: 'openai/gpt-5.5',
|
|
373
373
|
})
|
|
374
374
|
|
|
375
375
|
const inputMessages = [
|
|
@@ -417,7 +417,7 @@ In this example, the response addresses the core intent but misses some requirem
|
|
|
417
417
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
418
418
|
|
|
419
419
|
const scorer = createPromptAlignmentScorerLLM({
|
|
420
|
-
model: 'openai/gpt-5.
|
|
420
|
+
model: 'openai/gpt-5.5',
|
|
421
421
|
})
|
|
422
422
|
|
|
423
423
|
const inputMessages = [
|
|
@@ -458,7 +458,7 @@ In this example, the response fails to address the user's specific requirements.
|
|
|
458
458
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
459
459
|
|
|
460
460
|
const scorer = createPromptAlignmentScorerLLM({
|
|
461
|
-
model: 'openai/gpt-5.
|
|
461
|
+
model: 'openai/gpt-5.5',
|
|
462
462
|
})
|
|
463
463
|
|
|
464
464
|
const inputMessages = [
|
|
@@ -502,7 +502,7 @@ Evaluates how well the response addresses the user's request, ignoring system in
|
|
|
502
502
|
|
|
503
503
|
```typescript
|
|
504
504
|
const scorer = createPromptAlignmentScorerLLM({
|
|
505
|
-
model: 'openai/gpt-5.
|
|
505
|
+
model: 'openai/gpt-5.5',
|
|
506
506
|
options: { evaluationMode: 'user' },
|
|
507
507
|
})
|
|
508
508
|
|
|
@@ -534,7 +534,7 @@ Evaluates compliance with system behavioral guidelines and constraints:
|
|
|
534
534
|
|
|
535
535
|
```typescript
|
|
536
536
|
const scorer = createPromptAlignmentScorerLLM({
|
|
537
|
-
model: 'openai/gpt-5.
|
|
537
|
+
model: 'openai/gpt-5.5',
|
|
538
538
|
options: { evaluationMode: 'system' },
|
|
539
539
|
})
|
|
540
540
|
|
|
@@ -566,7 +566,7 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
|
|
|
566
566
|
|
|
567
567
|
```typescript
|
|
568
568
|
const scorer = createPromptAlignmentScorerLLM({
|
|
569
|
-
model: 'openai/gpt-5.
|
|
569
|
+
model: 'openai/gpt-5.5',
|
|
570
570
|
options: { evaluationMode: 'both' }, // This is the default
|
|
571
571
|
})
|
|
572
572
|
|
|
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
|
|
|
309
309
|
```typescript
|
|
310
310
|
// Basic configuration
|
|
311
311
|
const basicLLMScorer = createLLMScorer({
|
|
312
|
-
model: 'openai/gpt-5.
|
|
312
|
+
model: 'openai/gpt-5.5',
|
|
313
313
|
availableTools: [
|
|
314
314
|
{ name: 'tool1', description: 'Description 1' },
|
|
315
315
|
{ name: 'tool2', description: 'Description 2' }
|
|
@@ -349,7 +349,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
|
|
|
349
349
|
|
|
350
350
|
```typescript
|
|
351
351
|
const llmScorer = createToolCallAccuracyScorerLLM({
|
|
352
|
-
model: 'openai/gpt-5.
|
|
352
|
+
model: 'openai/gpt-5.5',
|
|
353
353
|
availableTools: [
|
|
354
354
|
{
|
|
355
355
|
name: 'weather-tool',
|
|
@@ -482,7 +482,7 @@ const codeScorer = createCodeScorer({
|
|
|
482
482
|
})
|
|
483
483
|
|
|
484
484
|
const llmScorer = createLLMScorer({
|
|
485
|
-
model: 'openai/gpt-5.
|
|
485
|
+
model: 'openai/gpt-5.5',
|
|
486
486
|
availableTools: [
|
|
487
487
|
{ name: 'weather-tool', description: 'Get weather information' },
|
|
488
488
|
{ name: 'search-tool', description: 'Search the web' },
|
|
@@ -86,7 +86,7 @@ import { runEvals } from '@mastra/core/evals'
|
|
|
86
86
|
import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
|
|
87
87
|
import { myAgent } from './agent'
|
|
88
88
|
|
|
89
|
-
const scorer = createToxicityScorer({ model: 'openai/gpt-5.
|
|
89
|
+
const scorer = createToxicityScorer({ model: 'openai/gpt-5.5' })
|
|
90
90
|
|
|
91
91
|
const result = await runEvals({
|
|
92
92
|
data: [
|