@artemiskit/core 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/dist/adapters/types.d.ts +5 -0
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/manifest.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +20 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/pricing.d.ts +2 -1
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/index.js +468 -205
- package/dist/scenario/schema.d.ts +8 -0
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +66 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +5 -0
- package/src/artifacts/manifest.ts +24 -2
- package/src/artifacts/types.ts +21 -0
- package/src/cost/pricing.ts +242 -65
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/similarity.test.ts +4 -3
- package/src/scenario/schema.ts +4 -0
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +186 -4
- package/src/storage/types.ts +77 -0
- package/dist/events/emitter.d.ts +0 -111
- package/dist/events/emitter.d.ts.map +0 -1
- package/dist/events/index.d.ts +0 -6
- package/dist/events/index.d.ts.map +0 -1
- package/dist/events/types.d.ts +0 -177
- package/dist/events/types.d.ts.map +0 -1
package/src/cost/pricing.ts
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Pricing is per 1,000 tokens (1K tokens) in USD
|
|
5
5
|
* Data is updated periodically - always verify with provider's official pricing
|
|
6
|
+
* Last comprehensive update: January 2026
|
|
6
7
|
*/
|
|
7
8
|
|
|
8
9
|
export interface ModelPricing {
|
|
@@ -34,132 +35,230 @@ export interface CostEstimate {
|
|
|
34
35
|
* Prices are in USD per 1,000 tokens
|
|
35
36
|
*/
|
|
36
37
|
export const MODEL_PRICING: Record<string, ModelPricing> = {
|
|
37
|
-
//
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
38
|
+
// ============================================
|
|
39
|
+
// OpenAI GPT-5 family (Latest - 2025)
|
|
40
|
+
// ============================================
|
|
41
|
+
'gpt-5': {
|
|
42
|
+
promptPer1K: 0.00125,
|
|
43
|
+
completionPer1K: 0.01,
|
|
44
|
+
lastUpdated: '2026-01',
|
|
45
|
+
notes: '400K context window',
|
|
42
46
|
},
|
|
43
|
-
'gpt-
|
|
44
|
-
promptPer1K: 0.
|
|
45
|
-
completionPer1K: 0.
|
|
46
|
-
lastUpdated: '
|
|
47
|
+
'gpt-5.1': {
|
|
48
|
+
promptPer1K: 0.00125,
|
|
49
|
+
completionPer1K: 0.01,
|
|
50
|
+
lastUpdated: '2026-01',
|
|
47
51
|
},
|
|
48
|
-
'gpt-
|
|
49
|
-
promptPer1K: 0.
|
|
50
|
-
completionPer1K: 0.
|
|
51
|
-
lastUpdated: '
|
|
52
|
+
'gpt-5.2': {
|
|
53
|
+
promptPer1K: 0.00175,
|
|
54
|
+
completionPer1K: 0.014,
|
|
55
|
+
lastUpdated: '2026-01',
|
|
52
56
|
},
|
|
53
|
-
'gpt-
|
|
54
|
-
promptPer1K: 0.
|
|
55
|
-
completionPer1K: 0.
|
|
56
|
-
lastUpdated: '
|
|
57
|
+
'gpt-5-mini': {
|
|
58
|
+
promptPer1K: 0.00025,
|
|
59
|
+
completionPer1K: 0.002,
|
|
60
|
+
lastUpdated: '2026-01',
|
|
61
|
+
},
|
|
62
|
+
'gpt-5-nano': {
|
|
63
|
+
promptPer1K: 0.00005,
|
|
64
|
+
completionPer1K: 0.0004,
|
|
65
|
+
lastUpdated: '2026-01',
|
|
66
|
+
},
|
|
67
|
+
|
|
68
|
+
// ============================================
|
|
69
|
+
// OpenAI GPT-4.1 family (2025)
|
|
70
|
+
// ============================================
|
|
71
|
+
'gpt-4.1': {
|
|
72
|
+
promptPer1K: 0.002,
|
|
73
|
+
completionPer1K: 0.008,
|
|
74
|
+
lastUpdated: '2026-01',
|
|
75
|
+
notes: '1M context window',
|
|
76
|
+
},
|
|
77
|
+
'gpt-4.1-mini': {
|
|
78
|
+
promptPer1K: 0.0004,
|
|
79
|
+
completionPer1K: 0.0016,
|
|
80
|
+
lastUpdated: '2026-01',
|
|
81
|
+
},
|
|
82
|
+
'gpt-4.1-nano': {
|
|
83
|
+
promptPer1K: 0.0001,
|
|
84
|
+
completionPer1K: 0.0004,
|
|
85
|
+
lastUpdated: '2026-01',
|
|
57
86
|
},
|
|
87
|
+
|
|
88
|
+
// ============================================
|
|
89
|
+
// OpenAI GPT-4o family (2024-2025)
|
|
90
|
+
// ============================================
|
|
58
91
|
'gpt-4o': {
|
|
59
|
-
promptPer1K: 0.
|
|
60
|
-
completionPer1K: 0.
|
|
61
|
-
lastUpdated: '
|
|
92
|
+
promptPer1K: 0.0025,
|
|
93
|
+
completionPer1K: 0.01,
|
|
94
|
+
lastUpdated: '2026-01',
|
|
95
|
+
notes: '128K context window',
|
|
62
96
|
},
|
|
63
97
|
'gpt-4o-mini': {
|
|
64
98
|
promptPer1K: 0.00015,
|
|
65
99
|
completionPer1K: 0.0006,
|
|
66
|
-
lastUpdated: '
|
|
100
|
+
lastUpdated: '2026-01',
|
|
101
|
+
notes: '128K context window',
|
|
67
102
|
},
|
|
68
103
|
|
|
69
|
-
//
|
|
104
|
+
// ============================================
|
|
105
|
+
// OpenAI O-series (Reasoning models)
|
|
106
|
+
// ============================================
|
|
107
|
+
o1: {
|
|
108
|
+
promptPer1K: 0.015,
|
|
109
|
+
completionPer1K: 0.06,
|
|
110
|
+
lastUpdated: '2026-01',
|
|
111
|
+
notes: 'Reasoning model - internal thinking tokens billed as output',
|
|
112
|
+
},
|
|
113
|
+
o3: {
|
|
114
|
+
promptPer1K: 0.002,
|
|
115
|
+
completionPer1K: 0.008,
|
|
116
|
+
lastUpdated: '2026-01',
|
|
117
|
+
},
|
|
118
|
+
'o3-mini': {
|
|
119
|
+
promptPer1K: 0.0011,
|
|
120
|
+
completionPer1K: 0.0044,
|
|
121
|
+
lastUpdated: '2026-01',
|
|
122
|
+
},
|
|
123
|
+
'o4-mini': {
|
|
124
|
+
promptPer1K: 0.0011,
|
|
125
|
+
completionPer1K: 0.0044,
|
|
126
|
+
lastUpdated: '2026-01',
|
|
127
|
+
},
|
|
128
|
+
|
|
129
|
+
// ============================================
|
|
130
|
+
// OpenAI Legacy GPT-4 family
|
|
131
|
+
// ============================================
|
|
132
|
+
'gpt-4-turbo': {
|
|
133
|
+
promptPer1K: 0.01,
|
|
134
|
+
completionPer1K: 0.03,
|
|
135
|
+
lastUpdated: '2026-01',
|
|
136
|
+
},
|
|
137
|
+
'gpt-4': {
|
|
138
|
+
promptPer1K: 0.03,
|
|
139
|
+
completionPer1K: 0.06,
|
|
140
|
+
lastUpdated: '2026-01',
|
|
141
|
+
},
|
|
70
142
|
'gpt-3.5-turbo': {
|
|
71
143
|
promptPer1K: 0.0005,
|
|
72
144
|
completionPer1K: 0.0015,
|
|
73
|
-
lastUpdated: '
|
|
145
|
+
lastUpdated: '2026-01',
|
|
74
146
|
},
|
|
75
|
-
|
|
147
|
+
|
|
148
|
+
// ============================================
|
|
149
|
+
// Anthropic Claude 4.5 family (Latest - 2025)
|
|
150
|
+
// ============================================
|
|
151
|
+
'claude-opus-4.5': {
|
|
152
|
+
promptPer1K: 0.005,
|
|
153
|
+
completionPer1K: 0.025,
|
|
154
|
+
lastUpdated: '2026-01',
|
|
155
|
+
notes: 'Most capable Claude model',
|
|
156
|
+
},
|
|
157
|
+
'claude-sonnet-4.5': {
|
|
76
158
|
promptPer1K: 0.003,
|
|
77
|
-
completionPer1K: 0.
|
|
78
|
-
lastUpdated: '
|
|
159
|
+
completionPer1K: 0.015,
|
|
160
|
+
lastUpdated: '2026-01',
|
|
161
|
+
notes: 'Balanced performance and cost',
|
|
162
|
+
},
|
|
163
|
+
'claude-haiku-4.5': {
|
|
164
|
+
promptPer1K: 0.001,
|
|
165
|
+
completionPer1K: 0.005,
|
|
166
|
+
lastUpdated: '2026-01',
|
|
167
|
+
notes: 'Fastest Claude model',
|
|
79
168
|
},
|
|
80
169
|
|
|
81
|
-
//
|
|
82
|
-
|
|
170
|
+
// ============================================
|
|
171
|
+
// Anthropic Claude 4 family (2025)
|
|
172
|
+
// ============================================
|
|
173
|
+
'claude-opus-4': {
|
|
83
174
|
promptPer1K: 0.015,
|
|
84
175
|
completionPer1K: 0.075,
|
|
85
|
-
lastUpdated: '
|
|
176
|
+
lastUpdated: '2026-01',
|
|
86
177
|
},
|
|
87
|
-
'claude-
|
|
178
|
+
'claude-opus-4.1': {
|
|
179
|
+
promptPer1K: 0.015,
|
|
180
|
+
completionPer1K: 0.075,
|
|
181
|
+
lastUpdated: '2026-01',
|
|
182
|
+
},
|
|
183
|
+
'claude-sonnet-4': {
|
|
88
184
|
promptPer1K: 0.003,
|
|
89
185
|
completionPer1K: 0.015,
|
|
90
|
-
lastUpdated: '
|
|
186
|
+
lastUpdated: '2026-01',
|
|
91
187
|
},
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
188
|
+
|
|
189
|
+
// ============================================
|
|
190
|
+
// Anthropic Claude 3.7 family
|
|
191
|
+
// ============================================
|
|
192
|
+
'claude-sonnet-3.7': {
|
|
193
|
+
promptPer1K: 0.003,
|
|
194
|
+
completionPer1K: 0.015,
|
|
195
|
+
lastUpdated: '2026-01',
|
|
96
196
|
},
|
|
97
|
-
'claude-3-
|
|
197
|
+
'claude-3-7-sonnet': {
|
|
98
198
|
promptPer1K: 0.003,
|
|
99
199
|
completionPer1K: 0.015,
|
|
100
|
-
lastUpdated: '
|
|
200
|
+
lastUpdated: '2026-01',
|
|
101
201
|
},
|
|
202
|
+
|
|
203
|
+
// ============================================
|
|
204
|
+
// Anthropic Claude 3.5 family (Legacy)
|
|
205
|
+
// ============================================
|
|
102
206
|
'claude-3-5-sonnet-20241022': {
|
|
103
207
|
promptPer1K: 0.003,
|
|
104
208
|
completionPer1K: 0.015,
|
|
105
|
-
lastUpdated: '
|
|
209
|
+
lastUpdated: '2026-01',
|
|
106
210
|
},
|
|
107
211
|
'claude-3-5-haiku-20241022': {
|
|
108
212
|
promptPer1K: 0.0008,
|
|
109
213
|
completionPer1K: 0.004,
|
|
110
|
-
lastUpdated: '
|
|
214
|
+
lastUpdated: '2026-01',
|
|
111
215
|
},
|
|
112
|
-
|
|
216
|
+
'claude-haiku-3.5': {
|
|
217
|
+
promptPer1K: 0.0008,
|
|
218
|
+
completionPer1K: 0.004,
|
|
219
|
+
lastUpdated: '2026-01',
|
|
220
|
+
},
|
|
221
|
+
|
|
222
|
+
// ============================================
|
|
223
|
+
// Anthropic Claude 3 family (Legacy)
|
|
224
|
+
// ============================================
|
|
113
225
|
'claude-3-opus': {
|
|
114
226
|
promptPer1K: 0.015,
|
|
115
227
|
completionPer1K: 0.075,
|
|
116
|
-
lastUpdated: '
|
|
228
|
+
lastUpdated: '2026-01',
|
|
117
229
|
},
|
|
118
230
|
'claude-3-sonnet': {
|
|
119
231
|
promptPer1K: 0.003,
|
|
120
232
|
completionPer1K: 0.015,
|
|
121
|
-
lastUpdated: '
|
|
233
|
+
lastUpdated: '2026-01',
|
|
122
234
|
},
|
|
123
235
|
'claude-3-haiku': {
|
|
124
236
|
promptPer1K: 0.00025,
|
|
125
237
|
completionPer1K: 0.00125,
|
|
126
|
-
lastUpdated: '
|
|
238
|
+
lastUpdated: '2026-01',
|
|
127
239
|
},
|
|
240
|
+
|
|
241
|
+
// Aliases for common naming patterns
|
|
128
242
|
'claude-3.5-sonnet': {
|
|
129
243
|
promptPer1K: 0.003,
|
|
130
244
|
completionPer1K: 0.015,
|
|
131
|
-
lastUpdated: '
|
|
245
|
+
lastUpdated: '2026-01',
|
|
132
246
|
},
|
|
133
247
|
'claude-3.5-haiku': {
|
|
134
248
|
promptPer1K: 0.0008,
|
|
135
249
|
completionPer1K: 0.004,
|
|
136
|
-
lastUpdated: '
|
|
137
|
-
},
|
|
138
|
-
|
|
139
|
-
// Legacy Claude
|
|
140
|
-
'claude-2': {
|
|
141
|
-
promptPer1K: 0.008,
|
|
142
|
-
completionPer1K: 0.024,
|
|
143
|
-
lastUpdated: '2024-01',
|
|
250
|
+
lastUpdated: '2026-01',
|
|
144
251
|
},
|
|
145
|
-
'claude-instant-1': {
|
|
146
|
-
promptPer1K: 0.0008,
|
|
147
|
-
completionPer1K: 0.0024,
|
|
148
|
-
lastUpdated: '2024-01',
|
|
149
|
-
},
|
|
150
|
-
|
|
151
|
-
// Azure OpenAI (same pricing as OpenAI typically)
|
|
152
|
-
// Add 'azure-' prefix versions if needed
|
|
153
252
|
};
|
|
154
253
|
|
|
155
254
|
/**
|
|
156
255
|
* Default pricing for unknown models
|
|
157
|
-
* Uses conservative estimates
|
|
256
|
+
* Uses conservative estimates based on mid-tier model pricing
|
|
158
257
|
*/
|
|
159
258
|
export const DEFAULT_PRICING: ModelPricing = {
|
|
160
|
-
promptPer1K: 0.
|
|
161
|
-
completionPer1K: 0.
|
|
162
|
-
lastUpdated: '
|
|
259
|
+
promptPer1K: 0.003,
|
|
260
|
+
completionPer1K: 0.015,
|
|
261
|
+
lastUpdated: '2026-01',
|
|
163
262
|
notes: 'Default pricing - verify with provider',
|
|
164
263
|
};
|
|
165
264
|
|
|
@@ -183,12 +282,57 @@ export function getModelPricing(model: string): ModelPricing {
|
|
|
183
282
|
}
|
|
184
283
|
|
|
185
284
|
// Try partial match for common patterns
|
|
285
|
+
// GPT-5 family
|
|
286
|
+
if (lowerModel.includes('gpt-5.2')) {
|
|
287
|
+
return MODEL_PRICING['gpt-5.2'];
|
|
288
|
+
}
|
|
289
|
+
if (lowerModel.includes('gpt-5.1')) {
|
|
290
|
+
return MODEL_PRICING['gpt-5.1'];
|
|
291
|
+
}
|
|
292
|
+
if (lowerModel.includes('gpt-5-mini')) {
|
|
293
|
+
return MODEL_PRICING['gpt-5-mini'];
|
|
294
|
+
}
|
|
295
|
+
if (lowerModel.includes('gpt-5-nano')) {
|
|
296
|
+
return MODEL_PRICING['gpt-5-nano'];
|
|
297
|
+
}
|
|
298
|
+
if (lowerModel.includes('gpt-5')) {
|
|
299
|
+
return MODEL_PRICING['gpt-5'];
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
// GPT-4.1 family
|
|
303
|
+
if (lowerModel.includes('gpt-4.1-mini')) {
|
|
304
|
+
return MODEL_PRICING['gpt-4.1-mini'];
|
|
305
|
+
}
|
|
306
|
+
if (lowerModel.includes('gpt-4.1-nano')) {
|
|
307
|
+
return MODEL_PRICING['gpt-4.1-nano'];
|
|
308
|
+
}
|
|
309
|
+
if (lowerModel.includes('gpt-4.1')) {
|
|
310
|
+
return MODEL_PRICING['gpt-4.1'];
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
// GPT-4o family
|
|
186
314
|
if (lowerModel.includes('gpt-4o-mini')) {
|
|
187
315
|
return MODEL_PRICING['gpt-4o-mini'];
|
|
188
316
|
}
|
|
189
317
|
if (lowerModel.includes('gpt-4o')) {
|
|
190
318
|
return MODEL_PRICING['gpt-4o'];
|
|
191
319
|
}
|
|
320
|
+
|
|
321
|
+
// O-series
|
|
322
|
+
if (lowerModel.includes('o4-mini')) {
|
|
323
|
+
return MODEL_PRICING['o4-mini'];
|
|
324
|
+
}
|
|
325
|
+
if (lowerModel.includes('o3-mini')) {
|
|
326
|
+
return MODEL_PRICING['o3-mini'];
|
|
327
|
+
}
|
|
328
|
+
if (lowerModel.includes('o3')) {
|
|
329
|
+
return MODEL_PRICING.o3;
|
|
330
|
+
}
|
|
331
|
+
if (lowerModel.includes('o1')) {
|
|
332
|
+
return MODEL_PRICING.o1;
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
// Legacy GPT
|
|
192
336
|
if (lowerModel.includes('gpt-4-turbo')) {
|
|
193
337
|
return MODEL_PRICING['gpt-4-turbo'];
|
|
194
338
|
}
|
|
@@ -198,12 +342,43 @@ export function getModelPricing(model: string): ModelPricing {
|
|
|
198
342
|
if (lowerModel.includes('gpt-3.5')) {
|
|
199
343
|
return MODEL_PRICING['gpt-3.5-turbo'];
|
|
200
344
|
}
|
|
345
|
+
|
|
346
|
+
// Claude 4.5 family
|
|
347
|
+
if (lowerModel.includes('opus-4.5') || lowerModel.includes('opus-4-5')) {
|
|
348
|
+
return MODEL_PRICING['claude-opus-4.5'];
|
|
349
|
+
}
|
|
350
|
+
if (lowerModel.includes('sonnet-4.5') || lowerModel.includes('sonnet-4-5')) {
|
|
351
|
+
return MODEL_PRICING['claude-sonnet-4.5'];
|
|
352
|
+
}
|
|
353
|
+
if (lowerModel.includes('haiku-4.5') || lowerModel.includes('haiku-4-5')) {
|
|
354
|
+
return MODEL_PRICING['claude-haiku-4.5'];
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Claude 4 family
|
|
358
|
+
if (lowerModel.includes('opus-4.1') || lowerModel.includes('opus-4-1')) {
|
|
359
|
+
return MODEL_PRICING['claude-opus-4.1'];
|
|
360
|
+
}
|
|
361
|
+
if (lowerModel.includes('opus-4')) {
|
|
362
|
+
return MODEL_PRICING['claude-opus-4'];
|
|
363
|
+
}
|
|
364
|
+
if (lowerModel.includes('sonnet-4')) {
|
|
365
|
+
return MODEL_PRICING['claude-sonnet-4'];
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
// Claude 3.7 family
|
|
369
|
+
if (lowerModel.includes('sonnet-3.7') || lowerModel.includes('sonnet-3-7')) {
|
|
370
|
+
return MODEL_PRICING['claude-sonnet-3.7'];
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
// Claude 3.5 family
|
|
201
374
|
if (lowerModel.includes('claude-3-5-sonnet') || lowerModel.includes('claude-3.5-sonnet')) {
|
|
202
375
|
return MODEL_PRICING['claude-3.5-sonnet'];
|
|
203
376
|
}
|
|
204
377
|
if (lowerModel.includes('claude-3-5-haiku') || lowerModel.includes('claude-3.5-haiku')) {
|
|
205
378
|
return MODEL_PRICING['claude-3.5-haiku'];
|
|
206
379
|
}
|
|
380
|
+
|
|
381
|
+
// Claude 3 family
|
|
207
382
|
if (lowerModel.includes('claude-3-opus')) {
|
|
208
383
|
return MODEL_PRICING['claude-3-opus'];
|
|
209
384
|
}
|
|
@@ -213,8 +388,10 @@ export function getModelPricing(model: string): ModelPricing {
|
|
|
213
388
|
if (lowerModel.includes('claude-3-haiku')) {
|
|
214
389
|
return MODEL_PRICING['claude-3-haiku'];
|
|
215
390
|
}
|
|
391
|
+
|
|
392
|
+
// Generic Claude fallback
|
|
216
393
|
if (lowerModel.includes('claude')) {
|
|
217
|
-
return MODEL_PRICING['claude-
|
|
394
|
+
return MODEL_PRICING['claude-sonnet-4.5'];
|
|
218
395
|
}
|
|
219
396
|
|
|
220
397
|
return DEFAULT_PRICING;
|
|
package/src/evaluators/llm-grader.ts
CHANGED
|
@@ -5,22 +5,27 @@
|
|
|
5
5
|
import type { Expected } from '../scenario/schema';
|
|
6
6
|
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
7
7
|
|
|
8
|
-
const GRADER_PROMPT = `You are
|
|
8
|
+
const GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
RUBRIC:
|
|
11
11
|
{{rubric}}
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
RESPONSE TO EVALUATE:
|
|
14
14
|
{{response}}
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
Score the response from 0.0 to 1.0 based on the rubric.
|
|
18
|
-
Be objective and consistent in your scoring.
|
|
16
|
+
TASK: Score the response from 0.0 to 1.0 based on the rubric above.
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
{"score":
|
|
18
|
+
OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
|
|
19
|
+
{"score":0.0,"reason":"explanation"}
|
|
22
20
|
|
|
23
|
-
|
|
21
|
+
RULES:
|
|
22
|
+
- Output ONLY valid JSON, no markdown, no code blocks, no extra text
|
|
23
|
+
- "score" must be a number between 0.0 and 1.0
|
|
24
|
+
- "reason" must be a brief string explaining the score
|
|
25
|
+
- Do NOT wrap in \`\`\`json or any formatting
|
|
26
|
+
- Your entire response must be parseable by JSON.parse()
|
|
27
|
+
|
|
28
|
+
JSON OUTPUT:`;
|
|
24
29
|
|
|
25
30
|
export class LLMGraderEvaluator implements Evaluator {
|
|
26
31
|
readonly type = 'llm_grader';
|
|
@@ -44,11 +49,13 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
44
49
|
);
|
|
45
50
|
|
|
46
51
|
try {
|
|
52
|
+
// Note: Some models (like o1, o3, gpt-5-mini, reasoning models) only support temperature=1
|
|
53
|
+
// We omit temperature to let the API use its default for maximum compatibility
|
|
54
|
+
// Use higher maxTokens for reasoning models which use tokens for internal "thinking"
|
|
47
55
|
const result = await context.client.generate({
|
|
48
56
|
prompt,
|
|
49
57
|
model: expected.model,
|
|
50
|
-
|
|
51
|
-
maxTokens: 200,
|
|
58
|
+
maxTokens: 1000,
|
|
52
59
|
});
|
|
53
60
|
|
|
54
61
|
const parsed = this.parseGraderResponse(result.text);
|
|
@@ -76,9 +83,25 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
76
83
|
}
|
|
77
84
|
|
|
78
85
|
private parseGraderResponse(text: string): { score: number; reason?: string } {
|
|
79
|
-
|
|
86
|
+
// Clean up the response - remove markdown code blocks if present
|
|
87
|
+
const cleanedText = text
|
|
88
|
+
.replace(/```json\s*/gi, '')
|
|
89
|
+
.replace(/```\s*/g, '')
|
|
90
|
+
.trim();
|
|
91
|
+
|
|
92
|
+
// Try to find JSON object in the response
|
|
93
|
+
const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
|
|
94
|
+
|
|
80
95
|
if (!jsonMatch) {
|
|
81
|
-
|
|
96
|
+
// Fallback: try to extract score from plain text patterns like "Score: 0.8" or "0.85"
|
|
97
|
+
const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
|
|
98
|
+
if (scoreMatch) {
|
|
99
|
+
const score = Number(scoreMatch[1]);
|
|
100
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
101
|
+
return { score, reason: cleanedText };
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
|
|
82
105
|
}
|
|
83
106
|
|
|
84
107
|
try {
|
|
@@ -94,6 +117,15 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
94
117
|
reason: parsed.reason,
|
|
95
118
|
};
|
|
96
119
|
} catch (error) {
|
|
120
|
+
// If JSON parsing fails, try extracting score directly
|
|
121
|
+
const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
|
|
122
|
+
if (scoreMatch) {
|
|
123
|
+
const score = Number(scoreMatch[1]);
|
|
124
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
125
|
+
const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
|
|
126
|
+
return { score, reason: reasonMatch?.[1] };
|
|
127
|
+
}
|
|
128
|
+
}
|
|
97
129
|
throw new Error(`Failed to parse grader response: ${(error as Error).message}`);
|
|
98
130
|
}
|
|
99
131
|
}
|
|
package/src/evaluators/similarity.test.ts
CHANGED
|
@@ -15,7 +15,8 @@ describe('SimilarityEvaluator', () => {
|
|
|
15
15
|
|
|
16
16
|
test('throws on invalid expected type', async () => {
|
|
17
17
|
await expect(
|
|
18
|
-
|
|
18
|
+
// @ts-expect-error Testing invalid type handling
|
|
19
|
+
evaluator.evaluate('response', { type: 'exact', value: 'test' })
|
|
19
20
|
).rejects.toThrow('Invalid expected type');
|
|
20
21
|
});
|
|
21
22
|
|
|
@@ -288,8 +289,8 @@ describe('SimilarityEvaluator', () => {
|
|
|
288
289
|
{
|
|
289
290
|
type: 'similarity',
|
|
290
291
|
value: 'Text B',
|
|
291
|
-
|
|
292
|
-
}
|
|
292
|
+
threshold: undefined, // Testing default threshold (0.75)
|
|
293
|
+
},
|
|
293
294
|
mockContext
|
|
294
295
|
);
|
|
295
296
|
|
package/src/scenario/schema.ts
CHANGED
|
@@ -44,6 +44,10 @@ export const ProviderConfigSchema = z
|
|
|
44
44
|
apiVersion: z.string().optional(),
|
|
45
45
|
embeddingDeploymentName: z.string().optional(),
|
|
46
46
|
|
|
47
|
+
// Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
|
|
48
|
+
// Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
|
|
49
|
+
modelFamily: z.string().optional(),
|
|
50
|
+
|
|
47
51
|
// Vercel AI specific
|
|
48
52
|
underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
|
|
49
53
|
})
|