@artemiskit/core 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +116 -0
- package/dist/adapters/types.d.ts +8 -1
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +39 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/index.d.ts +5 -0
- package/dist/cost/index.d.ts.map +1 -0
- package/dist/cost/pricing.d.ts +67 -0
- package/dist/cost/pricing.d.ts.map +1 -0
- package/dist/evaluators/combined.d.ts +10 -0
- package/dist/evaluators/combined.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +4 -0
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/inline.d.ts +22 -0
- package/dist/evaluators/inline.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/evaluators/not-contains.d.ts +10 -0
- package/dist/evaluators/not-contains.d.ts.map +1 -0
- package/dist/evaluators/similarity.d.ts +16 -0
- package/dist/evaluators/similarity.d.ts.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13212 -12018
- package/dist/scenario/discovery.d.ts +72 -0
- package/dist/scenario/discovery.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +1 -0
- package/dist/scenario/index.d.ts.map +1 -1
- package/dist/scenario/schema.d.ts +1253 -9
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +62 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +8 -1
- package/src/artifacts/types.ts +39 -0
- package/src/cost/index.ts +14 -0
- package/src/cost/pricing.ts +450 -0
- package/src/evaluators/combined.test.ts +172 -0
- package/src/evaluators/combined.ts +95 -0
- package/src/evaluators/index.ts +12 -0
- package/src/evaluators/inline.test.ts +409 -0
- package/src/evaluators/inline.ts +393 -0
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/not-contains.test.ts +105 -0
- package/src/evaluators/not-contains.ts +45 -0
- package/src/evaluators/similarity.test.ts +333 -0
- package/src/evaluators/similarity.ts +258 -0
- package/src/index.ts +3 -0
- package/src/scenario/discovery.test.ts +153 -0
- package/src/scenario/discovery.ts +277 -0
- package/src/scenario/index.ts +1 -0
- package/src/scenario/schema.ts +47 -2
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +162 -2
- package/src/storage/types.ts +73 -0
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Pricing Data and Cost Estimation
|
|
3
|
+
*
|
|
4
|
+
* Pricing is per 1,000 tokens (1K tokens) in USD
|
|
5
|
+
* Data is updated periodically - always verify with provider's official pricing
|
|
6
|
+
* Last comprehensive update: January 2026
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Price sheet for a single model.
 * Both rates are expressed in USD per 1,000 tokens.
 */
export interface ModelPricing {
  /** USD charged per 1,000 prompt (input) tokens. */
  promptPer1K: number;
  /** USD charged per 1,000 completion (output) tokens. */
  completionPer1K: number;
  /** When this price was last refreshed (entries in this file use 'YYYY-MM'). */
  lastUpdated: string;
  /** Optional free-form remark about this price. */
  notes?: string;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Result of pricing a prompt/completion token pair against one model.
 * All monetary fields are in USD.
 */
export interface CostEstimate {
  /** Sum of the prompt and completion costs. */
  totalUsd: number;
  /** Portion of the cost attributed to prompt tokens. */
  promptCostUsd: number;
  /** Portion of the cost attributed to completion tokens. */
  completionCostUsd: number;
  /** Model identifier the estimate was requested for. */
  model: string;
  /** The price sheet that was applied to produce the figures above. */
  pricing: ModelPricing;
}
|
|
32
|
+
|
|
33
|
+
/** `lastUpdated` stamp applied to a table entry unless one is given explicitly. */
const PRICING_REVISION = '2026-01';

/**
 * Builds one pricing-table entry.
 * The `notes` property is only attached when a note is supplied, so entries
 * without a note carry exactly three properties.
 */
function priced(
  promptPer1K: number,
  completionPer1K: number,
  notes?: string,
  lastUpdated: string = PRICING_REVISION
): ModelPricing {
  const entry: ModelPricing = { promptPer1K, completionPer1K, lastUpdated };
  if (notes !== undefined) {
    entry.notes = notes;
  }
  return entry;
}

/**
 * Pricing table for known models, keyed by model identifier.
 * Every rate is in USD per 1,000 tokens: `priced(prompt, completion, notes?)`.
 */
export const MODEL_PRICING: Record<string, ModelPricing> = {
  // ============================================
  // OpenAI GPT-5 family (Latest - 2025)
  // ============================================
  'gpt-5': priced(0.00125, 0.01, '400K context window'),
  'gpt-5.1': priced(0.00125, 0.01),
  'gpt-5.2': priced(0.00175, 0.014),
  'gpt-5-mini': priced(0.00025, 0.002),
  'gpt-5-nano': priced(0.00005, 0.0004),

  // ============================================
  // OpenAI GPT-4.1 family (2025)
  // ============================================
  'gpt-4.1': priced(0.002, 0.008, '1M context window'),
  'gpt-4.1-mini': priced(0.0004, 0.0016),
  'gpt-4.1-nano': priced(0.0001, 0.0004),

  // ============================================
  // OpenAI GPT-4o family (2024-2025)
  // ============================================
  'gpt-4o': priced(0.0025, 0.01, '128K context window'),
  'gpt-4o-mini': priced(0.00015, 0.0006, '128K context window'),

  // ============================================
  // OpenAI O-series (Reasoning models)
  // ============================================
  o1: priced(0.015, 0.06, 'Reasoning model - internal thinking tokens billed as output'),
  o3: priced(0.002, 0.008),
  'o3-mini': priced(0.0011, 0.0044),
  'o4-mini': priced(0.0011, 0.0044),

  // ============================================
  // OpenAI Legacy GPT-4 family
  // ============================================
  'gpt-4-turbo': priced(0.01, 0.03),
  'gpt-4': priced(0.03, 0.06),
  'gpt-3.5-turbo': priced(0.0005, 0.0015),

  // ============================================
  // Anthropic Claude 4.5 family (Latest - 2025)
  // ============================================
  'claude-opus-4.5': priced(0.005, 0.025, 'Most capable Claude model'),
  'claude-sonnet-4.5': priced(0.003, 0.015, 'Balanced performance and cost'),
  'claude-haiku-4.5': priced(0.001, 0.005, 'Fastest Claude model'),

  // ============================================
  // Anthropic Claude 4 family (2025)
  // ============================================
  'claude-opus-4': priced(0.015, 0.075),
  'claude-opus-4.1': priced(0.015, 0.075),
  'claude-sonnet-4': priced(0.003, 0.015),

  // ============================================
  // Anthropic Claude 3.7 family
  // ============================================
  'claude-sonnet-3.7': priced(0.003, 0.015),
  'claude-3-7-sonnet': priced(0.003, 0.015),

  // ============================================
  // Anthropic Claude 3.5 family (Legacy)
  // ============================================
  'claude-3-5-sonnet-20241022': priced(0.003, 0.015),
  'claude-3-5-haiku-20241022': priced(0.0008, 0.004),
  'claude-haiku-3.5': priced(0.0008, 0.004),

  // ============================================
  // Anthropic Claude 3 family (Legacy)
  // ============================================
  'claude-3-opus': priced(0.015, 0.075),
  'claude-3-sonnet': priced(0.003, 0.015),
  'claude-3-haiku': priced(0.00025, 0.00125),

  // Aliases for common naming patterns
  'claude-3.5-sonnet': priced(0.003, 0.015),
  'claude-3.5-haiku': priced(0.0008, 0.004),
};
|
|
253
|
+
|
|
254
|
+
/**
 * Fallback price sheet for models that are not recognised.
 * The rates equal the mid-tier (Sonnet-class) entries of the pricing table,
 * which keeps estimates for unknown models on the conservative side.
 */
export const DEFAULT_PRICING: ModelPricing = {
  promptPer1K: 0.003,
  completionPer1K: 0.015,
  lastUpdated: '2026-01',
  notes: 'Default pricing - verify with provider',
};
|
|
264
|
+
|
|
265
|
+
/**
 * Ordered substring rules used when a model id has no exact table entry.
 * Each rule is `[substrings, tableKey]`; the first rule with any substring
 * contained in the lower-cased model id wins. Order matters: more specific
 * names must precede the shorter names they contain (e.g. 'gpt-4o-mini'
 * before 'gpt-4o', 'gpt-4o' before 'gpt-4').
 */
const PARTIAL_MATCH_RULES: ReadonlyArray<readonly [readonly string[], string]> = [
  // GPT-5 family
  [['gpt-5.2'], 'gpt-5.2'],
  [['gpt-5.1'], 'gpt-5.1'],
  [['gpt-5-mini'], 'gpt-5-mini'],
  [['gpt-5-nano'], 'gpt-5-nano'],
  [['gpt-5'], 'gpt-5'],

  // GPT-4.1 family
  [['gpt-4.1-mini'], 'gpt-4.1-mini'],
  [['gpt-4.1-nano'], 'gpt-4.1-nano'],
  [['gpt-4.1'], 'gpt-4.1'],

  // GPT-4o family
  [['gpt-4o-mini'], 'gpt-4o-mini'],
  [['gpt-4o'], 'gpt-4o'],

  // O-series
  [['o4-mini'], 'o4-mini'],
  [['o3-mini'], 'o3-mini'],
  [['o3'], 'o3'],
  [['o1'], 'o1'],

  // Legacy GPT
  [['gpt-4-turbo'], 'gpt-4-turbo'],
  [['gpt-4'], 'gpt-4'],
  [['gpt-3.5'], 'gpt-3.5-turbo'],

  // Claude 4.5 family
  [['opus-4.5', 'opus-4-5'], 'claude-opus-4.5'],
  [['sonnet-4.5', 'sonnet-4-5'], 'claude-sonnet-4.5'],
  [['haiku-4.5', 'haiku-4-5'], 'claude-haiku-4.5'],

  // Claude 4 family
  [['opus-4.1', 'opus-4-1'], 'claude-opus-4.1'],
  [['opus-4'], 'claude-opus-4'],
  [['sonnet-4'], 'claude-sonnet-4'],

  // Claude 3.7 family (both "sonnet-3.7" and "claude-3-7-sonnet[-date]" spellings)
  [['sonnet-3.7', 'sonnet-3-7', 'claude-3-7-sonnet', 'claude-3.7-sonnet'], 'claude-sonnet-3.7'],

  // Claude 3.5 family (both "claude-3-5-haiku" and "haiku-3.5" spellings)
  [['claude-3-5-sonnet', 'claude-3.5-sonnet'], 'claude-3.5-sonnet'],
  [['claude-3-5-haiku', 'claude-3.5-haiku', 'haiku-3.5', 'haiku-3-5'], 'claude-3.5-haiku'],

  // Claude 3 family
  [['claude-3-opus'], 'claude-3-opus'],
  [['claude-3-sonnet'], 'claude-3-sonnet'],
  [['claude-3-haiku'], 'claude-3-haiku'],

  // Generic Claude fallback
  [['claude'], 'claude-sonnet-4.5'],
];

/**
 * Get pricing for a model.
 *
 * Resolution order:
 *  1. exact key in `MODEL_PRICING`;
 *  2. case-insensitive key match;
 *  3. first matching family rule in `PARTIAL_MATCH_RULES` (substring match on
 *     the lower-cased id, so dated or provider-prefixed ids resolve to their family);
 *  4. `DEFAULT_PRICING`.
 *
 * @param model Model identifier
 * @returns Pricing data, or the default pricing if the model is unknown
 */
export function getModelPricing(model: string): ModelPricing {
  // Own-property check: a plain truthiness test on MODEL_PRICING[model] would
  // also hit inherited members such as 'constructor' or 'toString'.
  if (Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)) {
    return MODEL_PRICING[model];
  }

  const lowerModel = model.toLowerCase();

  const caseInsensitiveKey = Object.keys(MODEL_PRICING).find(
    (key) => key.toLowerCase() === lowerModel
  );
  if (caseInsensitiveKey !== undefined) {
    return MODEL_PRICING[caseInsensitiveKey];
  }

  for (const [patterns, key] of PARTIAL_MATCH_RULES) {
    if (patterns.some((pattern) => lowerModel.includes(pattern))) {
      return MODEL_PRICING[key];
    }
  }

  return DEFAULT_PRICING;
}
|
|
399
|
+
|
|
400
|
+
/**
 * Price a prompt/completion token pair for the given model.
 *
 * The price sheet comes from `getModelPricing(model)`; each side is computed
 * as `(tokens / 1000) * ratePer1K`.
 *
 * @param promptTokens Number of prompt/input tokens
 * @param completionTokens Number of completion/output tokens
 * @param model Model identifier
 * @returns Per-side and total cost in USD, plus the pricing that was applied
 */
export function estimateCost(
  promptTokens: number,
  completionTokens: number,
  model: string
): CostEstimate {
  const pricing = getModelPricing(model);

  // Rates are quoted per 1,000 tokens, so scale the token count first.
  const costFor = (tokens: number, ratePer1K: number): number => (tokens / 1000) * ratePer1K;

  const promptCostUsd = costFor(promptTokens, pricing.promptPer1K);
  const completionCostUsd = costFor(completionTokens, pricing.completionPer1K);

  return {
    totalUsd: promptCostUsd + completionCostUsd,
    promptCostUsd,
    completionCostUsd,
    model,
    pricing,
  };
}
|
|
426
|
+
|
|
427
|
+
/**
 * Format a USD amount for display.
 *
 * - below $0.01: shown in cents with four decimals, e.g. `0.5000 cents`
 *   (no dollar sign, because the number is cent-denominated);
 * - below $1: dollars with four decimals, e.g. `$0.2500`;
 * - otherwise: dollars with two decimals, e.g. `$12.50`.
 *
 * @param costUsd Cost in USD
 * @returns Formatted string
 */
export function formatCost(costUsd: number): string {
  if (costUsd < 0.01) {
    // The value is converted to cents here, so it must not carry a '$' prefix.
    return `${(costUsd * 100).toFixed(4)} cents`;
  }
  if (costUsd < 1) {
    return `$${costUsd.toFixed(4)}`;
  }
  return `$${costUsd.toFixed(2)}`;
}
|
|
441
|
+
|
|
442
|
+
/**
 * Enumerate every entry of the pricing table.
 *
 * @returns One `{ model, pricing }` record per table key, in table order
 */
export function listKnownModels(): Array<{ model: string; pricing: ModelPricing }> {
  const known: Array<{ model: string; pricing: ModelPricing }> = [];
  for (const model of Object.keys(MODEL_PRICING)) {
    known.push({ model, pricing: MODEL_PRICING[model] });
  }
  return known;
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for CombinedEvaluator
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test';
|
|
6
|
+
import { CombinedEvaluator } from './combined';
|
|
7
|
+
|
|
8
|
+
describe('CombinedEvaluator', () => {
  const evaluator = new CombinedEvaluator();

  // Types are derived from the evaluator's own signature so the helpers below
  // accept exactly what `evaluate` accepts.
  type Spec = Parameters<CombinedEvaluator['evaluate']>[1];
  type CombinedSpec = Extract<Spec, { type: 'combined' }>;

  /** Runs the evaluator on `response` with a combined expectation. */
  const runCombined = (
    response: string,
    operator: CombinedSpec['operator'],
    expectations: CombinedSpec['expectations']
  ) => evaluator.evaluate(response, { type: 'combined', operator, expectations });

  // Shared fixture: the response must contain both '42' and 'correct'.
  const answerAndCorrect: CombinedSpec['expectations'] = [
    { type: 'contains', values: ['42'], mode: 'all' },
    { type: 'contains', values: ['correct'], mode: 'all' },
  ];

  // Responses satisfying both, exactly one, or none of the fixture expectations.
  const MATCHES_BOTH = 'The answer is 42 and it is correct.';
  const MATCHES_ONE = 'The answer is 42.';
  const MATCHES_NONE = 'Nothing here.';

  describe('AND operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score, reason } = await runCombined(MATCHES_BOTH, 'and', answerAndCorrect);
      expect(passed).toBe(true);
      expect(score).toBe(1);
      expect(reason).toContain('All 2 expectations passed');
    });

    test('fails when one expectation fails', async () => {
      const { passed, score, reason } = await runCombined(MATCHES_ONE, 'and', answerAndCorrect);
      expect(passed).toBe(false);
      expect(score).toBe(0.5);
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score } = await runCombined(MATCHES_NONE, 'and', answerAndCorrect);
      expect(passed).toBe(false);
      expect(score).toBe(0);
    });
  });

  describe('OR operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score } = await runCombined(MATCHES_BOTH, 'or', answerAndCorrect);
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('passes when one expectation passes', async () => {
      const { passed, score, reason } = await runCombined(MATCHES_ONE, 'or', answerAndCorrect);
      expect(passed).toBe(true);
      expect(score).toBe(1); // OR reports the maximum sub-score
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score, reason } = await runCombined(MATCHES_NONE, 'or', answerAndCorrect);
      expect(passed).toBe(false);
      expect(score).toBe(0);
      expect(reason).toContain('No expectations passed');
    });
  });

  describe('mixed expectation types', () => {
    test('combines contains and regex expectations', async () => {
      const { passed } = await runCombined('The result is 123-456-7890.', 'and', [
        { type: 'contains', values: ['result'], mode: 'all' },
        { type: 'regex', pattern: '\\d{3}-\\d{3}-\\d{4}' },
      ]);
      expect(passed).toBe(true);
    });

    test('combines exact and contains expectations', async () => {
      const { passed } = await runCombined('Hello World', 'or', [
        { type: 'exact', value: 'Goodbye World', caseSensitive: true },
        { type: 'contains', values: ['Hello'], mode: 'all' },
      ]);
      expect(passed).toBe(true);
    });

    test('combines not_contains with contains', async () => {
      const { passed } = await runCombined('The answer is correct.', 'and', [
        { type: 'contains', values: ['correct'], mode: 'all' },
        { type: 'not_contains', values: ['error', 'wrong'], mode: 'all' },
      ]);
      expect(passed).toBe(true);
    });
  });

  describe('edge cases', () => {
    test('handles empty expectations array', async () => {
      const { passed, score } = await runCombined('Any text', 'and', []);
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('handles single expectation', async () => {
      const { passed } = await runCombined('Hello World', 'and', [
        { type: 'contains', values: ['Hello'], mode: 'all' },
      ]);
      expect(passed).toBe(true);
    });

    test('provides detailed results', async () => {
      const { details } = await runCombined(MATCHES_ONE, 'and', answerAndCorrect);

      expect(details).toBeDefined();
      expect(details?.operator).toBe('and');
      expect(details?.results).toHaveLength(2);
      expect(details?.passedCount).toBe(1);
      expect(details?.totalCount).toBe(2);
    });
  });
});
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Combined evaluator - evaluates multiple expectations with and/or logic
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { Expected } from '../scenario/schema';
|
|
6
|
+
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
7
|
+
|
|
8
|
+
/**
 * Resolve the evaluator registered for an expectation type.
 *
 * The registry module is loaded with a dynamic `import()` at call time rather
 * than a static import, to avoid a circular dependency with index.ts.
 */
async function getEvaluatorForType(type: string): Promise<Evaluator> {
  const { getEvaluator: lookupEvaluator } = await import('./index.js');
  return lookupEvaluator(type);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Evaluates several sub-expectations against one response and merges the
 * outcomes with AND / OR logic.
 *
 * - `and`: passes only if every sub-expectation passes; score is the mean of
 *   the sub-scores.
 * - any other operator value (i.e. `or`): passes if at least one
 *   sub-expectation passes; score is the highest sub-score.
 *
 * `details` always carries `operator`, `results`, `passedCount` and
 * `totalCount`, including when there are no sub-expectations.
 */
export class CombinedEvaluator implements Evaluator {
  readonly type = 'combined';

  /**
   * @param response Text to evaluate
   * @param expected Expectation of type 'combined'
   * @param context Optional context, forwarded unchanged to every sub-evaluator
   * @returns Merged result with per-expectation outcomes in `details.results`
   * @throws Error if `expected.type` is not 'combined'; errors thrown by a
   *   sub-evaluator propagate to the caller
   */
  async evaluate(
    response: string,
    expected: Expected,
    context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'combined') {
      throw new Error('Invalid expected type for CombinedEvaluator');
    }

    const { operator, expectations } = expected;

    // Nothing to check: treated as a pass for either operator.
    if (!expectations || expectations.length === 0) {
      return {
        passed: true,
        score: 1,
        reason: 'No expectations to evaluate',
        details: { operator, results: [], passedCount: 0, totalCount: 0 },
      };
    }

    const results: Array<{
      type: string;
      passed: boolean;
      score: number;
      reason?: string;
    }> = [];

    // Sub-expectations run one at a time, in declaration order.
    for (const subExpected of expectations) {
      const evaluator = await getEvaluatorForType(subExpected.type);
      const { passed, score, reason } = await evaluator.evaluate(response, subExpected, context);
      results.push({ type: subExpected.type, passed, score, reason });
    }

    const totalCount = results.length;
    const passedCount = results.filter((r) => r.passed).length;

    let passed: boolean;
    let score: number;
    let reason: string;

    if (operator === 'and') {
      // AND: all must pass; score is the average of the sub-scores.
      passed = passedCount === totalCount;
      score = results.reduce((sum, r) => sum + r.score, 0) / totalCount;
      reason = passed
        ? `All ${totalCount} expectations passed`
        : `${passedCount}/${totalCount} expectations passed (all required)`;
    } else {
      // OR: at least one must pass; score is the best sub-score.
      passed = passedCount > 0;
      score = Math.max(...results.map((r) => r.score));
      reason = passed
        ? `${passedCount}/${totalCount} expectations passed (at least one required)`
        : 'No expectations passed (at least one required)';
    }

    return {
      passed,
      score,
      reason,
      details: { operator, results, passedCount, totalCount },
    };
  }
}
|