@artemiskit/core 0.2.0 → 0.2.3

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
3
3
  *
4
4
  * Pricing is per 1,000 tokens (1K tokens) in USD
5
5
  * Data is updated periodically - always verify with provider's official pricing
6
+ * Last comprehensive update: January 2026
6
7
  */
7
8
 
8
9
  export interface ModelPricing {
@@ -34,132 +35,230 @@ export interface CostEstimate {
34
35
  * Prices are in USD per 1,000 tokens
35
36
  */
36
37
  export const MODEL_PRICING: Record<string, ModelPricing> = {
37
- // OpenAI GPT-4 family
38
- 'gpt-4': {
39
- promptPer1K: 0.03,
40
- completionPer1K: 0.06,
41
- lastUpdated: '2024-01',
38
+ // ============================================
39
+ // OpenAI GPT-5 family (Latest - 2025)
40
+ // ============================================
41
+ 'gpt-5': {
42
+ promptPer1K: 0.00125,
43
+ completionPer1K: 0.01,
44
+ lastUpdated: '2026-01',
45
+ notes: '400K context window',
42
46
  },
43
- 'gpt-4-32k': {
44
- promptPer1K: 0.06,
45
- completionPer1K: 0.12,
46
- lastUpdated: '2024-01',
47
+ 'gpt-5.1': {
48
+ promptPer1K: 0.00125,
49
+ completionPer1K: 0.01,
50
+ lastUpdated: '2026-01',
47
51
  },
48
- 'gpt-4-turbo': {
49
- promptPer1K: 0.01,
50
- completionPer1K: 0.03,
51
- lastUpdated: '2024-01',
52
+ 'gpt-5.2': {
53
+ promptPer1K: 0.00175,
54
+ completionPer1K: 0.014,
55
+ lastUpdated: '2026-01',
52
56
  },
53
- 'gpt-4-turbo-preview': {
54
- promptPer1K: 0.01,
55
- completionPer1K: 0.03,
56
- lastUpdated: '2024-01',
57
+ 'gpt-5-mini': {
58
+ promptPer1K: 0.00025,
59
+ completionPer1K: 0.002,
60
+ lastUpdated: '2026-01',
61
+ },
62
+ 'gpt-5-nano': {
63
+ promptPer1K: 0.00005,
64
+ completionPer1K: 0.0004,
65
+ lastUpdated: '2026-01',
66
+ },
67
+
68
+ // ============================================
69
+ // OpenAI GPT-4.1 family (2025)
70
+ // ============================================
71
+ 'gpt-4.1': {
72
+ promptPer1K: 0.002,
73
+ completionPer1K: 0.008,
74
+ lastUpdated: '2026-01',
75
+ notes: '1M context window',
76
+ },
77
+ 'gpt-4.1-mini': {
78
+ promptPer1K: 0.0004,
79
+ completionPer1K: 0.0016,
80
+ lastUpdated: '2026-01',
81
+ },
82
+ 'gpt-4.1-nano': {
83
+ promptPer1K: 0.0001,
84
+ completionPer1K: 0.0004,
85
+ lastUpdated: '2026-01',
57
86
  },
87
+
88
+ // ============================================
89
+ // OpenAI GPT-4o family (2024-2025)
90
+ // ============================================
58
91
  'gpt-4o': {
59
- promptPer1K: 0.005,
60
- completionPer1K: 0.015,
61
- lastUpdated: '2024-05',
92
+ promptPer1K: 0.0025,
93
+ completionPer1K: 0.01,
94
+ lastUpdated: '2026-01',
95
+ notes: '128K context window',
62
96
  },
63
97
  'gpt-4o-mini': {
64
98
  promptPer1K: 0.00015,
65
99
  completionPer1K: 0.0006,
66
- lastUpdated: '2024-07',
100
+ lastUpdated: '2026-01',
101
+ notes: '128K context window',
67
102
  },
68
103
 
69
- // OpenAI GPT-3.5 family
104
+ // ============================================
105
+ // OpenAI O-series (Reasoning models)
106
+ // ============================================
107
+ o1: {
108
+ promptPer1K: 0.015,
109
+ completionPer1K: 0.06,
110
+ lastUpdated: '2026-01',
111
+ notes: 'Reasoning model - internal thinking tokens billed as output',
112
+ },
113
+ o3: {
114
+ promptPer1K: 0.002,
115
+ completionPer1K: 0.008,
116
+ lastUpdated: '2026-01',
117
+ },
118
+ 'o3-mini': {
119
+ promptPer1K: 0.0011,
120
+ completionPer1K: 0.0044,
121
+ lastUpdated: '2026-01',
122
+ },
123
+ 'o4-mini': {
124
+ promptPer1K: 0.0011,
125
+ completionPer1K: 0.0044,
126
+ lastUpdated: '2026-01',
127
+ },
128
+
129
+ // ============================================
130
+ // OpenAI Legacy GPT-4 family
131
+ // ============================================
132
+ 'gpt-4-turbo': {
133
+ promptPer1K: 0.01,
134
+ completionPer1K: 0.03,
135
+ lastUpdated: '2026-01',
136
+ },
137
+ 'gpt-4': {
138
+ promptPer1K: 0.03,
139
+ completionPer1K: 0.06,
140
+ lastUpdated: '2026-01',
141
+ },
70
142
  'gpt-3.5-turbo': {
71
143
  promptPer1K: 0.0005,
72
144
  completionPer1K: 0.0015,
73
- lastUpdated: '2024-01',
145
+ lastUpdated: '2026-01',
74
146
  },
75
- 'gpt-3.5-turbo-16k': {
147
+
148
+ // ============================================
149
+ // Anthropic Claude 4.5 family (Latest - 2025)
150
+ // ============================================
151
+ 'claude-opus-4.5': {
152
+ promptPer1K: 0.005,
153
+ completionPer1K: 0.025,
154
+ lastUpdated: '2026-01',
155
+ notes: 'Most capable Claude model',
156
+ },
157
+ 'claude-sonnet-4.5': {
76
158
  promptPer1K: 0.003,
77
- completionPer1K: 0.004,
78
- lastUpdated: '2024-01',
159
+ completionPer1K: 0.015,
160
+ lastUpdated: '2026-01',
161
+ notes: 'Balanced performance and cost',
162
+ },
163
+ 'claude-haiku-4.5': {
164
+ promptPer1K: 0.001,
165
+ completionPer1K: 0.005,
166
+ lastUpdated: '2026-01',
167
+ notes: 'Fastest Claude model',
79
168
  },
80
169
 
81
- // Anthropic Claude family
82
- 'claude-3-opus-20240229': {
170
+ // ============================================
171
+ // Anthropic Claude 4 family (2025)
172
+ // ============================================
173
+ 'claude-opus-4': {
83
174
  promptPer1K: 0.015,
84
175
  completionPer1K: 0.075,
85
- lastUpdated: '2024-03',
176
+ lastUpdated: '2026-01',
86
177
  },
87
- 'claude-3-sonnet-20240229': {
178
+ 'claude-opus-4.1': {
179
+ promptPer1K: 0.015,
180
+ completionPer1K: 0.075,
181
+ lastUpdated: '2026-01',
182
+ },
183
+ 'claude-sonnet-4': {
88
184
  promptPer1K: 0.003,
89
185
  completionPer1K: 0.015,
90
- lastUpdated: '2024-03',
186
+ lastUpdated: '2026-01',
91
187
  },
92
- 'claude-3-haiku-20240307': {
93
- promptPer1K: 0.00025,
94
- completionPer1K: 0.00125,
95
- lastUpdated: '2024-03',
188
+
189
+ // ============================================
190
+ // Anthropic Claude 3.7 family
191
+ // ============================================
192
+ 'claude-sonnet-3.7': {
193
+ promptPer1K: 0.003,
194
+ completionPer1K: 0.015,
195
+ lastUpdated: '2026-01',
96
196
  },
97
- 'claude-3-5-sonnet-20240620': {
197
+ 'claude-3-7-sonnet': {
98
198
  promptPer1K: 0.003,
99
199
  completionPer1K: 0.015,
100
- lastUpdated: '2024-06',
200
+ lastUpdated: '2026-01',
101
201
  },
202
+
203
+ // ============================================
204
+ // Anthropic Claude 3.5 family (Legacy)
205
+ // ============================================
102
206
  'claude-3-5-sonnet-20241022': {
103
207
  promptPer1K: 0.003,
104
208
  completionPer1K: 0.015,
105
- lastUpdated: '2024-10',
209
+ lastUpdated: '2026-01',
106
210
  },
107
211
  'claude-3-5-haiku-20241022': {
108
212
  promptPer1K: 0.0008,
109
213
  completionPer1K: 0.004,
110
- lastUpdated: '2024-10',
214
+ lastUpdated: '2026-01',
111
215
  },
112
- // Aliases
216
+ 'claude-haiku-3.5': {
217
+ promptPer1K: 0.0008,
218
+ completionPer1K: 0.004,
219
+ lastUpdated: '2026-01',
220
+ },
221
+
222
+ // ============================================
223
+ // Anthropic Claude 3 family (Legacy)
224
+ // ============================================
113
225
  'claude-3-opus': {
114
226
  promptPer1K: 0.015,
115
227
  completionPer1K: 0.075,
116
- lastUpdated: '2024-03',
228
+ lastUpdated: '2026-01',
117
229
  },
118
230
  'claude-3-sonnet': {
119
231
  promptPer1K: 0.003,
120
232
  completionPer1K: 0.015,
121
- lastUpdated: '2024-03',
233
+ lastUpdated: '2026-01',
122
234
  },
123
235
  'claude-3-haiku': {
124
236
  promptPer1K: 0.00025,
125
237
  completionPer1K: 0.00125,
126
- lastUpdated: '2024-03',
238
+ lastUpdated: '2026-01',
127
239
  },
240
+
241
+ // Aliases for common naming patterns
128
242
  'claude-3.5-sonnet': {
129
243
  promptPer1K: 0.003,
130
244
  completionPer1K: 0.015,
131
- lastUpdated: '2024-10',
245
+ lastUpdated: '2026-01',
132
246
  },
133
247
  'claude-3.5-haiku': {
134
248
  promptPer1K: 0.0008,
135
249
  completionPer1K: 0.004,
136
- lastUpdated: '2024-10',
137
- },
138
-
139
- // Legacy Claude
140
- 'claude-2': {
141
- promptPer1K: 0.008,
142
- completionPer1K: 0.024,
143
- lastUpdated: '2024-01',
250
+ lastUpdated: '2026-01',
144
251
  },
145
- 'claude-instant-1': {
146
- promptPer1K: 0.0008,
147
- completionPer1K: 0.0024,
148
- lastUpdated: '2024-01',
149
- },
150
-
151
- // Azure OpenAI (same pricing as OpenAI typically)
152
- // Add 'azure-' prefix versions if needed
153
252
  };
154
253
 
155
254
  /**
156
255
  * Default pricing for unknown models
157
- * Uses conservative estimates
256
+ * Uses conservative estimates based on mid-tier model pricing
158
257
  */
159
258
  export const DEFAULT_PRICING: ModelPricing = {
160
- promptPer1K: 0.01,
161
- completionPer1K: 0.03,
162
- lastUpdated: '2024-01',
259
+ promptPer1K: 0.003,
260
+ completionPer1K: 0.015,
261
+ lastUpdated: '2026-01',
163
262
  notes: 'Default pricing - verify with provider',
164
263
  };
165
264
 
@@ -183,12 +282,57 @@ export function getModelPricing(model: string): ModelPricing {
183
282
  }
184
283
 
185
284
  // Try partial match for common patterns
285
+ // GPT-5 family
286
+ if (lowerModel.includes('gpt-5.2')) {
287
+ return MODEL_PRICING['gpt-5.2'];
288
+ }
289
+ if (lowerModel.includes('gpt-5.1')) {
290
+ return MODEL_PRICING['gpt-5.1'];
291
+ }
292
+ if (lowerModel.includes('gpt-5-mini')) {
293
+ return MODEL_PRICING['gpt-5-mini'];
294
+ }
295
+ if (lowerModel.includes('gpt-5-nano')) {
296
+ return MODEL_PRICING['gpt-5-nano'];
297
+ }
298
+ if (lowerModel.includes('gpt-5')) {
299
+ return MODEL_PRICING['gpt-5'];
300
+ }
301
+
302
+ // GPT-4.1 family
303
+ if (lowerModel.includes('gpt-4.1-mini')) {
304
+ return MODEL_PRICING['gpt-4.1-mini'];
305
+ }
306
+ if (lowerModel.includes('gpt-4.1-nano')) {
307
+ return MODEL_PRICING['gpt-4.1-nano'];
308
+ }
309
+ if (lowerModel.includes('gpt-4.1')) {
310
+ return MODEL_PRICING['gpt-4.1'];
311
+ }
312
+
313
+ // GPT-4o family
186
314
  if (lowerModel.includes('gpt-4o-mini')) {
187
315
  return MODEL_PRICING['gpt-4o-mini'];
188
316
  }
189
317
  if (lowerModel.includes('gpt-4o')) {
190
318
  return MODEL_PRICING['gpt-4o'];
191
319
  }
320
+
321
+ // O-series
322
+ if (lowerModel.includes('o4-mini')) {
323
+ return MODEL_PRICING['o4-mini'];
324
+ }
325
+ if (lowerModel.includes('o3-mini')) {
326
+ return MODEL_PRICING['o3-mini'];
327
+ }
328
+ if (lowerModel.includes('o3')) {
329
+ return MODEL_PRICING.o3;
330
+ }
331
+ if (lowerModel.includes('o1')) {
332
+ return MODEL_PRICING.o1;
333
+ }
334
+
335
+ // Legacy GPT
192
336
  if (lowerModel.includes('gpt-4-turbo')) {
193
337
  return MODEL_PRICING['gpt-4-turbo'];
194
338
  }
@@ -198,12 +342,43 @@ export function getModelPricing(model: string): ModelPricing {
198
342
  if (lowerModel.includes('gpt-3.5')) {
199
343
  return MODEL_PRICING['gpt-3.5-turbo'];
200
344
  }
345
+
346
+ // Claude 4.5 family
347
+ if (lowerModel.includes('opus-4.5') || lowerModel.includes('opus-4-5')) {
348
+ return MODEL_PRICING['claude-opus-4.5'];
349
+ }
350
+ if (lowerModel.includes('sonnet-4.5') || lowerModel.includes('sonnet-4-5')) {
351
+ return MODEL_PRICING['claude-sonnet-4.5'];
352
+ }
353
+ if (lowerModel.includes('haiku-4.5') || lowerModel.includes('haiku-4-5')) {
354
+ return MODEL_PRICING['claude-haiku-4.5'];
355
+ }
356
+
357
+ // Claude 4 family
358
+ if (lowerModel.includes('opus-4.1') || lowerModel.includes('opus-4-1')) {
359
+ return MODEL_PRICING['claude-opus-4.1'];
360
+ }
361
+ if (lowerModel.includes('opus-4')) {
362
+ return MODEL_PRICING['claude-opus-4'];
363
+ }
364
+ if (lowerModel.includes('sonnet-4')) {
365
+ return MODEL_PRICING['claude-sonnet-4'];
366
+ }
367
+
368
+ // Claude 3.7 family
369
+ if (lowerModel.includes('sonnet-3.7') || lowerModel.includes('sonnet-3-7')) {
370
+ return MODEL_PRICING['claude-sonnet-3.7'];
371
+ }
372
+
373
+ // Claude 3.5 family
201
374
  if (lowerModel.includes('claude-3-5-sonnet') || lowerModel.includes('claude-3.5-sonnet')) {
202
375
  return MODEL_PRICING['claude-3.5-sonnet'];
203
376
  }
204
377
  if (lowerModel.includes('claude-3-5-haiku') || lowerModel.includes('claude-3.5-haiku')) {
205
378
  return MODEL_PRICING['claude-3.5-haiku'];
206
379
  }
380
+
381
+ // Claude 3 family
207
382
  if (lowerModel.includes('claude-3-opus')) {
208
383
  return MODEL_PRICING['claude-3-opus'];
209
384
  }
@@ -213,8 +388,10 @@ export function getModelPricing(model: string): ModelPricing {
213
388
  if (lowerModel.includes('claude-3-haiku')) {
214
389
  return MODEL_PRICING['claude-3-haiku'];
215
390
  }
391
+
392
+ // Generic Claude fallback
216
393
  if (lowerModel.includes('claude')) {
217
- return MODEL_PRICING['claude-2'];
394
+ return MODEL_PRICING['claude-sonnet-4.5'];
218
395
  }
219
396
 
220
397
  return DEFAULT_PRICING;
@@ -5,22 +5,27 @@
5
5
  import type { Expected } from '../scenario/schema';
6
6
  import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
7
7
 
8
- const GRADER_PROMPT = `You are an evaluator grading an AI response based on a rubric.
8
+ const GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
9
9
 
10
- ## RUBRIC
10
+ RUBRIC:
11
11
  {{rubric}}
12
12
 
13
- ## RESPONSE TO EVALUATE
13
+ RESPONSE TO EVALUATE:
14
14
  {{response}}
15
15
 
16
- ## INSTRUCTIONS
17
- Score the response from 0.0 to 1.0 based on the rubric.
18
- Be objective and consistent in your scoring.
16
+ TASK: Score the response from 0.0 to 1.0 based on the rubric above.
19
17
 
20
- Respond with ONLY a JSON object in this exact format:
21
- {"score": <number between 0 and 1>, "reason": "<brief explanation of score>"}
18
+ OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
19
+ {"score":0.0,"reason":"explanation"}
22
20
 
23
- Do not include any other text, markdown, or formatting.`;
21
+ RULES:
22
+ - Output ONLY valid JSON, no markdown, no code blocks, no extra text
23
+ - "score" must be a number between 0.0 and 1.0
24
+ - "reason" must be a brief string explaining the score
25
+ - Do NOT wrap in \`\`\`json or any formatting
26
+ - Your entire response must be parseable by JSON.parse()
27
+
28
+ JSON OUTPUT:`;
24
29
 
25
30
  export class LLMGraderEvaluator implements Evaluator {
26
31
  readonly type = 'llm_grader';
@@ -44,11 +49,13 @@ export class LLMGraderEvaluator implements Evaluator {
44
49
  );
45
50
 
46
51
  try {
52
+ // Note: Some models (like o1, o3, gpt-5-mini, reasoning models) only support temperature=1
53
+ // We omit temperature to let the API use its default for maximum compatibility
54
+ // Use higher maxTokens for reasoning models which use tokens for internal "thinking"
47
55
  const result = await context.client.generate({
48
56
  prompt,
49
57
  model: expected.model,
50
- temperature: 0,
51
- maxTokens: 200,
58
+ maxTokens: 1000,
52
59
  });
53
60
 
54
61
  const parsed = this.parseGraderResponse(result.text);
@@ -76,9 +83,25 @@ export class LLMGraderEvaluator implements Evaluator {
76
83
  }
77
84
 
78
85
  private parseGraderResponse(text: string): { score: number; reason?: string } {
79
- const jsonMatch = text.match(/\{[\s\S]*?\}/);
86
+ // Clean up the response - remove markdown code blocks if present
87
+ const cleanedText = text
88
+ .replace(/```json\s*/gi, '')
89
+ .replace(/```\s*/g, '')
90
+ .trim();
91
+
92
+ // Try to find JSON object in the response
93
+ const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
94
+
80
95
  if (!jsonMatch) {
81
- throw new Error('No JSON found in grader response');
96
+ // Fallback: try to extract score from plain text patterns like "Score: 0.8" or "0.85"
97
+ const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
98
+ if (scoreMatch) {
99
+ const score = Number(scoreMatch[1]);
100
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
101
+ return { score, reason: cleanedText };
102
+ }
103
+ }
104
+ throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
82
105
  }
83
106
 
84
107
  try {
@@ -94,6 +117,15 @@ export class LLMGraderEvaluator implements Evaluator {
94
117
  reason: parsed.reason,
95
118
  };
96
119
  } catch (error) {
120
+ // If JSON parsing fails, try extracting score directly
121
+ const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
122
+ if (scoreMatch) {
123
+ const score = Number(scoreMatch[1]);
124
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
125
+ const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
126
+ return { score, reason: reasonMatch?.[1] };
127
+ }
128
+ }
97
129
  throw new Error(`Failed to parse grader response: ${(error as Error).message}`);
98
130
  }
99
131
  }
@@ -15,7 +15,8 @@ describe('SimilarityEvaluator', () => {
15
15
 
16
16
  test('throws on invalid expected type', async () => {
17
17
  await expect(
18
- evaluator.evaluate('response', { type: 'exact', value: 'test' } as any)
18
+ // @ts-expect-error Testing invalid type handling
19
+ evaluator.evaluate('response', { type: 'exact', value: 'test' })
19
20
  ).rejects.toThrow('Invalid expected type');
20
21
  });
21
22
 
@@ -288,8 +289,8 @@ describe('SimilarityEvaluator', () => {
288
289
  {
289
290
  type: 'similarity',
290
291
  value: 'Text B',
291
- // No threshold specified, should use default 0.75
292
- } as any,
292
+ threshold: undefined, // Testing default threshold (0.75)
293
+ },
293
294
  mockContext
294
295
  );
295
296
 
@@ -44,6 +44,10 @@ export const ProviderConfigSchema = z
44
44
  apiVersion: z.string().optional(),
45
45
  embeddingDeploymentName: z.string().optional(),
46
46
 
47
+ // Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
48
+ // Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
49
+ modelFamily: z.string().optional(),
50
+
47
51
  // Vercel AI specific
48
52
  underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
49
53
  })