@artemiskit/core 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +74 -0
  2. package/dist/adapters/types.d.ts +3 -1
  3. package/dist/adapters/types.d.ts.map +1 -1
  4. package/dist/artifacts/types.d.ts +39 -0
  5. package/dist/artifacts/types.d.ts.map +1 -1
  6. package/dist/cost/index.d.ts +5 -0
  7. package/dist/cost/index.d.ts.map +1 -0
  8. package/dist/cost/pricing.d.ts +66 -0
  9. package/dist/cost/pricing.d.ts.map +1 -0
  10. package/dist/evaluators/combined.d.ts +10 -0
  11. package/dist/evaluators/combined.d.ts.map +1 -0
  12. package/dist/evaluators/index.d.ts +4 -0
  13. package/dist/evaluators/index.d.ts.map +1 -1
  14. package/dist/evaluators/inline.d.ts +22 -0
  15. package/dist/evaluators/inline.d.ts.map +1 -0
  16. package/dist/evaluators/not-contains.d.ts +10 -0
  17. package/dist/evaluators/not-contains.d.ts.map +1 -0
  18. package/dist/evaluators/similarity.d.ts +16 -0
  19. package/dist/evaluators/similarity.d.ts.map +1 -0
  20. package/dist/events/emitter.d.ts +111 -0
  21. package/dist/events/emitter.d.ts.map +1 -0
  22. package/dist/events/index.d.ts +6 -0
  23. package/dist/events/index.d.ts.map +1 -0
  24. package/dist/events/types.d.ts +177 -0
  25. package/dist/events/types.d.ts.map +1 -0
  26. package/dist/index.d.ts +1 -0
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +13056 -12093
  29. package/dist/scenario/discovery.d.ts +72 -0
  30. package/dist/scenario/discovery.d.ts.map +1 -0
  31. package/dist/scenario/index.d.ts +1 -0
  32. package/dist/scenario/index.d.ts.map +1 -1
  33. package/dist/scenario/schema.d.ts +1245 -9
  34. package/dist/scenario/schema.d.ts.map +1 -1
  35. package/package.json +1 -1
  36. package/src/adapters/types.ts +3 -1
  37. package/src/artifacts/types.ts +39 -0
  38. package/src/cost/index.ts +14 -0
  39. package/src/cost/pricing.ts +273 -0
  40. package/src/evaluators/combined.test.ts +172 -0
  41. package/src/evaluators/combined.ts +95 -0
  42. package/src/evaluators/index.ts +12 -0
  43. package/src/evaluators/inline.test.ts +409 -0
  44. package/src/evaluators/inline.ts +393 -0
  45. package/src/evaluators/not-contains.test.ts +105 -0
  46. package/src/evaluators/not-contains.ts +45 -0
  47. package/src/evaluators/similarity.test.ts +333 -0
  48. package/src/evaluators/similarity.ts +258 -0
  49. package/src/index.ts +3 -0
  50. package/src/scenario/discovery.test.ts +153 -0
  51. package/src/scenario/discovery.ts +277 -0
  52. package/src/scenario/index.ts +1 -0
  53. package/src/scenario/schema.ts +43 -2
@@ -1 +1 @@
1
- {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoBpB,CAAC;AAEd;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA2CzB,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;EAG5B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,eAAe,gFAAuE,CAAC;AAEpG;;;GAGG;AACH,QAAA,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;GAAuC,CAAC;AAEnE;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAazB,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAE7D,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAYzB,6CAA6C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QAnB7C,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAiC7D,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAClE,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAChE,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
1
+ {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqBpB,CAAC;AAoFd;;;GAGG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA7BvB,0GAA0G;;IAE1G,8EAA8E;;IAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QAJvF,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAyBR,CAAC;AAEpF;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;EAG5B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,eAAe,gFAAuE,CAAC;AAEpG;;;GAGG;AACH,QAAA,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;GAAuC,CAAC;AAEnE;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QArDvB,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAJvF,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAE7D,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAYzB,6CAA6C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YArF3C,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAJvF,0GAA0G;;gBAE1G,8EAA8E;;gBAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAiC7D,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAClE,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAChE,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@artemiskit/core",
3
- "version": "0.1.6",
3
+ "version": "0.2.0",
4
4
  "description": "Core runner, evaluators, and storage for ArtemisKit LLM evaluation toolkit",
5
5
  "type": "module",
6
6
  "license": "Apache-2.0",
@@ -105,7 +105,7 @@ export interface ModelClient {
105
105
 
106
106
  stream?(options: GenerateOptions, onChunk: (chunk: string) => void): AsyncIterable<string>;
107
107
 
108
- embed?(text: string): Promise<number[]>;
108
+ embed?(text: string, model?: string): Promise<number[]>;
109
109
 
110
110
  capabilities(): Promise<ModelCapabilities>;
111
111
 
@@ -155,6 +155,8 @@ export interface AzureOpenAIAdapterConfig extends BaseAdapterConfig {
155
155
  resourceName: string;
156
156
  deploymentName: string;
157
157
  apiVersion: string;
158
+ /** Optional separate deployment name for embedding models */
159
+ embeddingDeploymentName?: string;
158
160
  }
159
161
 
160
162
  /**
@@ -322,6 +322,15 @@ export interface StressRequestResult {
322
322
  error?: string;
323
323
  /** Timestamp of the request */
324
324
  timestamp: number;
325
+ /** Token usage for this request */
326
+ tokens?: {
327
+ /** Prompt/input tokens */
328
+ prompt: number;
329
+ /** Completion/output tokens */
330
+ completion: number;
331
+ /** Total tokens */
332
+ total: number;
333
+ };
325
334
  }
326
335
 
327
336
  /**
@@ -352,6 +361,36 @@ export interface StressMetrics {
352
361
  p95_latency_ms: number;
353
362
  /** 99th percentile latency */
354
363
  p99_latency_ms: number;
364
+ /** Token usage metrics (optional - only if provider returns token counts) */
365
+ tokens?: {
366
+ /** Total prompt/input tokens across all requests */
367
+ total_prompt_tokens: number;
368
+ /** Total completion/output tokens across all requests */
369
+ total_completion_tokens: number;
370
+ /** Total tokens (prompt + completion) */
371
+ total_tokens: number;
372
+ /** Average tokens per request */
373
+ avg_tokens_per_request: number;
374
+ };
375
+ /** Estimated cost metrics (optional - only if cost estimation is available) */
376
+ cost?: {
377
+ /** Estimated total cost in USD */
378
+ estimated_total_usd: number;
379
+ /** Cost breakdown by token type */
380
+ breakdown: {
381
+ /** Cost for prompt/input tokens */
382
+ prompt_cost_usd: number;
383
+ /** Cost for completion/output tokens */
384
+ completion_cost_usd: number;
385
+ };
386
+ /** Model used for cost calculation */
387
+ model: string;
388
+ /** Pricing used (per 1K tokens) */
389
+ pricing: {
390
+ prompt_per_1k: number;
391
+ completion_per_1k: number;
392
+ };
393
+ };
355
394
  }
356
395
 
357
396
  /**
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Cost estimation module
3
+ */
4
+
5
+ export {
6
+ MODEL_PRICING,
7
+ DEFAULT_PRICING,
8
+ getModelPricing,
9
+ estimateCost,
10
+ formatCost,
11
+ listKnownModels,
12
+ type ModelPricing,
13
+ type CostEstimate,
14
+ } from './pricing.js';
@@ -0,0 +1,273 @@
1
+ /**
2
+ * LLM Pricing Data and Cost Estimation
3
+ *
4
+ * Pricing is per 1,000 tokens (1K tokens) in USD
5
+ * Data is updated periodically - always verify with provider's official pricing
6
+ */
7
+
8
+ export interface ModelPricing {
9
+ /** Price per 1K prompt/input tokens in USD */
10
+ promptPer1K: number;
11
+ /** Price per 1K completion/output tokens in USD */
12
+ completionPer1K: number;
13
+ /** Last updated date */
14
+ lastUpdated: string;
15
+ /** Notes about the pricing */
16
+ notes?: string;
17
+ }
18
+
19
+ export interface CostEstimate {
20
+ /** Estimated total cost in USD */
21
+ totalUsd: number;
22
+ /** Cost for prompt tokens */
23
+ promptCostUsd: number;
24
+ /** Cost for completion tokens */
25
+ completionCostUsd: number;
26
+ /** The model used for pricing */
27
+ model: string;
28
+ /** Pricing used */
29
+ pricing: ModelPricing;
30
+ }
31
+
32
+ /**
33
+ * Known model pricing data
34
+ * Prices are in USD per 1,000 tokens
35
+ */
36
+ export const MODEL_PRICING: Record<string, ModelPricing> = {
37
+ // OpenAI GPT-4 family
38
+ 'gpt-4': {
39
+ promptPer1K: 0.03,
40
+ completionPer1K: 0.06,
41
+ lastUpdated: '2024-01',
42
+ },
43
+ 'gpt-4-32k': {
44
+ promptPer1K: 0.06,
45
+ completionPer1K: 0.12,
46
+ lastUpdated: '2024-01',
47
+ },
48
+ 'gpt-4-turbo': {
49
+ promptPer1K: 0.01,
50
+ completionPer1K: 0.03,
51
+ lastUpdated: '2024-01',
52
+ },
53
+ 'gpt-4-turbo-preview': {
54
+ promptPer1K: 0.01,
55
+ completionPer1K: 0.03,
56
+ lastUpdated: '2024-01',
57
+ },
58
+ 'gpt-4o': {
59
+ promptPer1K: 0.005,
60
+ completionPer1K: 0.015,
61
+ lastUpdated: '2024-05',
62
+ },
63
+ 'gpt-4o-mini': {
64
+ promptPer1K: 0.00015,
65
+ completionPer1K: 0.0006,
66
+ lastUpdated: '2024-07',
67
+ },
68
+
69
+ // OpenAI GPT-3.5 family
70
+ 'gpt-3.5-turbo': {
71
+ promptPer1K: 0.0005,
72
+ completionPer1K: 0.0015,
73
+ lastUpdated: '2024-01',
74
+ },
75
+ 'gpt-3.5-turbo-16k': {
76
+ promptPer1K: 0.003,
77
+ completionPer1K: 0.004,
78
+ lastUpdated: '2024-01',
79
+ },
80
+
81
+ // Anthropic Claude family
82
+ 'claude-3-opus-20240229': {
83
+ promptPer1K: 0.015,
84
+ completionPer1K: 0.075,
85
+ lastUpdated: '2024-03',
86
+ },
87
+ 'claude-3-sonnet-20240229': {
88
+ promptPer1K: 0.003,
89
+ completionPer1K: 0.015,
90
+ lastUpdated: '2024-03',
91
+ },
92
+ 'claude-3-haiku-20240307': {
93
+ promptPer1K: 0.00025,
94
+ completionPer1K: 0.00125,
95
+ lastUpdated: '2024-03',
96
+ },
97
+ 'claude-3-5-sonnet-20240620': {
98
+ promptPer1K: 0.003,
99
+ completionPer1K: 0.015,
100
+ lastUpdated: '2024-06',
101
+ },
102
+ 'claude-3-5-sonnet-20241022': {
103
+ promptPer1K: 0.003,
104
+ completionPer1K: 0.015,
105
+ lastUpdated: '2024-10',
106
+ },
107
+ 'claude-3-5-haiku-20241022': {
108
+ promptPer1K: 0.0008,
109
+ completionPer1K: 0.004,
110
+ lastUpdated: '2024-10',
111
+ },
112
+ // Aliases
113
+ 'claude-3-opus': {
114
+ promptPer1K: 0.015,
115
+ completionPer1K: 0.075,
116
+ lastUpdated: '2024-03',
117
+ },
118
+ 'claude-3-sonnet': {
119
+ promptPer1K: 0.003,
120
+ completionPer1K: 0.015,
121
+ lastUpdated: '2024-03',
122
+ },
123
+ 'claude-3-haiku': {
124
+ promptPer1K: 0.00025,
125
+ completionPer1K: 0.00125,
126
+ lastUpdated: '2024-03',
127
+ },
128
+ 'claude-3.5-sonnet': {
129
+ promptPer1K: 0.003,
130
+ completionPer1K: 0.015,
131
+ lastUpdated: '2024-10',
132
+ },
133
+ 'claude-3.5-haiku': {
134
+ promptPer1K: 0.0008,
135
+ completionPer1K: 0.004,
136
+ lastUpdated: '2024-10',
137
+ },
138
+
139
+ // Legacy Claude
140
+ 'claude-2': {
141
+ promptPer1K: 0.008,
142
+ completionPer1K: 0.024,
143
+ lastUpdated: '2024-01',
144
+ },
145
+ 'claude-instant-1': {
146
+ promptPer1K: 0.0008,
147
+ completionPer1K: 0.0024,
148
+ lastUpdated: '2024-01',
149
+ },
150
+
151
+ // Azure OpenAI (same pricing as OpenAI typically)
152
+ // Add 'azure-' prefix versions if needed
153
+ };
154
+
155
+ /**
156
+ * Default pricing for unknown models
157
+ * Uses conservative estimates
158
+ */
159
+ export const DEFAULT_PRICING: ModelPricing = {
160
+ promptPer1K: 0.01,
161
+ completionPer1K: 0.03,
162
+ lastUpdated: '2024-01',
163
+ notes: 'Default pricing - verify with provider',
164
+ };
165
+
166
/**
 * Resolve pricing for a model identifier.
 *
 * Lookup order:
 *  1. exact key in MODEL_PRICING (own properties only),
 *  2. case-insensitive key match,
 *  3. substring match against known model families, most specific first,
 *  4. DEFAULT_PRICING.
 *
 * @param model Model identifier
 * @returns Pricing data, or DEFAULT_PRICING if the model is unknown
 */
export function getModelPricing(model: string): ModelPricing {
  // Own-property check: a bare `MODEL_PRICING[model]` lookup would return an
  // inherited Object.prototype member (e.g. for 'constructor' or 'toString')
  // instead of pricing data.
  if (Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)) {
    return MODEL_PRICING[model];
  }

  // Case-insensitive match against the known keys
  const lowerModel = model.toLowerCase();
  for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
    if (key.toLowerCase() === lowerModel) {
      return pricing;
    }
  }

  // Partial match for common patterns. Order matters: a more specific family
  // must precede any family whose name it contains (e.g. 'gpt-4o-mini' before
  // 'gpt-4o' before 'gpt-4'), otherwise the broader entry would win.
  const familyPatterns: Array<[string[], string]> = [
    [['gpt-4o-mini'], 'gpt-4o-mini'],
    [['gpt-4o'], 'gpt-4o'],
    [['gpt-4-turbo'], 'gpt-4-turbo'],
    // Versioned 32k ids (e.g. 'gpt-4-32k-0613') must not fall through to 'gpt-4'
    [['gpt-4-32k'], 'gpt-4-32k'],
    [['gpt-4'], 'gpt-4'],
    // Versioned 16k ids must not fall through to the base 'gpt-3.5-turbo' price
    [['gpt-3.5-turbo-16k'], 'gpt-3.5-turbo-16k'],
    [['gpt-3.5'], 'gpt-3.5-turbo'],
    [['claude-3-5-sonnet', 'claude-3.5-sonnet'], 'claude-3.5-sonnet'],
    [['claude-3-5-haiku', 'claude-3.5-haiku'], 'claude-3.5-haiku'],
    [['claude-3-opus'], 'claude-3-opus'],
    [['claude-3-sonnet'], 'claude-3-sonnet'],
    [['claude-3-haiku'], 'claude-3-haiku'],
    // Any other Claude id is priced as claude-2
    [['claude'], 'claude-2'],
  ];

  for (const [needles, pricingKey] of familyPatterns) {
    if (needles.some((needle) => lowerModel.includes(needle))) {
      return MODEL_PRICING[pricingKey];
    }
  }

  return DEFAULT_PRICING;
}
222
+
223
/**
 * Compute the estimated USD cost of a request from its token counts.
 *
 * Pricing is resolved through getModelPricing and applied per 1,000 tokens.
 *
 * @param promptTokens Number of prompt/input tokens
 * @param completionTokens Number of completion/output tokens
 * @param model Model identifier
 * @returns Cost estimate including the per-type breakdown and the pricing used
 */
export function estimateCost(
  promptTokens: number,
  completionTokens: number,
  model: string
): CostEstimate {
  const pricing = getModelPricing(model);

  // Rates are quoted per 1K tokens, so scale the raw count down first.
  const costFor = (tokenCount: number, ratePer1K: number): number =>
    (tokenCount / 1000) * ratePer1K;

  const promptCostUsd = costFor(promptTokens, pricing.promptPer1K);
  const completionCostUsd = costFor(completionTokens, pricing.completionPer1K);

  return {
    totalUsd: promptCostUsd + completionCostUsd,
    promptCostUsd,
    completionCostUsd,
    model,
    pricing,
  };
}
249
+
250
/**
 * Format a USD cost for display.
 *
 * - below $0.01: shown in cents with 4 decimals, e.g. `0.5000 cents`
 * - below $1:    shown in dollars with 4 decimals, e.g. `$0.2500`
 * - otherwise:   shown in dollars with 2 decimals, e.g. `$12.50`
 *
 * @param costUsd Cost in USD
 * @returns Formatted string
 */
export function formatCost(costUsd: number): string {
  if (costUsd < 0.01) {
    // The value is converted to cents here, so it must not carry a '$' prefix;
    // '$0.5000 cents' would name two different units for one number.
    return `${(costUsd * 100).toFixed(4)} cents`;
  }
  if (costUsd < 1) {
    return `$${costUsd.toFixed(4)}`;
  }
  return `$${costUsd.toFixed(2)}`;
}
264
+
265
/**
 * Enumerate every entry of MODEL_PRICING as a `{ model, pricing }` pair.
 *
 * @returns One item per known model, in the table's own key order
 */
export function listKnownModels(): Array<{ model: string; pricing: ModelPricing }> {
  const known: Array<{ model: string; pricing: ModelPricing }> = [];
  for (const [model, pricing] of Object.entries(MODEL_PRICING)) {
    known.push({ model, pricing });
  }
  return known;
}
@@ -0,0 +1,172 @@
1
+ /**
2
+ * Tests for CombinedEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { CombinedEvaluator } from './combined';
7
+
8
+ describe('CombinedEvaluator', () => {
9
+ const evaluator = new CombinedEvaluator();
10
+
11
+ describe('AND operator', () => {
12
+ test('passes when all expectations pass', async () => {
13
+ const result = await evaluator.evaluate('The answer is 42 and it is correct.', {
14
+ type: 'combined',
15
+ operator: 'and',
16
+ expectations: [
17
+ { type: 'contains', values: ['42'], mode: 'all' },
18
+ { type: 'contains', values: ['correct'], mode: 'all' },
19
+ ],
20
+ });
21
+ expect(result.passed).toBe(true);
22
+ expect(result.score).toBe(1);
23
+ expect(result.reason).toContain('All 2 expectations passed');
24
+ });
25
+
26
+ test('fails when one expectation fails', async () => {
27
+ const result = await evaluator.evaluate('The answer is 42.', {
28
+ type: 'combined',
29
+ operator: 'and',
30
+ expectations: [
31
+ { type: 'contains', values: ['42'], mode: 'all' },
32
+ { type: 'contains', values: ['correct'], mode: 'all' },
33
+ ],
34
+ });
35
+ expect(result.passed).toBe(false);
36
+ expect(result.score).toBe(0.5);
37
+ expect(result.reason).toContain('1/2 expectations passed');
38
+ });
39
+
40
+ test('fails when all expectations fail', async () => {
41
+ const result = await evaluator.evaluate('Nothing here.', {
42
+ type: 'combined',
43
+ operator: 'and',
44
+ expectations: [
45
+ { type: 'contains', values: ['42'], mode: 'all' },
46
+ { type: 'contains', values: ['correct'], mode: 'all' },
47
+ ],
48
+ });
49
+ expect(result.passed).toBe(false);
50
+ expect(result.score).toBe(0);
51
+ });
52
+ });
53
+
54
+ describe('OR operator', () => {
55
+ test('passes when all expectations pass', async () => {
56
+ const result = await evaluator.evaluate('The answer is 42 and it is correct.', {
57
+ type: 'combined',
58
+ operator: 'or',
59
+ expectations: [
60
+ { type: 'contains', values: ['42'], mode: 'all' },
61
+ { type: 'contains', values: ['correct'], mode: 'all' },
62
+ ],
63
+ });
64
+ expect(result.passed).toBe(true);
65
+ expect(result.score).toBe(1);
66
+ });
67
+
68
+ test('passes when one expectation passes', async () => {
69
+ const result = await evaluator.evaluate('The answer is 42.', {
70
+ type: 'combined',
71
+ operator: 'or',
72
+ expectations: [
73
+ { type: 'contains', values: ['42'], mode: 'all' },
74
+ { type: 'contains', values: ['correct'], mode: 'all' },
75
+ ],
76
+ });
77
+ expect(result.passed).toBe(true);
78
+ expect(result.score).toBe(1); // Max score
79
+ expect(result.reason).toContain('1/2 expectations passed');
80
+ });
81
+
82
+ test('fails when all expectations fail', async () => {
83
+ const result = await evaluator.evaluate('Nothing here.', {
84
+ type: 'combined',
85
+ operator: 'or',
86
+ expectations: [
87
+ { type: 'contains', values: ['42'], mode: 'all' },
88
+ { type: 'contains', values: ['correct'], mode: 'all' },
89
+ ],
90
+ });
91
+ expect(result.passed).toBe(false);
92
+ expect(result.score).toBe(0);
93
+ expect(result.reason).toContain('No expectations passed');
94
+ });
95
+ });
96
+
97
+ describe('mixed expectation types', () => {
98
+ test('combines contains and regex expectations', async () => {
99
+ const result = await evaluator.evaluate('The result is 123-456-7890.', {
100
+ type: 'combined',
101
+ operator: 'and',
102
+ expectations: [
103
+ { type: 'contains', values: ['result'], mode: 'all' },
104
+ { type: 'regex', pattern: '\\d{3}-\\d{3}-\\d{4}' },
105
+ ],
106
+ });
107
+ expect(result.passed).toBe(true);
108
+ });
109
+
110
+ test('combines exact and contains expectations', async () => {
111
+ const result = await evaluator.evaluate('Hello World', {
112
+ type: 'combined',
113
+ operator: 'or',
114
+ expectations: [
115
+ { type: 'exact', value: 'Goodbye World', caseSensitive: true },
116
+ { type: 'contains', values: ['Hello'], mode: 'all' },
117
+ ],
118
+ });
119
+ expect(result.passed).toBe(true);
120
+ });
121
+
122
+ test('combines not_contains with contains', async () => {
123
+ const result = await evaluator.evaluate('The answer is correct.', {
124
+ type: 'combined',
125
+ operator: 'and',
126
+ expectations: [
127
+ { type: 'contains', values: ['correct'], mode: 'all' },
128
+ { type: 'not_contains', values: ['error', 'wrong'], mode: 'all' },
129
+ ],
130
+ });
131
+ expect(result.passed).toBe(true);
132
+ });
133
+ });
134
+
135
+ describe('edge cases', () => {
136
+ test('handles empty expectations array', async () => {
137
+ const result = await evaluator.evaluate('Any text', {
138
+ type: 'combined',
139
+ operator: 'and',
140
+ expectations: [],
141
+ });
142
+ expect(result.passed).toBe(true);
143
+ expect(result.score).toBe(1);
144
+ });
145
+
146
+ test('handles single expectation', async () => {
147
+ const result = await evaluator.evaluate('Hello World', {
148
+ type: 'combined',
149
+ operator: 'and',
150
+ expectations: [{ type: 'contains', values: ['Hello'], mode: 'all' }],
151
+ });
152
+ expect(result.passed).toBe(true);
153
+ });
154
+
155
+ test('provides detailed results', async () => {
156
+ const result = await evaluator.evaluate('The answer is 42.', {
157
+ type: 'combined',
158
+ operator: 'and',
159
+ expectations: [
160
+ { type: 'contains', values: ['42'], mode: 'all' },
161
+ { type: 'contains', values: ['correct'], mode: 'all' },
162
+ ],
163
+ });
164
+
165
+ expect(result.details).toBeDefined();
166
+ expect(result.details?.operator).toBe('and');
167
+ expect(result.details?.results).toHaveLength(2);
168
+ expect(result.details?.passedCount).toBe(1);
169
+ expect(result.details?.totalCount).toBe(2);
170
+ });
171
+ });
172
+ });
@@ -0,0 +1,95 @@
1
+ /**
2
+ * Combined evaluator - evaluates multiple expectations with and/or logic
3
+ */
4
+
5
+ import type { Expected } from '../scenario/schema';
6
+ import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
7
+
8
/**
 * Look up the evaluator registered for an expectation type.
 *
 * The registry module is loaded lazily at call time rather than imported
 * statically, to avoid a circular dependency with index.ts.
 */
async function getEvaluatorForType(type: string): Promise<Evaluator> {
  return import('./index.js').then(({ getEvaluator }) => getEvaluator(type));
}
16
+
17
/**
 * Evaluator that combines several sub-expectations with AND / OR logic.
 *
 * - `and`: passes only if every sub-expectation passes; score is the mean of
 *   the sub-scores.
 * - any other operator (i.e. `or`): passes if at least one sub-expectation
 *   passes; score is the maximum sub-score.
 */
export class CombinedEvaluator implements Evaluator {
  readonly type = 'combined';

  /**
   * Evaluate `response` against every sub-expectation and merge the outcomes.
   *
   * @param response Model output to evaluate
   * @param expected Expectation of type `combined`
   * @param context Optional context, forwarded unchanged to each sub-evaluator
   * @returns Combined result; `details` always carries `operator`, `results`,
   *          `passedCount` and `totalCount`
   * @throws Error if `expected.type` is not `combined`
   */
  async evaluate(
    response: string,
    expected: Expected,
    context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'combined') {
      throw new Error('Invalid expected type for CombinedEvaluator');
    }

    const { operator, expectations } = expected;

    // Nothing to check: pass trivially. The details object keeps the same
    // shape as the normal path so consumers can read the counts unconditionally.
    if (!expectations || expectations.length === 0) {
      return {
        passed: true,
        score: 1,
        reason: 'No expectations to evaluate',
        details: { operator, results: [], passedCount: 0, totalCount: 0 },
      };
    }

    const results: Array<{
      type: string;
      passed: boolean;
      score: number;
      reason?: string;
    }> = [];

    // Sub-expectations are evaluated one at a time, in declaration order.
    for (const subExpected of expectations) {
      const evaluator = await getEvaluatorForType(subExpected.type);
      const result = await evaluator.evaluate(response, subExpected, context);
      results.push({
        type: subExpected.type,
        passed: result.passed,
        score: result.score,
        reason: result.reason,
      });
    }

    const totalCount = results.length;
    const passedCount = results.filter((r) => r.passed).length;

    let passed: boolean;
    let score: number;
    let reason: string;

    if (operator === 'and') {
      // AND: all must pass; score is the average of the sub-scores
      passed = passedCount === totalCount;
      score = results.reduce((sum, r) => sum + r.score, 0) / totalCount;
      reason = passed
        ? `All ${totalCount} expectations passed`
        : `${passedCount}/${totalCount} expectations passed (all required)`;
    } else {
      // OR: at least one must pass; score is the best sub-score
      passed = passedCount > 0;
      score = Math.max(...results.map((r) => r.score));
      reason = passed
        ? `${passedCount}/${totalCount} expectations passed (at least one required)`
        : 'No expectations passed (at least one required)';
    }

    return {
      passed,
      score,
      reason,
      details: {
        operator,
        results,
        passedCount,
        totalCount,
      },
    };
  }
}
@@ -2,12 +2,16 @@
2
2
  * Evaluators module - exports all evaluator types and utilities
3
3
  */
4
4
 
5
+ import { CombinedEvaluator } from './combined';
5
6
  import { ContainsEvaluator } from './contains';
6
7
  import { ExactEvaluator } from './exact';
7
8
  import { FuzzyEvaluator } from './fuzzy';
9
+ import { InlineEvaluator } from './inline';
8
10
  import { JsonSchemaEvaluator } from './json-schema';
9
11
  import { LLMGraderEvaluator } from './llm-grader';
12
+ import { NotContainsEvaluator } from './not-contains';
10
13
  import { RegexEvaluator } from './regex';
14
+ import { SimilarityEvaluator } from './similarity';
11
15
  import type { Evaluator } from './types';
12
16
 
13
17
  const evaluators = new Map<string, Evaluator>();
@@ -15,8 +19,12 @@ evaluators.set('exact', new ExactEvaluator());
15
19
  evaluators.set('regex', new RegexEvaluator());
16
20
  evaluators.set('fuzzy', new FuzzyEvaluator());
17
21
  evaluators.set('contains', new ContainsEvaluator());
22
+ evaluators.set('not_contains', new NotContainsEvaluator());
23
+ evaluators.set('combined', new CombinedEvaluator());
18
24
  evaluators.set('json_schema', new JsonSchemaEvaluator());
19
25
  evaluators.set('llm_grader', new LLMGraderEvaluator());
26
+ evaluators.set('similarity', new SimilarityEvaluator());
27
+ evaluators.set('inline', new InlineEvaluator());
20
28
 
21
29
  /**
22
30
  * Get an evaluator by type
@@ -49,5 +57,9 @@ export { ExactEvaluator } from './exact';
49
57
  export { RegexEvaluator } from './regex';
50
58
  export { FuzzyEvaluator } from './fuzzy';
51
59
  export { ContainsEvaluator } from './contains';
60
+ export { NotContainsEvaluator } from './not-contains';
61
+ export { CombinedEvaluator } from './combined';
52
62
  export { JsonSchemaEvaluator } from './json-schema';
53
63
  export { LLMGraderEvaluator } from './llm-grader';
64
+ export { SimilarityEvaluator } from './similarity';
65
+ export { InlineEvaluator, SUPPORTED_EXPRESSIONS } from './inline';