@artemiskit/core 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/dist/adapters/types.d.ts +3 -1
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +39 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/index.d.ts +5 -0
- package/dist/cost/index.d.ts.map +1 -0
- package/dist/cost/pricing.d.ts +66 -0
- package/dist/cost/pricing.d.ts.map +1 -0
- package/dist/evaluators/combined.d.ts +10 -0
- package/dist/evaluators/combined.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +4 -0
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/inline.d.ts +22 -0
- package/dist/evaluators/inline.d.ts.map +1 -0
- package/dist/evaluators/not-contains.d.ts +10 -0
- package/dist/evaluators/not-contains.d.ts.map +1 -0
- package/dist/evaluators/similarity.d.ts +16 -0
- package/dist/evaluators/similarity.d.ts.map +1 -0
- package/dist/events/emitter.d.ts +111 -0
- package/dist/events/emitter.d.ts.map +1 -0
- package/dist/events/index.d.ts +6 -0
- package/dist/events/index.d.ts.map +1 -0
- package/dist/events/types.d.ts +177 -0
- package/dist/events/types.d.ts.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13056 -12093
- package/dist/scenario/discovery.d.ts +72 -0
- package/dist/scenario/discovery.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +1 -0
- package/dist/scenario/index.d.ts.map +1 -1
- package/dist/scenario/schema.d.ts +1245 -9
- package/dist/scenario/schema.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +3 -1
- package/src/artifacts/types.ts +39 -0
- package/src/cost/index.ts +14 -0
- package/src/cost/pricing.ts +273 -0
- package/src/evaluators/combined.test.ts +172 -0
- package/src/evaluators/combined.ts +95 -0
- package/src/evaluators/index.ts +12 -0
- package/src/evaluators/inline.test.ts +409 -0
- package/src/evaluators/inline.ts +393 -0
- package/src/evaluators/not-contains.test.ts +105 -0
- package/src/evaluators/not-contains.ts +45 -0
- package/src/evaluators/similarity.test.ts +333 -0
- package/src/evaluators/similarity.ts +258 -0
- package/src/index.ts +3 -0
- package/src/scenario/discovery.test.ts +153 -0
- package/src/scenario/discovery.ts +277 -0
- package/src/scenario/index.ts +1 -0
- package/src/scenario/schema.ts +43 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqBpB,CAAC;AAoFd;;;GAGG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA7BvB,0GAA0G;;IAE1G,8EAA8E;;IAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QAJvF,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAyBR,CAAC;AAEpF;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;EAG5B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,eAAe,gFAAuE,CAAC;AAEpG;;;GAGG;AACH,QAAA,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;GAAuC,CAAC;AAEnE;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QArDvB,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAJvF,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAE7D,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAYzB,6CAA6C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YArF3C,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAJvF,0GAA0G;;gBAE1G,8EAA8E;;gBAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAiC7D,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAClE,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAChE,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
|
package/package.json
CHANGED
package/src/adapters/types.ts
CHANGED
|
@@ -105,7 +105,7 @@ export interface ModelClient {
|
|
|
105
105
|
|
|
106
106
|
stream?(options: GenerateOptions, onChunk: (chunk: string) => void): AsyncIterable<string>;
|
|
107
107
|
|
|
108
|
-
embed?(text: string): Promise<number[]>;
|
|
108
|
+
embed?(text: string, model?: string): Promise<number[]>;
|
|
109
109
|
|
|
110
110
|
capabilities(): Promise<ModelCapabilities>;
|
|
111
111
|
|
|
@@ -155,6 +155,8 @@ export interface AzureOpenAIAdapterConfig extends BaseAdapterConfig {
|
|
|
155
155
|
resourceName: string;
|
|
156
156
|
deploymentName: string;
|
|
157
157
|
apiVersion: string;
|
|
158
|
+
/** Optional separate deployment name for embedding models */
|
|
159
|
+
embeddingDeploymentName?: string;
|
|
158
160
|
}
|
|
159
161
|
|
|
160
162
|
/**
|
package/src/artifacts/types.ts
CHANGED
|
@@ -322,6 +322,15 @@ export interface StressRequestResult {
|
|
|
322
322
|
error?: string;
|
|
323
323
|
/** Timestamp of the request */
|
|
324
324
|
timestamp: number;
|
|
325
|
+
/** Token usage for this request */
|
|
326
|
+
tokens?: {
|
|
327
|
+
/** Prompt/input tokens */
|
|
328
|
+
prompt: number;
|
|
329
|
+
/** Completion/output tokens */
|
|
330
|
+
completion: number;
|
|
331
|
+
/** Total tokens */
|
|
332
|
+
total: number;
|
|
333
|
+
};
|
|
325
334
|
}
|
|
326
335
|
|
|
327
336
|
/**
|
|
@@ -352,6 +361,36 @@ export interface StressMetrics {
|
|
|
352
361
|
p95_latency_ms: number;
|
|
353
362
|
/** 99th percentile latency */
|
|
354
363
|
p99_latency_ms: number;
|
|
364
|
+
/** Token usage metrics (optional - only if provider returns token counts) */
|
|
365
|
+
tokens?: {
|
|
366
|
+
/** Total prompt/input tokens across all requests */
|
|
367
|
+
total_prompt_tokens: number;
|
|
368
|
+
/** Total completion/output tokens across all requests */
|
|
369
|
+
total_completion_tokens: number;
|
|
370
|
+
/** Total tokens (prompt + completion) */
|
|
371
|
+
total_tokens: number;
|
|
372
|
+
/** Average tokens per request */
|
|
373
|
+
avg_tokens_per_request: number;
|
|
374
|
+
};
|
|
375
|
+
/** Estimated cost metrics (optional - only if cost estimation is available) */
|
|
376
|
+
cost?: {
|
|
377
|
+
/** Estimated total cost in USD */
|
|
378
|
+
estimated_total_usd: number;
|
|
379
|
+
/** Cost breakdown by token type */
|
|
380
|
+
breakdown: {
|
|
381
|
+
/** Cost for prompt/input tokens */
|
|
382
|
+
prompt_cost_usd: number;
|
|
383
|
+
/** Cost for completion/output tokens */
|
|
384
|
+
completion_cost_usd: number;
|
|
385
|
+
};
|
|
386
|
+
/** Model used for cost calculation */
|
|
387
|
+
model: string;
|
|
388
|
+
/** Pricing used (per 1K tokens) */
|
|
389
|
+
pricing: {
|
|
390
|
+
prompt_per_1k: number;
|
|
391
|
+
completion_per_1k: number;
|
|
392
|
+
};
|
|
393
|
+
};
|
|
355
394
|
}
|
|
356
395
|
|
|
357
396
|
/**
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Pricing Data and Cost Estimation
|
|
3
|
+
*
|
|
4
|
+
* Pricing is per 1,000 tokens (1K tokens) in USD
|
|
5
|
+
* Data is updated periodically - always verify with provider's official pricing
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Price sheet for a single model. Both rates are expressed in USD per
 * 1,000 tokens.
 */
export interface ModelPricing {
  /** USD charged for every 1,000 prompt (input) tokens. */
  promptPer1K: number;
  /** USD charged for every 1,000 completion (output) tokens. */
  completionPer1K: number;
  /** When this entry was last refreshed, e.g. '2024-01'. */
  lastUpdated: string;
  /** Optional free-form remark about this entry. */
  notes?: string;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Result of pricing a prompt/completion token pair against one model.
 */
export interface CostEstimate {
  /** Combined estimate in USD. */
  totalUsd: number;
  /** USD attributed to the prompt tokens. */
  promptCostUsd: number;
  /** USD attributed to the completion tokens. */
  completionCostUsd: number;
  /** Model identifier the estimate was computed for. */
  model: string;
  /** The price sheet that was applied. */
  pricing: ModelPricing;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Built-in price table, keyed by model identifier.
 * Every rate is USD per 1,000 tokens; one compact row per model.
 */
export const MODEL_PRICING: Record<string, ModelPricing> = {
  // OpenAI GPT-4 family
  'gpt-4': { promptPer1K: 0.03, completionPer1K: 0.06, lastUpdated: '2024-01' },
  'gpt-4-32k': { promptPer1K: 0.06, completionPer1K: 0.12, lastUpdated: '2024-01' },
  'gpt-4-turbo': { promptPer1K: 0.01, completionPer1K: 0.03, lastUpdated: '2024-01' },
  'gpt-4-turbo-preview': { promptPer1K: 0.01, completionPer1K: 0.03, lastUpdated: '2024-01' },
  'gpt-4o': { promptPer1K: 0.005, completionPer1K: 0.015, lastUpdated: '2024-05' },
  'gpt-4o-mini': { promptPer1K: 0.00015, completionPer1K: 0.0006, lastUpdated: '2024-07' },

  // OpenAI GPT-3.5 family
  'gpt-3.5-turbo': { promptPer1K: 0.0005, completionPer1K: 0.0015, lastUpdated: '2024-01' },
  'gpt-3.5-turbo-16k': { promptPer1K: 0.003, completionPer1K: 0.004, lastUpdated: '2024-01' },

  // Anthropic Claude family (dated identifiers)
  'claude-3-opus-20240229': { promptPer1K: 0.015, completionPer1K: 0.075, lastUpdated: '2024-03' },
  'claude-3-sonnet-20240229': { promptPer1K: 0.003, completionPer1K: 0.015, lastUpdated: '2024-03' },
  'claude-3-haiku-20240307': { promptPer1K: 0.00025, completionPer1K: 0.00125, lastUpdated: '2024-03' },
  'claude-3-5-sonnet-20240620': { promptPer1K: 0.003, completionPer1K: 0.015, lastUpdated: '2024-06' },
  'claude-3-5-sonnet-20241022': { promptPer1K: 0.003, completionPer1K: 0.015, lastUpdated: '2024-10' },
  'claude-3-5-haiku-20241022': { promptPer1K: 0.0008, completionPer1K: 0.004, lastUpdated: '2024-10' },

  // Aliases (undated identifiers)
  'claude-3-opus': { promptPer1K: 0.015, completionPer1K: 0.075, lastUpdated: '2024-03' },
  'claude-3-sonnet': { promptPer1K: 0.003, completionPer1K: 0.015, lastUpdated: '2024-03' },
  'claude-3-haiku': { promptPer1K: 0.00025, completionPer1K: 0.00125, lastUpdated: '2024-03' },
  'claude-3.5-sonnet': { promptPer1K: 0.003, completionPer1K: 0.015, lastUpdated: '2024-10' },
  'claude-3.5-haiku': { promptPer1K: 0.0008, completionPer1K: 0.004, lastUpdated: '2024-10' },

  // Legacy Claude
  'claude-2': { promptPer1K: 0.008, completionPer1K: 0.024, lastUpdated: '2024-01' },
  'claude-instant-1': { promptPer1K: 0.0008, completionPer1K: 0.0024, lastUpdated: '2024-01' },

  // Azure OpenAI (same pricing as OpenAI typically)
  // Add 'azure-' prefix versions if needed
};
|
|
154
|
+
|
|
155
|
+
/**
 * Fallback price sheet for models that are not in the price table.
 * The rates equal the table's 'gpt-4-turbo' entry; the note reminds readers
 * to verify against the provider's own pricing.
 */
export const DEFAULT_PRICING: ModelPricing = {
  promptPer1K: 0.01,
  completionPer1K: 0.03,
  lastUpdated: '2024-01',
  notes: 'Default pricing - verify with provider',
};
|
|
165
|
+
|
|
166
|
+
/**
 * Ordered substring rules used by `getModelPricing` once exact and
 * case-insensitive key lookups have failed.
 *
 * Rules are tried top to bottom and the first hit wins, so a fragment that
 * contains another fragment must come first ('gpt-4o-mini' before 'gpt-4o'
 * before 'gpt-4'; 'claude-instant' before 'claude').
 * The 'gpt-35' spellings cover Azure OpenAI model names, which omit the dot.
 */
const PARTIAL_MATCH_RULES: ReadonlyArray<{ needles: readonly string[]; key: string }> = [
  { needles: ['gpt-4o-mini'], key: 'gpt-4o-mini' },
  { needles: ['gpt-4o'], key: 'gpt-4o' },
  { needles: ['gpt-4-turbo'], key: 'gpt-4-turbo' },
  { needles: ['gpt-4-32k'], key: 'gpt-4-32k' },
  { needles: ['gpt-4'], key: 'gpt-4' },
  { needles: ['gpt-3.5-turbo-16k', 'gpt-35-turbo-16k'], key: 'gpt-3.5-turbo-16k' },
  { needles: ['gpt-3.5', 'gpt-35'], key: 'gpt-3.5-turbo' },
  { needles: ['claude-3-5-sonnet', 'claude-3.5-sonnet'], key: 'claude-3.5-sonnet' },
  { needles: ['claude-3-5-haiku', 'claude-3.5-haiku'], key: 'claude-3.5-haiku' },
  { needles: ['claude-3-opus'], key: 'claude-3-opus' },
  { needles: ['claude-3-sonnet'], key: 'claude-3-sonnet' },
  { needles: ['claude-3-haiku'], key: 'claude-3-haiku' },
  { needles: ['claude-instant'], key: 'claude-instant-1' },
  { needles: ['claude'], key: 'claude-2' },
];

/**
 * Get pricing for a model.
 *
 * Resolution order:
 *  1. exact key in MODEL_PRICING (own properties only),
 *  2. case-insensitive key match,
 *  3. first matching substring rule in PARTIAL_MATCH_RULES,
 *  4. DEFAULT_PRICING.
 *
 * @param model Model identifier
 * @returns Pricing data, or DEFAULT_PRICING when the model is unknown
 */
export function getModelPricing(model: string): ModelPricing {
  // Own-property check: a plain `MODEL_PRICING[model]` would also resolve
  // inherited names such as 'constructor' or 'toString' to non-pricing values.
  if (Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)) {
    return MODEL_PRICING[model];
  }

  const lowerModel = model.toLowerCase();

  const caseInsensitiveHit = Object.entries(MODEL_PRICING).find(
    ([key]) => key.toLowerCase() === lowerModel
  );
  if (caseInsensitiveHit) {
    return caseInsensitiveHit[1];
  }

  for (const { needles, key } of PARTIAL_MATCH_RULES) {
    if (needles.some((needle) => lowerModel.includes(needle))) {
      return MODEL_PRICING[key];
    }
  }

  return DEFAULT_PRICING;
}
|
|
222
|
+
|
|
223
|
+
/**
 * Price a prompt/completion token pair for the given model.
 *
 * The price sheet comes from `getModelPricing(model)`; each side is
 * computed as `(tokens / 1000) * ratePer1K` and the two are summed.
 *
 * @param promptTokens Number of prompt/input tokens
 * @param completionTokens Number of completion/output tokens
 * @param model Model identifier
 * @returns Cost estimate, including the price sheet that was applied
 */
export function estimateCost(
  promptTokens: number,
  completionTokens: number,
  model: string
): CostEstimate {
  const pricing = getModelPricing(model);
  const usdFor = (tokens: number, ratePer1K: number): number => (tokens / 1000) * ratePer1K;

  const promptCostUsd = usdFor(promptTokens, pricing.promptPer1K);
  const completionCostUsd = usdFor(completionTokens, pricing.completionPer1K);

  return {
    totalUsd: promptCostUsd + completionCostUsd,
    promptCostUsd,
    completionCostUsd,
    model,
    pricing,
  };
}
|
|
249
|
+
|
|
250
|
+
/**
 * Format cost for display.
 *
 * - below $0.01: shown in cents with four decimals, e.g. `0.5000 cents`
 *   (no `$` prefix, because the number is no longer a dollar amount)
 * - below $1: dollars with four decimals, e.g. `$0.1234`
 * - otherwise: dollars with two decimals, e.g. `$12.50`
 *
 * @param costUsd Cost in USD
 * @returns Formatted string
 */
export function formatCost(costUsd: number): string {
  if (costUsd < 0.01) {
    // The value is converted to cents here, so it must not carry a dollar sign.
    return `${(costUsd * 100).toFixed(4)} cents`;
  }
  if (costUsd < 1) {
    return `$${costUsd.toFixed(4)}`;
  }
  return `$${costUsd.toFixed(2)}`;
}
|
|
264
|
+
|
|
265
|
+
/**
 * Enumerate every entry of MODEL_PRICING as `{ model, pricing }` pairs,
 * in the table's own key order.
 */
export function listKnownModels(): Array<{ model: string; pricing: ModelPricing }> {
  const known: Array<{ model: string; pricing: ModelPricing }> = [];
  for (const [model, pricing] of Object.entries(MODEL_PRICING)) {
    known.push({ model, pricing });
  }
  return known;
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for CombinedEvaluator
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test';
|
|
6
|
+
import { CombinedEvaluator } from './combined';
|
|
7
|
+
|
|
8
|
+
describe('CombinedEvaluator', () => {
  const evaluator = new CombinedEvaluator();

  // Shared response fixtures: which of the two probe words ('42', 'correct') they contain.
  const BOTH_WORDS = 'The answer is 42 and it is correct.';
  const ONLY_42 = 'The answer is 42.';
  const NEITHER_WORD = 'Nothing here.';

  // Builds a `contains` sub-expectation that requires every listed value.
  const containsAll = (...values: string[]) => ({
    type: 'contains' as const,
    values,
    mode: 'all' as const,
  });

  describe('AND operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score, reason } = await evaluator.evaluate(BOTH_WORDS, {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(true);
      expect(score).toBe(1);
      expect(reason).toContain('All 2 expectations passed');
    });

    test('fails when one expectation fails', async () => {
      const { passed, score, reason } = await evaluator.evaluate(ONLY_42, {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(false);
      expect(score).toBe(0.5);
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score } = await evaluator.evaluate(NEITHER_WORD, {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(false);
      expect(score).toBe(0);
    });
  });

  describe('OR operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score } = await evaluator.evaluate(BOTH_WORDS, {
        type: 'combined',
        operator: 'or',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('passes when one expectation passes', async () => {
      const { passed, score, reason } = await evaluator.evaluate(ONLY_42, {
        type: 'combined',
        operator: 'or',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(true);
      // OR reports the best sub-score, so a single pass yields 1.
      expect(score).toBe(1);
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score, reason } = await evaluator.evaluate(NEITHER_WORD, {
        type: 'combined',
        operator: 'or',
        expectations: [containsAll('42'), containsAll('correct')],
      });
      expect(passed).toBe(false);
      expect(score).toBe(0);
      expect(reason).toContain('No expectations passed');
    });
  });

  describe('mixed expectation types', () => {
    test('combines contains and regex expectations', async () => {
      const { passed } = await evaluator.evaluate('The result is 123-456-7890.', {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('result'), { type: 'regex', pattern: '\\d{3}-\\d{3}-\\d{4}' }],
      });
      expect(passed).toBe(true);
    });

    test('combines exact and contains expectations', async () => {
      const { passed } = await evaluator.evaluate('Hello World', {
        type: 'combined',
        operator: 'or',
        expectations: [
          { type: 'exact', value: 'Goodbye World', caseSensitive: true },
          containsAll('Hello'),
        ],
      });
      expect(passed).toBe(true);
    });

    test('combines not_contains with contains', async () => {
      const { passed } = await evaluator.evaluate('The answer is correct.', {
        type: 'combined',
        operator: 'and',
        expectations: [
          containsAll('correct'),
          { type: 'not_contains', values: ['error', 'wrong'], mode: 'all' },
        ],
      });
      expect(passed).toBe(true);
    });
  });

  describe('edge cases', () => {
    test('handles empty expectations array', async () => {
      const { passed, score } = await evaluator.evaluate('Any text', {
        type: 'combined',
        operator: 'and',
        expectations: [],
      });
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('handles single expectation', async () => {
      const { passed } = await evaluator.evaluate('Hello World', {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('Hello')],
      });
      expect(passed).toBe(true);
    });

    test('provides detailed results', async () => {
      const { details } = await evaluator.evaluate(ONLY_42, {
        type: 'combined',
        operator: 'and',
        expectations: [containsAll('42'), containsAll('correct')],
      });

      expect(details).toBeDefined();
      expect(details?.operator).toBe('and');
      expect(details?.results).toHaveLength(2);
      expect(details?.passedCount).toBe(1);
      expect(details?.totalCount).toBe(2);
    });
  });
});
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Combined evaluator - evaluates multiple expectations with and/or logic
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { Expected } from '../scenario/schema';
|
|
6
|
+
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
7
|
+
|
|
8
|
+
/**
 * Resolve the evaluator registered for `type`.
 *
 * The registry module is loaded lazily with a dynamic import because
 * index.ts imports this file; a static import would be circular.
 */
async function getEvaluatorForType(type: string): Promise<Evaluator> {
  const lookupEvaluator = (await import('./index.js')).getEvaluator;
  return lookupEvaluator(type);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Evaluates a list of sub-expectations against one response and merges the
 * outcomes with boolean logic.
 *
 * - operator 'and': passes only if every sub-expectation passes; the score is
 *   the mean of the sub-scores.
 * - any other operator (i.e. 'or'): passes if at least one sub-expectation
 *   passes; the score is the highest sub-score.
 *
 * An empty or missing expectation list passes with score 1.
 */
export class CombinedEvaluator implements Evaluator {
  readonly type = 'combined';

  /**
   * @param response Model output to check
   * @param expected Expectation of type 'combined' (operator + sub-expectations)
   * @param context Optional evaluator context, forwarded unchanged to every sub-evaluator
   * @returns Merged result; `details` always carries `operator`, `results`,
   *          `passedCount` and `totalCount`
   * @throws Error if `expected.type` is not 'combined'
   */
  async evaluate(
    response: string,
    expected: Expected,
    context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'combined') {
      throw new Error('Invalid expected type for CombinedEvaluator');
    }

    const { operator, expectations } = expected;

    if (!expectations || expectations.length === 0) {
      return {
        passed: true,
        score: 1,
        reason: 'No expectations to evaluate',
        // Same shape as the non-empty path so consumers can read the counts unconditionally.
        details: { operator, results: [], passedCount: 0, totalCount: 0 },
      };
    }

    // Evaluate sub-expectations one at a time, in declaration order.
    const results: Array<{
      type: string;
      passed: boolean;
      score: number;
      reason?: string;
    }> = [];

    for (const subExpected of expectations) {
      const evaluator = await getEvaluatorForType(subExpected.type);
      const result = await evaluator.evaluate(response, subExpected, context);
      results.push({
        type: subExpected.type,
        passed: result.passed,
        score: result.score,
        reason: result.reason,
      });
    }

    const totalCount = results.length;
    const passedCount = results.filter((r) => r.passed).length;

    let passed: boolean;
    let score: number;
    let reason: string;

    if (operator === 'and') {
      // AND: all must pass; score is the mean of the sub-scores.
      passed = passedCount === totalCount;
      score = results.reduce((sum, r) => sum + r.score, 0) / totalCount;
      reason = passed
        ? `All ${totalCount} expectations passed`
        : `${passedCount}/${totalCount} expectations passed (all required)`;
    } else {
      // OR: at least one must pass; score is the best sub-score.
      // A fold is used instead of Math.max(...spread) so very long lists cannot
      // hit the engine's argument-count limit.
      passed = passedCount > 0;
      score = results.reduce((best, r) => Math.max(best, r.score), Number.NEGATIVE_INFINITY);
      reason = passed
        ? `${passedCount}/${totalCount} expectations passed (at least one required)`
        : 'No expectations passed (at least one required)';
    }

    return {
      passed,
      score,
      reason,
      details: {
        operator,
        results,
        passedCount,
        totalCount,
      },
    };
  }
}
|
package/src/evaluators/index.ts
CHANGED
|
@@ -2,12 +2,16 @@
|
|
|
2
2
|
* Evaluators module - exports all evaluator types and utilities
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import { CombinedEvaluator } from './combined';
|
|
5
6
|
import { ContainsEvaluator } from './contains';
|
|
6
7
|
import { ExactEvaluator } from './exact';
|
|
7
8
|
import { FuzzyEvaluator } from './fuzzy';
|
|
9
|
+
import { InlineEvaluator } from './inline';
|
|
8
10
|
import { JsonSchemaEvaluator } from './json-schema';
|
|
9
11
|
import { LLMGraderEvaluator } from './llm-grader';
|
|
12
|
+
import { NotContainsEvaluator } from './not-contains';
|
|
10
13
|
import { RegexEvaluator } from './regex';
|
|
14
|
+
import { SimilarityEvaluator } from './similarity';
|
|
11
15
|
import type { Evaluator } from './types';
|
|
12
16
|
|
|
13
17
|
const evaluators = new Map<string, Evaluator>();
|
|
@@ -15,8 +19,12 @@ evaluators.set('exact', new ExactEvaluator());
|
|
|
15
19
|
evaluators.set('regex', new RegexEvaluator());
|
|
16
20
|
evaluators.set('fuzzy', new FuzzyEvaluator());
|
|
17
21
|
evaluators.set('contains', new ContainsEvaluator());
|
|
22
|
+
evaluators.set('not_contains', new NotContainsEvaluator());
|
|
23
|
+
evaluators.set('combined', new CombinedEvaluator());
|
|
18
24
|
evaluators.set('json_schema', new JsonSchemaEvaluator());
|
|
19
25
|
evaluators.set('llm_grader', new LLMGraderEvaluator());
|
|
26
|
+
evaluators.set('similarity', new SimilarityEvaluator());
|
|
27
|
+
evaluators.set('inline', new InlineEvaluator());
|
|
20
28
|
|
|
21
29
|
/**
|
|
22
30
|
* Get an evaluator by type
|
|
@@ -49,5 +57,9 @@ export { ExactEvaluator } from './exact';
|
|
|
49
57
|
export { RegexEvaluator } from './regex';
|
|
50
58
|
export { FuzzyEvaluator } from './fuzzy';
|
|
51
59
|
export { ContainsEvaluator } from './contains';
|
|
60
|
+
export { NotContainsEvaluator } from './not-contains';
|
|
61
|
+
export { CombinedEvaluator } from './combined';
|
|
52
62
|
export { JsonSchemaEvaluator } from './json-schema';
|
|
53
63
|
export { LLMGraderEvaluator } from './llm-grader';
|
|
64
|
+
export { SimilarityEvaluator } from './similarity';
|
|
65
|
+
export { InlineEvaluator, SUPPORTED_EXPRESSIONS } from './inline';
|