@artemiskit/core 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CHANGELOG.md +116 -0
  2. package/dist/adapters/types.d.ts +8 -1
  3. package/dist/adapters/types.d.ts.map +1 -1
  4. package/dist/artifacts/types.d.ts +39 -0
  5. package/dist/artifacts/types.d.ts.map +1 -1
  6. package/dist/cost/index.d.ts +5 -0
  7. package/dist/cost/index.d.ts.map +1 -0
  8. package/dist/cost/pricing.d.ts +67 -0
  9. package/dist/cost/pricing.d.ts.map +1 -0
  10. package/dist/evaluators/combined.d.ts +10 -0
  11. package/dist/evaluators/combined.d.ts.map +1 -0
  12. package/dist/evaluators/index.d.ts +4 -0
  13. package/dist/evaluators/index.d.ts.map +1 -1
  14. package/dist/evaluators/inline.d.ts +22 -0
  15. package/dist/evaluators/inline.d.ts.map +1 -0
  16. package/dist/evaluators/llm-grader.d.ts.map +1 -1
  17. package/dist/evaluators/not-contains.d.ts +10 -0
  18. package/dist/evaluators/not-contains.d.ts.map +1 -0
  19. package/dist/evaluators/similarity.d.ts +16 -0
  20. package/dist/evaluators/similarity.d.ts.map +1 -0
  21. package/dist/index.d.ts +1 -0
  22. package/dist/index.d.ts.map +1 -1
  23. package/dist/index.js +13212 -12018
  24. package/dist/scenario/discovery.d.ts +72 -0
  25. package/dist/scenario/discovery.d.ts.map +1 -0
  26. package/dist/scenario/index.d.ts +1 -0
  27. package/dist/scenario/index.d.ts.map +1 -1
  28. package/dist/scenario/schema.d.ts +1253 -9
  29. package/dist/scenario/schema.d.ts.map +1 -1
  30. package/dist/storage/local.d.ts +44 -2
  31. package/dist/storage/local.d.ts.map +1 -1
  32. package/dist/storage/types.d.ts +62 -0
  33. package/dist/storage/types.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/adapters/types.ts +8 -1
  36. package/src/artifacts/types.ts +39 -0
  37. package/src/cost/index.ts +14 -0
  38. package/src/cost/pricing.ts +450 -0
  39. package/src/evaluators/combined.test.ts +172 -0
  40. package/src/evaluators/combined.ts +95 -0
  41. package/src/evaluators/index.ts +12 -0
  42. package/src/evaluators/inline.test.ts +409 -0
  43. package/src/evaluators/inline.ts +393 -0
  44. package/src/evaluators/llm-grader.ts +45 -13
  45. package/src/evaluators/not-contains.test.ts +105 -0
  46. package/src/evaluators/not-contains.ts +45 -0
  47. package/src/evaluators/similarity.test.ts +333 -0
  48. package/src/evaluators/similarity.ts +258 -0
  49. package/src/index.ts +3 -0
  50. package/src/scenario/discovery.test.ts +153 -0
  51. package/src/scenario/discovery.ts +277 -0
  52. package/src/scenario/index.ts +1 -0
  53. package/src/scenario/schema.ts +47 -2
  54. package/src/storage/local.test.ts +243 -0
  55. package/src/storage/local.ts +162 -2
  56. package/src/storage/types.ts +73 -0
@@ -0,0 +1,450 @@
1
+ /**
2
+ * LLM Pricing Data and Cost Estimation
3
+ *
4
+ * Pricing is per 1,000 tokens (1K tokens) in USD
5
+ * Data is updated periodically - always verify with provider's official pricing
6
+ * Last comprehensive update: January 2026
7
+ */
8
+
9
/**
 * Token rates for a single model, expressed in USD per 1,000 tokens.
 */
export interface ModelPricing {
  /** USD charged per 1K prompt (input) tokens */
  promptPer1K: number;
  /** USD charged per 1K completion (output) tokens */
  completionPer1K: number;
  /** When this entry was last refreshed (the built-in table uses 'YYYY-MM') */
  lastUpdated: string;
  /** Optional free-form remarks about this entry */
  notes?: string;
}
19
+
20
/**
 * Result of pricing a prompt/completion token pair against one model.
 */
export interface CostEstimate {
  /** Sum of the prompt and completion costs, in USD */
  totalUsd: number;
  /** USD attributed to the prompt (input) tokens */
  promptCostUsd: number;
  /** USD attributed to the completion (output) tokens */
  completionCostUsd: number;
  /** Model identifier the estimate was requested for */
  model: string;
  /** The rates that were applied to produce the figures above */
  pricing: ModelPricing;
}
32
+
33
/**
 * Build one pricing-table entry stamped with the table's review month.
 * `notes` is attached only when supplied, so entries without remarks
 * carry no `notes` key at all.
 */
function pricingEntry(promptPer1K: number, completionPer1K: number, notes?: string): ModelPricing {
  const entry: ModelPricing = { promptPer1K, completionPer1K, lastUpdated: '2026-01' };
  if (notes !== undefined) {
    entry.notes = notes;
  }
  return entry;
}

/**
 * Built-in pricing table, keyed by model identifier.
 * Every rate is in USD per 1,000 tokens: (prompt rate, completion rate, optional notes).
 */
export const MODEL_PRICING: Record<string, ModelPricing> = {
  // ---- OpenAI GPT-5 family (Latest - 2025) ----
  'gpt-5': pricingEntry(0.00125, 0.01, '400K context window'),
  'gpt-5.1': pricingEntry(0.00125, 0.01),
  'gpt-5.2': pricingEntry(0.00175, 0.014),
  'gpt-5-mini': pricingEntry(0.00025, 0.002),
  'gpt-5-nano': pricingEntry(0.00005, 0.0004),

  // ---- OpenAI GPT-4.1 family (2025) ----
  'gpt-4.1': pricingEntry(0.002, 0.008, '1M context window'),
  'gpt-4.1-mini': pricingEntry(0.0004, 0.0016),
  'gpt-4.1-nano': pricingEntry(0.0001, 0.0004),

  // ---- OpenAI GPT-4o family (2024-2025) ----
  'gpt-4o': pricingEntry(0.0025, 0.01, '128K context window'),
  'gpt-4o-mini': pricingEntry(0.00015, 0.0006, '128K context window'),

  // ---- OpenAI O-series (reasoning models) ----
  o1: pricingEntry(0.015, 0.06, 'Reasoning model - internal thinking tokens billed as output'),
  o3: pricingEntry(0.002, 0.008),
  'o3-mini': pricingEntry(0.0011, 0.0044),
  'o4-mini': pricingEntry(0.0011, 0.0044),

  // ---- OpenAI legacy GPT-4 / GPT-3.5 ----
  'gpt-4-turbo': pricingEntry(0.01, 0.03),
  'gpt-4': pricingEntry(0.03, 0.06),
  'gpt-3.5-turbo': pricingEntry(0.0005, 0.0015),

  // ---- Anthropic Claude 4.5 family (Latest - 2025) ----
  'claude-opus-4.5': pricingEntry(0.005, 0.025, 'Most capable Claude model'),
  'claude-sonnet-4.5': pricingEntry(0.003, 0.015, 'Balanced performance and cost'),
  'claude-haiku-4.5': pricingEntry(0.001, 0.005, 'Fastest Claude model'),

  // ---- Anthropic Claude 4 family (2025) ----
  'claude-opus-4': pricingEntry(0.015, 0.075),
  'claude-opus-4.1': pricingEntry(0.015, 0.075),
  'claude-sonnet-4': pricingEntry(0.003, 0.015),

  // ---- Anthropic Claude 3.7 family ----
  'claude-sonnet-3.7': pricingEntry(0.003, 0.015),
  'claude-3-7-sonnet': pricingEntry(0.003, 0.015),

  // ---- Anthropic Claude 3.5 family (Legacy) ----
  'claude-3-5-sonnet-20241022': pricingEntry(0.003, 0.015),
  'claude-3-5-haiku-20241022': pricingEntry(0.0008, 0.004),
  'claude-haiku-3.5': pricingEntry(0.0008, 0.004),

  // ---- Anthropic Claude 3 family (Legacy) ----
  'claude-3-opus': pricingEntry(0.015, 0.075),
  'claude-3-sonnet': pricingEntry(0.003, 0.015),
  'claude-3-haiku': pricingEntry(0.00025, 0.00125),

  // ---- Aliases for common naming patterns ----
  'claude-3.5-sonnet': pricingEntry(0.003, 0.015),
  'claude-3.5-haiku': pricingEntry(0.0008, 0.004),
};
253
+
254
/**
 * Fallback rates returned for models that are not recognised.
 * The figures are a conservative, mid-tier estimate; confirm real costs with the provider.
 */
export const DEFAULT_PRICING: ModelPricing = {
  promptPer1K: 0.003,
  completionPer1K: 0.015,
  lastUpdated: '2026-01',
  notes: 'Default pricing - verify with provider',
};
264
+
265
/**
 * Ordered substring rules consulted when a model id has no exact table entry.
 *
 * Order is significant: the first rule with a matching fragment wins, so more
 * specific fragments (e.g. 'gpt-5-mini') must stay ahead of the broader ones
 * they contain (e.g. 'gpt-5'). Fragments are compared against the lower-cased
 * model id.
 */
const PARTIAL_MATCH_RULES: ReadonlyArray<{
  readonly fragments: readonly string[];
  readonly key: string;
}> = [
  // GPT-5 family
  { fragments: ['gpt-5.2'], key: 'gpt-5.2' },
  { fragments: ['gpt-5.1'], key: 'gpt-5.1' },
  { fragments: ['gpt-5-mini'], key: 'gpt-5-mini' },
  { fragments: ['gpt-5-nano'], key: 'gpt-5-nano' },
  { fragments: ['gpt-5'], key: 'gpt-5' },

  // GPT-4.1 family
  { fragments: ['gpt-4.1-mini'], key: 'gpt-4.1-mini' },
  { fragments: ['gpt-4.1-nano'], key: 'gpt-4.1-nano' },
  { fragments: ['gpt-4.1'], key: 'gpt-4.1' },

  // GPT-4o family
  { fragments: ['gpt-4o-mini'], key: 'gpt-4o-mini' },
  { fragments: ['gpt-4o'], key: 'gpt-4o' },

  // O-series (NOTE: 'o3' / 'o1' are plain substring checks, as before)
  { fragments: ['o4-mini'], key: 'o4-mini' },
  { fragments: ['o3-mini'], key: 'o3-mini' },
  { fragments: ['o3'], key: 'o3' },
  { fragments: ['o1'], key: 'o1' },

  // Legacy GPT
  { fragments: ['gpt-4-turbo'], key: 'gpt-4-turbo' },
  { fragments: ['gpt-4'], key: 'gpt-4' },
  { fragments: ['gpt-3.5'], key: 'gpt-3.5-turbo' },

  // Claude 4.5 family
  { fragments: ['opus-4.5', 'opus-4-5'], key: 'claude-opus-4.5' },
  { fragments: ['sonnet-4.5', 'sonnet-4-5'], key: 'claude-sonnet-4.5' },
  { fragments: ['haiku-4.5', 'haiku-4-5'], key: 'claude-haiku-4.5' },

  // Claude 4 family
  { fragments: ['opus-4.1', 'opus-4-1'], key: 'claude-opus-4.1' },
  { fragments: ['opus-4'], key: 'claude-opus-4' },
  { fragments: ['sonnet-4'], key: 'claude-sonnet-4' },

  // Claude 3.7 family. The 'claude-3-7-sonnet' / 'claude-3.7-sonnet' spellings
  // cover dated or prefixed ids such as 'claude-3-7-sonnet-20250219', which
  // would otherwise only be caught by the generic Claude fallback.
  {
    fragments: ['sonnet-3.7', 'sonnet-3-7', 'claude-3-7-sonnet', 'claude-3.7-sonnet'],
    key: 'claude-sonnet-3.7',
  },

  // Claude 3.5 family
  { fragments: ['claude-3-5-sonnet', 'claude-3.5-sonnet'], key: 'claude-3.5-sonnet' },
  { fragments: ['claude-3-5-haiku', 'claude-3.5-haiku'], key: 'claude-3.5-haiku' },

  // Claude 3 family
  { fragments: ['claude-3-opus'], key: 'claude-3-opus' },
  { fragments: ['claude-3-sonnet'], key: 'claude-3-sonnet' },
  { fragments: ['claude-3-haiku'], key: 'claude-3-haiku' },

  // Generic Claude fallback
  { fragments: ['claude'], key: 'claude-sonnet-4.5' },
];

/**
 * Get pricing for a model.
 *
 * Lookup order:
 *  1. exact key in MODEL_PRICING (own properties only),
 *  2. case-insensitive key match,
 *  3. first matching substring rule (see PARTIAL_MATCH_RULES),
 *  4. DEFAULT_PRICING.
 *
 * @param model Model identifier
 * @returns Pricing data, or DEFAULT_PRICING if the model is unknown
 */
export function getModelPricing(model: string): ModelPricing {
  // Own-property check: a bare `MODEL_PRICING[model]` would also resolve names
  // inherited from Object.prototype ('constructor', 'toString', '__proto__', ...)
  // and hand back something that is not pricing data.
  if (Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)) {
    return MODEL_PRICING[model];
  }

  const lowerModel = model.toLowerCase();

  // Case-insensitive match against the table keys
  for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
    if (key.toLowerCase() === lowerModel) {
      return pricing;
    }
  }

  // Family / alias match by substring, most specific rule first
  for (const rule of PARTIAL_MATCH_RULES) {
    if (rule.fragments.some((fragment) => lowerModel.includes(fragment))) {
      // The table is exported and mutable; fall back if the target entry is gone.
      return MODEL_PRICING[rule.key] ?? DEFAULT_PRICING;
    }
  }

  return DEFAULT_PRICING;
}
399
+
400
/**
 * Price a prompt/completion token pair using the rates resolved for `model`.
 * @param promptTokens Number of prompt/input tokens
 * @param completionTokens Number of completion/output tokens
 * @param model Model identifier used to look up the rates
 * @returns The per-part costs, their total, and the rates that were applied
 */
export function estimateCost(
  promptTokens: number,
  completionTokens: number,
  model: string
): CostEstimate {
  const rates = getModelPricing(model);

  // Rates are quoted per 1,000 tokens, so scale the token count first.
  const costOf = (tokens: number, ratePer1K: number): number => (tokens / 1000) * ratePer1K;

  const promptCostUsd = costOf(promptTokens, rates.promptPer1K);
  const completionCostUsd = costOf(completionTokens, rates.completionPer1K);

  return {
    totalUsd: promptCostUsd + completionCostUsd,
    promptCostUsd,
    completionCostUsd,
    model,
    pricing: rates,
  };
}
426
+
427
/**
 * Format a USD amount for display.
 *
 * The format is chosen by the amount's magnitude:
 *  - below one cent: shown in cents with 4 decimals, e.g. `0.5000 cents`
 *    (no `$` prefix — the figure is cents, not dollars),
 *  - below one dollar: dollars with 4 decimals, e.g. `$0.5000`,
 *  - otherwise: dollars with 2 decimals, e.g. `$12.50`.
 *
 * Negative amounts keep the same precision as their positive counterpart and
 * are prefixed with `-`.
 *
 * @param costUsd Cost in USD
 * @returns Formatted string
 */
export function formatCost(costUsd: number): string {
  // Pick the precision from the magnitude so negatives are not all treated as "sub-cent".
  const sign = costUsd < 0 ? '-' : '';
  const magnitude = Math.abs(costUsd);

  if (magnitude < 0.01) {
    return `${sign}${(magnitude * 100).toFixed(4)} cents`;
  }
  if (magnitude < 1) {
    return `${sign}$${magnitude.toFixed(4)}`;
  }
  return `${sign}$${magnitude.toFixed(2)}`;
}
441
+
442
/**
 * Enumerate every entry of MODEL_PRICING as `{ model, pricing }` pairs,
 * in the table's own key order.
 */
export function listKnownModels(): Array<{ model: string; pricing: ModelPricing }> {
  const known: Array<{ model: string; pricing: ModelPricing }> = [];
  for (const [model, pricing] of Object.entries(MODEL_PRICING)) {
    known.push({ model, pricing });
  }
  return known;
}
@@ -0,0 +1,172 @@
1
+ /**
2
+ * Tests for CombinedEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { CombinedEvaluator } from './combined';
7
+
8
describe('CombinedEvaluator', () => {
  // Types are derived from the evaluator's own signature so the suite needs no extra imports.
  type CombinedInput = Extract<
    Parameters<CombinedEvaluator['evaluate']>[1],
    { type: 'combined' }
  >;
  type SubExpectations = CombinedInput['expectations'];
  type SubExpectation = SubExpectations[number];

  const evaluator = new CombinedEvaluator();

  /** A fresh `contains` expectation that requires every listed value. */
  const contains = (...values: string[]): SubExpectation => ({
    type: 'contains',
    values,
    mode: 'all',
  });

  /** Run the combined evaluator over `response` with the given operator and sub-expectations. */
  const run = (
    response: string,
    operator: CombinedInput['operator'],
    expectations: SubExpectations
  ) => evaluator.evaluate(response, { type: 'combined', operator, expectations });

  // Responses shared by the AND / OR suites
  const BOTH_PRESENT = 'The answer is 42 and it is correct.';
  const ONLY_42 = 'The answer is 42.';
  const NEITHER = 'Nothing here.';

  describe('AND operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score, reason } = await run(BOTH_PRESENT, 'and', [
        contains('42'),
        contains('correct'),
      ]);
      expect(passed).toBe(true);
      expect(score).toBe(1);
      expect(reason).toContain('All 2 expectations passed');
    });

    test('fails when one expectation fails', async () => {
      const { passed, score, reason } = await run(ONLY_42, 'and', [
        contains('42'),
        contains('correct'),
      ]);
      expect(passed).toBe(false);
      expect(score).toBe(0.5);
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score } = await run(NEITHER, 'and', [contains('42'), contains('correct')]);
      expect(passed).toBe(false);
      expect(score).toBe(0);
    });
  });

  describe('OR operator', () => {
    test('passes when all expectations pass', async () => {
      const { passed, score } = await run(BOTH_PRESENT, 'or', [
        contains('42'),
        contains('correct'),
      ]);
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('passes when one expectation passes', async () => {
      const { passed, score, reason } = await run(ONLY_42, 'or', [
        contains('42'),
        contains('correct'),
      ]);
      expect(passed).toBe(true);
      expect(score).toBe(1); // OR reports the best sub-score
      expect(reason).toContain('1/2 expectations passed');
    });

    test('fails when all expectations fail', async () => {
      const { passed, score, reason } = await run(NEITHER, 'or', [
        contains('42'),
        contains('correct'),
      ]);
      expect(passed).toBe(false);
      expect(score).toBe(0);
      expect(reason).toContain('No expectations passed');
    });
  });

  describe('mixed expectation types', () => {
    test('combines contains and regex expectations', async () => {
      const { passed } = await run('The result is 123-456-7890.', 'and', [
        contains('result'),
        { type: 'regex', pattern: '\\d{3}-\\d{3}-\\d{4}' },
      ]);
      expect(passed).toBe(true);
    });

    test('combines exact and contains expectations', async () => {
      const { passed } = await run('Hello World', 'or', [
        { type: 'exact', value: 'Goodbye World', caseSensitive: true },
        contains('Hello'),
      ]);
      expect(passed).toBe(true);
    });

    test('combines not_contains with contains', async () => {
      const { passed } = await run('The answer is correct.', 'and', [
        contains('correct'),
        { type: 'not_contains', values: ['error', 'wrong'], mode: 'all' },
      ]);
      expect(passed).toBe(true);
    });
  });

  describe('edge cases', () => {
    test('handles empty expectations array', async () => {
      const { passed, score } = await run('Any text', 'and', []);
      expect(passed).toBe(true);
      expect(score).toBe(1);
    });

    test('handles single expectation', async () => {
      const { passed } = await run('Hello World', 'and', [contains('Hello')]);
      expect(passed).toBe(true);
    });

    test('provides detailed results', async () => {
      const { details } = await run(ONLY_42, 'and', [contains('42'), contains('correct')]);

      expect(details).toBeDefined();
      expect(details?.operator).toBe('and');
      expect(details?.results).toHaveLength(2);
      expect(details?.passedCount).toBe(1);
      expect(details?.totalCount).toBe(2);
    });
  });
});
@@ -0,0 +1,95 @@
1
+ /**
2
+ * Combined evaluator - evaluates multiple expectations with and/or logic
3
+ */
4
+
5
+ import type { Expected } from '../scenario/schema';
6
+ import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
7
+
8
/**
 * Resolve the evaluator registered for `type`.
 *
 * The registry module is loaded lazily at call time because a static import
 * of index.ts from here would form a circular dependency.
 */
async function getEvaluatorForType(type: string): Promise<Evaluator> {
  const registry = await import('./index.js');
  return registry.getEvaluator(type);
}
16
+
17
/**
 * Evaluates several sub-expectations against one response and combines the
 * outcomes with AND / OR logic.
 *
 * - `and`: passes only if every sub-expectation passes; score is the mean of the sub-scores.
 * - any other operator (i.e. `or`): passes if at least one sub-expectation passes;
 *   score is the highest sub-score.
 *
 * `details` always carries `operator`, `results`, `passedCount` and `totalCount`,
 * including when there is nothing to evaluate.
 */
export class CombinedEvaluator implements Evaluator {
  readonly type = 'combined';

  /**
   * @param response Text being evaluated
   * @param expected A `combined` expectation holding the operator and sub-expectations
   * @param context Optional context forwarded unchanged to each sub-evaluator
   * @returns The combined pass/fail verdict, score, human-readable reason and per-expectation details
   * @throws Error if `expected.type` is not `'combined'`
   */
  async evaluate(
    response: string,
    expected: Expected,
    context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'combined') {
      throw new Error('Invalid expected type for CombinedEvaluator');
    }

    const { operator, expectations } = expected;

    // Nothing to check: vacuously passes. The counts are reported here too so
    // that `details` has the same shape as on the regular path below.
    if (!expectations || expectations.length === 0) {
      return {
        passed: true,
        score: 1,
        reason: 'No expectations to evaluate',
        details: { operator, results: [], passedCount: 0, totalCount: 0 },
      };
    }

    // Evaluate the sub-expectations one at a time, in declaration order.
    const results: Array<{
      type: string;
      passed: boolean;
      score: number;
      reason?: string;
    }> = [];

    for (const subExpected of expectations) {
      const evaluator = await getEvaluatorForType(subExpected.type);
      const result = await evaluator.evaluate(response, subExpected, context);
      results.push({
        type: subExpected.type,
        passed: result.passed,
        score: result.score,
        reason: result.reason,
      });
    }

    const totalCount = results.length;
    const passedCount = results.filter((r) => r.passed).length;

    let passed: boolean;
    let score: number;
    let reason: string;

    if (operator === 'and') {
      // AND: all must pass; score is the average of the sub-scores
      passed = passedCount === totalCount;
      score = results.reduce((sum, r) => sum + r.score, 0) / totalCount;
      reason = passed
        ? `All ${totalCount} expectations passed`
        : `${passedCount}/${totalCount} expectations passed (all required)`;
    } else {
      // OR: at least one must pass; score is the best sub-score
      passed = passedCount > 0;
      score = Math.max(...results.map((r) => r.score));
      reason = passed
        ? `${passedCount}/${totalCount} expectations passed (at least one required)`
        : 'No expectations passed (at least one required)';
    }

    return {
      passed,
      score,
      reason,
      details: {
        operator,
        results,
        passedCount,
        totalCount,
      },
    };
  }
}