@artemiskit/core 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +164 -0
- package/adapters/openai/dist/index.js +5626 -0
- package/dist/adapters/registry.d.ts.map +1 -1
- package/dist/adapters/types.d.ts +32 -2
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +12 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +762 -63
- package/dist/scenario/schema.d.ts +116 -84
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/supabase.d.ts +25 -4
- package/dist/storage/supabase.d.ts.map +1 -1
- package/dist/storage/types.d.ts +162 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/dist/validator/index.d.ts +6 -0
- package/dist/validator/index.d.ts.map +1 -0
- package/dist/validator/types.d.ts +58 -0
- package/dist/validator/types.d.ts.map +1 -0
- package/dist/validator/validator.d.ts +55 -0
- package/dist/validator/validator.d.ts.map +1 -0
- package/package.json +1 -1
- package/src/adapters/registry.ts +38 -0
- package/src/adapters/types.ts +38 -0
- package/src/artifacts/types.ts +16 -0
- package/src/index.ts +3 -0
- package/src/scenario/schema.ts +10 -0
- package/src/storage/supabase.test.ts +988 -0
- package/src/storage/supabase.ts +599 -5
- package/src/storage/types.ts +196 -0
- package/src/validator/index.ts +6 -0
- package/src/validator/types.ts +62 -0
- package/src/validator/validator.ts +345 -0
|
@@ -0,0 +1,988 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for Supabase storage adapter with analytics capabilities
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { afterEach, beforeEach, describe, expect, it, mock, spyOn } from 'bun:test';
|
|
6
|
+
import type { CaseResult, RunManifest, RunMetrics } from '../artifacts/types';
|
|
7
|
+
import { SupabaseStorageAdapter, type SupabaseStorageConfig } from './supabase';
|
|
8
|
+
import type { CaseResultRecord, MetricsSnapshot } from './types';
|
|
9
|
+
|
|
10
|
+
// Mock Supabase client

/**
 * Stand-in for `client.from(table)`.
 *
 * Declared with the `table` parameter so that table-aware stubs installed later
 * via `mockImplementation((table: string) => ...)` are assignable to the mock's
 * function type. The default implementation ignores the argument and returns
 * an empty object.
 */
const mockFrom = mock((_table: string): Record<string, unknown> => ({}));

/**
 * Stand-in for `client.storage.from(bucket)`. Individual tests install only the
 * bucket methods they need (`upload`, `download`, `remove`).
 */
const mockStorage = mock((_bucket: string): Record<string, unknown> => ({}));

/** Minimal client object exposing just the two entry points stubbed above. */
const mockClient = {
  from: mockFrom,
  storage: {
    from: mockStorage,
  },
};

// Mock createClient: every call to `createClient` from '@supabase/supabase-js'
// returns the shared `mockClient`.
mock.module('@supabase/supabase-js', () => ({
  createClient: () => mockClient,
}));
|
|
24
|
+
|
|
25
|
+
/**
 * Builds a `RunManifest` fixture for run `test-run-123` in `test-project`.
 *
 * @param overrides - Top-level fields that replace the defaults (shallow merge;
 *   a nested object such as `metrics` is replaced as a whole, not merged).
 * @returns The default manifest with `overrides` applied on top.
 */
function createMockManifest(overrides: Partial<RunManifest> = {}): RunManifest {
  // Two passing cases and one failing case.
  const caseOutcomes: Array<[string, boolean]> = [
    ['case-1', true],
    ['case-2', true],
    ['case-3', false],
  ];

  const defaults: RunManifest = {
    version: '1.0.0',
    run_id: 'test-run-123',
    project: 'test-project',
    start_time: '2026-02-19T10:00:00.000Z',
    end_time: '2026-02-19T10:05:00.000Z',
    duration_ms: 300000,
    config: {
      scenario: 'test-scenario',
      provider: 'openai',
      model: 'gpt-4',
    },
    metrics: {
      success_rate: 0.9,
      total_cases: 10,
      passed_cases: 9,
      failed_cases: 1,
      median_latency_ms: 150,
      p95_latency_ms: 300,
      total_tokens: 5000,
      total_prompt_tokens: 3000,
      total_completion_tokens: 2000,
    },
    git: {
      commit: 'abc123',
      branch: 'main',
      dirty: false,
    },
    provenance: {
      run_by: 'test-user',
      run_reason: 'CI',
    },
    cases: caseOutcomes.map(([caseId, passed]) => createMockCaseResult(caseId, passed)),
    environment: {
      node_version: '20.0.0',
      platform: 'linux',
      arch: 'x64',
    },
  };

  return { ...defaults, ...overrides };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Builds a `CaseResult` fixture.
 *
 * @param id - Case identifier; also embedded in the generated `name`.
 * @param ok - Whether the case passed. Drives `score` (1 or 0) and `reason`.
 * @returns A case result with fixed latency, token counts, prompt and response.
 */
function createMockCaseResult(id: string, ok: boolean): CaseResult {
  const score = ok ? 1.0 : 0.0;
  const reason = ok ? 'Matched' : 'Did not match';
  const tokens = { prompt: 100, completion: 50, total: 150 };

  return {
    id,
    name: `Test Case ${id}`,
    ok,
    score,
    matcherType: 'contains',
    reason,
    latencyMs: 150,
    tokens,
    prompt: 'Test prompt',
    response: 'Test response',
    expected: { contains: 'expected' },
    tags: ['unit-test'],
  };
}
|
|
94
|
+
|
|
95
|
+
/**
 * Builds a `CaseResultRecord` fixture describing a passed case of run
 * `test-run-123`.
 *
 * @param overrides - Fields that replace the defaults (shallow merge).
 * @returns The default record with `overrides` applied on top.
 */
function createMockCaseResultRecord(overrides: Partial<CaseResultRecord> = {}): CaseResultRecord {
  const defaults: CaseResultRecord = {
    runId: 'test-run-123',
    caseId: 'case-1',
    caseName: 'Test Case 1',
    status: 'passed',
    score: 1.0,
    matcherType: 'contains',
    reason: 'Matched',
    response: 'Test response',
    latencyMs: 150,
    promptTokens: 100,
    completionTokens: 50,
    totalTokens: 150,
    tags: ['unit-test'],
  };

  return { ...defaults, ...overrides };
}
|
|
114
|
+
|
|
115
|
+
/**
 * Builds a `MetricsSnapshot` fixture for `test-project` / `test-scenario`
 * dated 2026-02-19.
 *
 * @param overrides - Fields that replace the defaults (shallow merge).
 * @returns The default snapshot with `overrides` applied on top.
 */
function createMockMetricsSnapshot(overrides: Partial<MetricsSnapshot> = {}): MetricsSnapshot {
  const defaults: MetricsSnapshot = {
    date: '2026-02-19',
    project: 'test-project',
    scenario: 'test-scenario',
    totalRuns: 10,
    totalCases: 100,
    passedCases: 90,
    failedCases: 10,
    avgSuccessRate: 0.9,
    avgLatencyMs: 150,
    avgTokensPerRun: 500,
    minSuccessRate: 0.8,
    maxSuccessRate: 1.0,
    minLatencyMs: 100,
    maxLatencyMs: 200,
    totalTokens: 5000,
  };

  return { ...defaults, ...overrides };
}
|
|
136
|
+
|
|
137
|
+
describe('SupabaseStorageAdapter', () => {
|
|
138
|
+
/** Adapter under test; re-created in `beforeEach`. */
let adapter: SupabaseStorageAdapter;

/** Connection settings handed to every adapter built in `beforeEach`. */
const config: SupabaseStorageConfig = {
  url: 'https://test.supabase.co',
  anonKey: 'test-key',
  bucket: 'test-bucket',
};

beforeEach(() => {
  // Drop stubbed implementations and recorded calls from the previous test.
  for (const clientMock of [mockFrom, mockStorage]) {
    clientMock.mockReset();
  }
  adapter = new SupabaseStorageAdapter(config, 'test-project');
});
|
|
151
|
+
|
|
152
|
+
// Construction smoke tests: only checks that an instance is produced.
describe('constructor', () => {
  it('should create adapter with config', () => {
    expect(adapter).toBeDefined();
  });

  it('should use default bucket if not provided', () => {
    // No `bucket` and no project argument.
    const minimalConfig = {
      url: 'https://test.supabase.co',
      anonKey: 'test-key',
    };
    const adapterWithDefaults = new SupabaseStorageAdapter(minimalConfig);

    expect(adapterWithDefaults).toBeDefined();
  });
});
|
|
165
|
+
|
|
166
|
+
// save(): happy path plus the two failure points (storage upload, DB upsert).
describe('save', () => {
  it('should save manifest to storage and database', async () => {
    // Empty cases to simplify test
    const manifest = createMockManifest({ cases: [] });

    // Storage upload succeeds
    const mockUpload = mock(async () => ({ error: null }));
    mockStorage.mockReturnValue({ upload: mockUpload });

    // Database upsert for runs succeeds
    const mockUpsert = mock(async () => ({ error: null }));
    mockFrom.mockReturnValue({ upsert: mockUpsert });

    const savedPath = await adapter.save(manifest);

    expect(savedPath).toBe('test-project/test-run-123.json');
  });

  it('should throw error on storage upload failure', async () => {
    const manifest = createMockManifest();
    const uploadFailure = { error: { message: 'Upload failed' } };

    mockStorage.mockReturnValue({
      upload: mock(async () => uploadFailure),
    });

    await expect(adapter.save(manifest)).rejects.toThrow('Failed to upload manifest');
  });

  it('should throw error on database save failure', async () => {
    const manifest = createMockManifest();
    const dbFailure = { error: { message: 'DB error' } };

    // Upload works, the subsequent upsert reports an error.
    mockStorage.mockReturnValue({
      upload: mock(async () => ({ error: null })),
    });
    mockFrom.mockReturnValue({
      upsert: mock(async () => dbFailure),
    });

    await expect(adapter.save(manifest)).rejects.toThrow('Failed to save run metadata');
  });
});
|
|
210
|
+
|
|
211
|
+
// load(): DB row lookup (select → eq → single) followed by a storage download.
describe('load', () => {
  it('should load manifest by run ID', async () => {
    const manifest = createMockManifest();
    const runRow = { manifest_path: 'test-project/test-run-123.json' };

    // Database query returns the row holding the manifest path
    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({
          single: mock(async () => ({ data: runRow, error: null })),
        })),
      })),
    });

    // Storage download returns the serialized manifest
    mockStorage.mockReturnValue({
      download: mock(async () => ({
        data: new Blob([JSON.stringify(manifest)]),
        error: null,
      })),
    });

    const loaded = await adapter.load('test-run-123');

    expect(loaded.run_id).toBe('test-run-123');
    expect(loaded.project).toBe('test-project');
  });

  it('should throw error if run not found', async () => {
    const notFound = { data: null, error: { message: 'Not found' } };

    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({
          single: mock(async () => notFound),
        })),
      })),
    });

    await expect(adapter.load('non-existent')).rejects.toThrow('Run not found');
  });
});
|
|
257
|
+
|
|
258
|
+
// list(): default listing and the project/limit filter chain.
describe('list', () => {
  it('should list runs with default options', async () => {
    const runRows = [
      {
        run_id: 'run-1',
        scenario: 'scenario-1',
        success_rate: 0.9,
        started_at: '2026-02-19T10:00:00Z',
      },
      {
        run_id: 'run-2',
        scenario: 'scenario-2',
        success_rate: 0.8,
        started_at: '2026-02-19T09:00:00Z',
      },
    ];

    // Without filters the chain ends at order().
    mockFrom.mockReturnValue({
      select: mock(() => ({
        order: mock(async () => ({ data: runRows, error: null })),
      })),
    });

    const runs = await adapter.list();

    expect(runs).toHaveLength(2);
    expect(runs[0].runId).toBe('run-1');
    expect(runs[1].runId).toBe('run-2');
  });

  it('should filter by project', async () => {
    // Chain: select → order → eq → limit
    const mockLimit = mock(async () => ({ data: [], error: null }));
    const mockEq = mock(() => ({ limit: mockLimit }));
    const mockOrder = mock(() => ({ eq: mockEq }));
    const mockSelect = mock(() => ({ order: mockOrder }));

    mockFrom.mockReturnValue({ select: mockSelect });

    await adapter.list({ project: 'test-project', limit: 10 });

    expect(mockEq).toHaveBeenCalled();
  });
});
|
|
310
|
+
|
|
311
|
+
// delete(): the table stub offers both the row lookup and the delete chain.
describe('delete', () => {
  it('should delete run from storage and database', async () => {
    const runRow = { manifest_path: 'test-project/test-run-123.json' };

    const mockDelete = mock(() => ({
      eq: mock(async () => ({ error: null })),
    }));
    const mockLookup = mock(() => ({
      eq: mock(() => ({
        single: mock(async () => ({ data: runRow, error: null })),
      })),
    }));

    mockFrom.mockReturnValue({
      select: mockLookup,
      delete: mockDelete,
    });
    mockStorage.mockReturnValue({
      remove: mock(async () => ({ error: null })),
    });

    await adapter.delete('test-run-123');

    expect(mockDelete).toHaveBeenCalled();
  });
});
|
|
340
|
+
|
|
341
|
+
// compare(): loads two manifests (DB path lookup + storage download each) and
// returns a comparison exposing `baseline`, `current` and `delta`.
describe('compare', () => {
  it('should compare two runs', async () => {
    const baseline = createMockManifest({
      run_id: 'baseline-run',
      metrics: { ...createMockManifest().metrics, success_rate: 0.8 },
    });
    const current = createMockManifest({
      run_id: 'current-run',
      metrics: { ...createMockManifest().metrics, success_rate: 0.9 },
    });

    const manifestPathFor = (manifest: RunManifest): string =>
      `test-project/${manifest.run_id}.json`;

    // Path lookups: the first lookup answers for the baseline run, every later
    // one for the current run.
    let lookupCount = 0;
    mockFrom.mockImplementation(() => ({
      select: mock(() => ({
        eq: mock(() => ({
          single: mock(() => {
            const manifest = lookupCount === 0 ? baseline : current;
            lookupCount++;
            return Promise.resolve({
              data: { manifest_path: manifestPathFor(manifest) },
              error: null,
            });
          }),
        })),
      })),
    }));

    // Downloads: choose the manifest from the requested storage path. The
    // previous lookup-counter check could never select `current` (the counter
    // is at most 2 when downloads happen), so both downloads returned the
    // baseline. Any unrecognised path falls back to the baseline.
    mockStorage.mockImplementation(() => ({
      download: mock((path?: string) => {
        const manifest = path === manifestPathFor(current) ? current : baseline;
        return Promise.resolve({
          data: new Blob([JSON.stringify(manifest)]),
          error: null,
        });
      }),
    }));

    const result = await adapter.compare('baseline-run', 'current-run');

    expect(result.delta).toBeDefined();
    expect(result.baseline).toBeDefined();
    expect(result.current).toBeDefined();
  });
});
|
|
386
|
+
|
|
387
|
+
// setBaseline(): reads the run row from `runs`, then upserts into `baselines`.
describe('setBaseline', () => {
  it('should set baseline for a scenario', async () => {
    const mockRun = {
      project: 'test-project',
      success_rate: 0.9,
      median_latency_ms: 150,
      total_tokens: 5000,
      passed_cases: 9,
      failed_cases: 1,
      total_cases: 10,
      run_by: 'test-user',
    };

    // Route each table name to the stub it needs; unknown tables get `{}`.
    mockFrom.mockImplementation((table: string) => {
      switch (table) {
        case 'runs':
          return {
            select: mock(() => ({
              eq: mock(() => ({
                single: mock(async () => ({ data: mockRun, error: null })),
              })),
            })),
          };
        case 'baselines':
          return {
            upsert: mock(async () => ({ error: null })),
          };
        default:
          return {};
      }
    });

    const baseline = await adapter.setBaseline('test-scenario', 'test-run-123', 'v1.0');

    expect(baseline.scenario).toBe('test-scenario');
    expect(baseline.runId).toBe('test-run-123');
    expect(baseline.tag).toBe('v1.0');
    expect(baseline.metrics.successRate).toBe(0.9);
  });

  it('should throw error if run not found', async () => {
    const notFound = { data: null, error: { message: 'Not found' } };

    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({
          single: mock(async () => notFound),
        })),
      })),
    });

    const attempt = adapter.setBaseline('test-scenario', 'non-existent');

    await expect(attempt).rejects.toThrow('Run not found');
  });
});
|
|
440
|
+
|
|
441
|
+
// getBaseline(): lookup chain is select → eq → eq → single.
describe('getBaseline', () => {
  // Installs a table stub whose chain resolves to `response`.
  const stubBaselineLookup = (response: { data: unknown; error: unknown }) => {
    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({
          eq: mock(() => ({
            single: mock(async () => response),
          })),
        })),
      })),
    });
  };

  it('should get baseline by scenario', async () => {
    const mockBaseline = {
      scenario: 'test-scenario',
      run_id: 'baseline-run',
      created_at: '2026-02-19T10:00:00Z',
      success_rate: 0.9,
      median_latency_ms: 150,
      total_tokens: 5000,
      passed_cases: 9,
      failed_cases: 1,
      total_cases: 10,
      tag: 'v1.0',
    };
    stubBaselineLookup({ data: mockBaseline, error: null });

    const found = await adapter.getBaseline('test-scenario');

    expect(found).not.toBeNull();
    expect(found?.scenario).toBe('test-scenario');
    expect(found?.runId).toBe('baseline-run');
  });

  it('should return null if baseline not found', async () => {
    stubBaselineLookup({ data: null, error: null });

    const found = await adapter.getBaseline('non-existent');

    expect(found).toBeNull();
  });
});
|
|
489
|
+
|
|
490
|
+
// listBaselines(): chain is select → eq → order.
describe('listBaselines', () => {
  it('should list all baselines for project', async () => {
    const mockBaselines = [
      {
        scenario: 'scenario-1',
        run_id: 'run-1',
        created_at: '2026-02-19T10:00:00Z',
        success_rate: 0.9,
        median_latency_ms: 150,
        total_tokens: 5000,
        passed_cases: 9,
        failed_cases: 1,
        total_cases: 10,
      },
      {
        scenario: 'scenario-2',
        run_id: 'run-2',
        created_at: '2026-02-18T10:00:00Z',
        success_rate: 0.85,
        median_latency_ms: 200,
        total_tokens: 4000,
        passed_cases: 8,
        failed_cases: 2,
        total_cases: 10,
      },
    ];

    const mockOrder = mock(async () => ({ data: mockBaselines, error: null }));
    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({ order: mockOrder })),
      })),
    });

    const baselines = await adapter.listBaselines();

    expect(baselines).toHaveLength(2);
    expect(baselines[0].scenario).toBe('scenario-1');
    expect(baselines[1].scenario).toBe('scenario-2');
  });
});
|
|
532
|
+
|
|
533
|
+
// removeBaseline(): the delete chain (delete → eq → eq) reports an affected-row
// `count`; the tests expect `true` for count 1 and `false` for count 0.
describe('removeBaseline', () => {
  // Installs a delete chain that resolves with the given row count.
  const stubDeleteCount = (count: number) => {
    mockFrom.mockReturnValue({
      delete: mock(() => ({
        eq: mock(() => ({
          eq: mock(async () => ({ error: null, count })),
        })),
      })),
    });
  };

  it('should remove baseline by scenario', async () => {
    stubDeleteCount(1);

    const removed = await adapter.removeBaseline('test-scenario');

    expect(removed).toBe(true);
  });

  it('should return false if baseline not found', async () => {
    stubDeleteCount(0);

    const removed = await adapter.removeBaseline('non-existent');

    expect(removed).toBe(false);
  });
});
|
|
562
|
+
|
|
563
|
+
// saveCaseResult(): chain is upsert → select → single; resolves to the row id.
describe('saveCaseResult', () => {
  // Installs an upsert chain that resolves to `response`.
  const stubUpsert = (response: { data: unknown; error: unknown }) => {
    mockFrom.mockReturnValue({
      upsert: mock(() => ({
        select: mock(() => ({
          single: mock(async () => response),
        })),
      })),
    });
  };

  it('should save single case result', async () => {
    const caseResult = createMockCaseResultRecord();
    stubUpsert({ data: { id: 'uuid-123' }, error: null });

    const savedId = await adapter.saveCaseResult(caseResult);

    expect(savedId).toBe('uuid-123');
  });

  it('should throw error on save failure', async () => {
    const caseResult = createMockCaseResultRecord();
    stubUpsert({ data: null, error: { message: 'Save failed' } });

    const attempt = adapter.saveCaseResult(caseResult);

    await expect(attempt).rejects.toThrow('Failed to save case result');
  });
});
|
|
596
|
+
|
|
597
|
+
// saveCaseResults(): batch variant; chain is upsert → select.
describe('saveCaseResults', () => {
  it('should save multiple case results', async () => {
    const caseResults = ['case-1', 'case-2'].map((caseId) =>
      createMockCaseResultRecord({ caseId })
    );
    const insertedRows = [{ id: 'uuid-1' }, { id: 'uuid-2' }];

    mockFrom.mockReturnValue({
      upsert: mock(() => ({
        select: mock(async () => ({ data: insertedRows, error: null })),
      })),
    });

    const savedIds = await adapter.saveCaseResults(caseResults);

    expect(savedIds).toHaveLength(2);
  });

  it('should return empty array for empty input', async () => {
    // No table stub is installed for this case.
    const savedIds = await adapter.saveCaseResults([]);

    expect(savedIds).toEqual([]);
  });
});
|
|
626
|
+
|
|
627
|
+
// getCaseResults(): chain is select → eq → order; snake_case rows are expected
// back as camelCase records.
describe('getCaseResults', () => {
  it('should get case results for a run', async () => {
    const mockResults = [
      {
        id: 'uuid-1',
        run_id: 'test-run-123',
        case_id: 'case-1',
        case_name: 'Test Case 1',
        status: 'passed',
        score: 1.0,
        matcher_type: 'contains',
        reason: 'Matched',
        response: 'Test response',
        latency_ms: 150,
        prompt_tokens: 100,
        completion_tokens: 50,
        total_tokens: 150,
        tags: ['unit-test'],
        created_at: '2026-02-19T10:00:00Z',
      },
    ];

    const mockOrder = mock(async () => ({ data: mockResults, error: null }));
    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({ order: mockOrder })),
      })),
    });

    const records = await adapter.getCaseResults('test-run-123');

    expect(records).toHaveLength(1);
    expect(records[0].caseId).toBe('case-1');
    expect(records[0].status).toBe('passed');
  });
});
|
|
664
|
+
|
|
665
|
+
// queryCaseResults(): filtered query; the stubbed chain is
// select → order → eq → eq → overlaps → limit.
describe('queryCaseResults', () => {
  it('should query case results with filters', async () => {
    const mockResults = [
      {
        id: 'uuid-1',
        run_id: 'test-run-123',
        case_id: 'case-1',
        status: 'failed',
        score: 0,
        matcher_type: 'contains',
        response: 'Test response',
        latency_ms: 150,
        prompt_tokens: 100,
        completion_tokens: 50,
        total_tokens: 150,
        tags: ['regression'],
        created_at: '2026-02-19T10:00:00Z',
      },
    ];

    // Assembled from the last link back to the first.
    const mockLimit = mock(async () => ({ data: mockResults, error: null }));
    const mockOverlaps = mock(() => ({ limit: mockLimit }));
    const mockSecondEq = mock(() => ({ overlaps: mockOverlaps }));
    const mockFirstEq = mock(() => ({ eq: mockSecondEq }));
    const mockOrder = mock(() => ({ eq: mockFirstEq }));

    mockFrom.mockReturnValue({
      select: mock(() => ({ order: mockOrder })),
    });

    const filters = {
      runId: 'test-run-123',
      status: 'failed',
      tags: ['regression'],
      limit: 10,
    } as const;
    const records = await adapter.queryCaseResults({
      runId: filters.runId,
      status: filters.status,
      tags: [...filters.tags],
      limit: filters.limit,
    });

    expect(records).toHaveLength(1);
    expect(records[0].status).toBe('failed');
  });
});
|
|
710
|
+
|
|
711
|
+
// saveMetricsSnapshot(): chain is upsert → select → single; resolves to the id.
describe('saveMetricsSnapshot', () => {
  it('should save metrics snapshot', async () => {
    const snapshot = createMockMetricsSnapshot();

    const mockSingle = mock(async () => ({ data: { id: 'uuid-123' }, error: null }));
    mockFrom.mockReturnValue({
      upsert: mock(() => ({
        select: mock(() => ({ single: mockSingle })),
      })),
    });

    const savedId = await adapter.saveMetricsSnapshot(snapshot);

    expect(savedId).toBe('uuid-123');
  });
});
|
|
728
|
+
|
|
729
|
+
// getMetricsTrend(): date-bounded trend query; the stubbed chain is
// select → eq → order → is → gte → lte → limit.
describe('getMetricsTrend', () => {
  it('should get metrics trend for a project', async () => {
    const mockTrend = [
      {
        date: '2026-02-17',
        avg_success_rate: 0.85,
        avg_latency_ms: 160,
        total_runs: 8,
        total_tokens: 4000,
      },
      {
        date: '2026-02-18',
        avg_success_rate: 0.88,
        avg_latency_ms: 155,
        total_runs: 9,
        total_tokens: 4500,
      },
      {
        date: '2026-02-19',
        avg_success_rate: 0.9,
        avg_latency_ms: 150,
        total_runs: 10,
        total_tokens: 5000,
      },
    ];

    mockFrom.mockReturnValue({
      select: mock(() => ({
        eq: mock(() => ({
          order: mock(() => ({
            is: mock(() => ({
              gte: mock(() => ({
                lte: mock(() => ({
                  limit: mock(async () => ({ data: mockTrend, error: null })),
                })),
              })),
            })),
          })),
        })),
      })),
    });

    const trend = await adapter.getMetricsTrend({
      project: 'test-project',
      startDate: '2026-02-17',
      endDate: '2026-02-19',
      limit: 30,
    });

    expect(trend).toHaveLength(3);
    expect(trend[0].date).toBe('2026-02-17');
    expect(trend[2].successRate).toBe(0.9);
  });
});
|
|
781
|
+
|
|
782
|
+
// getMetricsSnapshot(): with a scenario the stubbed chain is
// select → eq → eq → eq → single; without one the third link is `is`.
describe('getMetricsSnapshot', () => {
  it('should get specific metrics snapshot', async () => {
    const mockSnapshot = {
      id: 'uuid-123',
      date: '2026-02-19',
      project: 'test-project',
      scenario: 'test-scenario',
      total_runs: 10,
      total_cases: 100,
      passed_cases: 90,
      failed_cases: 10,
      avg_success_rate: 0.9,
      avg_latency_ms: 150,
      avg_tokens_per_run: 500,
      min_success_rate: 0.8,
      max_success_rate: 1.0,
      min_latency_ms: 100,
      max_latency_ms: 200,
      total_tokens: 5000,
      created_at: '2026-02-19T10:00:00Z',
      updated_at: '2026-02-19T10:00:00Z',
    };

    const mockSingle = mock(async () => ({ data: mockSnapshot, error: null }));
    const mockThirdEq = mock(() => ({ single: mockSingle }));
    const mockSecondEq = mock(() => ({ eq: mockThirdEq }));
    const mockFirstEq = mock(() => ({ eq: mockSecondEq }));

    mockFrom.mockReturnValue({
      select: mock(() => ({ eq: mockFirstEq })),
    });

    const snapshot = await adapter.getMetricsSnapshot(
      '2026-02-19',
      'test-project',
      'test-scenario'
    );

    expect(snapshot).not.toBeNull();
    expect(snapshot?.date).toBe('2026-02-19');
    expect(snapshot?.avgSuccessRate).toBe(0.9);
  });

  it('should return null if snapshot not found', async () => {
    const mockSingle = mock(async () => ({ data: null, error: null }));
    const mockIs = mock(() => ({ single: mockSingle }));
    const mockSecondEq = mock(() => ({ is: mockIs }));
    const mockFirstEq = mock(() => ({ eq: mockSecondEq }));

    mockFrom.mockReturnValue({
      select: mock(() => ({ eq: mockFirstEq })),
    });

    const snapshot = await adapter.getMetricsSnapshot('2026-01-01', 'test-project');

    expect(snapshot).toBeNull();
  });
});
|
|
846
|
+
|
|
847
|
+
// aggregateDailyMetrics(): reads the day's rows from `runs`
// (select → eq → gte → lte) and upserts into `metrics_history`.
describe('aggregateDailyMetrics', () => {
  // Routes `runs` to a query yielding `runRows` and `metrics_history` to a
  // successful upsert; any other table gets an empty object.
  const stubTables = (runRows: Array<Record<string, number>>) => {
    mockFrom.mockImplementation((table: string) => {
      switch (table) {
        case 'runs':
          return {
            select: mock(() => ({
              eq: mock(() => ({
                gte: mock(() => ({
                  lte: mock(async () => ({ data: runRows, error: null })),
                })),
              })),
            })),
          };
        case 'metrics_history':
          return {
            upsert: mock(() => ({
              select: mock(() => ({
                single: mock(async () => ({ data: { id: 'uuid-123' }, error: null })),
              })),
            })),
          };
        default:
          return {};
      }
    });
  };

  it('should aggregate metrics from runs', async () => {
    const mockRuns = [
      {
        success_rate: 0.9,
        total_cases: 10,
        passed_cases: 9,
        failed_cases: 1,
        median_latency_ms: 150,
        total_tokens: 500,
      },
      {
        success_rate: 0.8,
        total_cases: 10,
        passed_cases: 8,
        failed_cases: 2,
        median_latency_ms: 200,
        total_tokens: 600,
      },
    ];
    stubTables(mockRuns);

    const snapshot = await adapter.aggregateDailyMetrics('2026-02-19', 'test-project');

    expect(snapshot.totalRuns).toBe(2);
    expect(snapshot.totalCases).toBe(20);
    expect(snapshot.avgSuccessRate).toBeCloseTo(0.85, 5);
    expect(snapshot.avgLatencyMs).toBe(175);
  });

  it('should return empty snapshot if no runs', async () => {
    stubTables([]);

    const snapshot = await adapter.aggregateDailyMetrics('2026-02-19', 'test-project');

    expect(snapshot.totalRuns).toBe(0);
    expect(snapshot.avgSuccessRate).toBe(0);
  });
});
|
|
931
|
+
|
|
932
|
+
// compareToBaseline(): only the method's presence is verified today. The
// behavioural test needs stubs for several DB calls and is tracked as a todo,
// so test reports no longer suggest regression detection is covered.
describe('compareToBaseline', () => {
  it.todo('should compare run to baseline and detect regression');

  it('should expose compareToBaseline as a function', () => {
    expect(typeof adapter.compareToBaseline).toBe('function');
  });
});
|
|
939
|
+
});
|
|
940
|
+
|
|
941
|
+
// ============================================================================
|
|
942
|
+
// Integration-style tests (with type checking)
|
|
943
|
+
// ============================================================================
|
|
944
|
+
|
|
945
|
+
// Type-level checks: each annotated literal must satisfy its interface using
// only the fields listed here; the runtime assertions are sanity checks.
describe('Type Safety', () => {
  it('should have correct CaseResultRecord interface', () => {
    const minimalRecord: CaseResultRecord = {
      runId: 'run-123',
      caseId: 'case-1',
      status: 'passed',
      score: 1.0,
      matcherType: 'contains',
      response: 'test',
      latencyMs: 100,
      promptTokens: 50,
      completionTokens: 25,
      totalTokens: 75,
    };

    expect(minimalRecord.status).toBe('passed');
    expect(minimalRecord.score).toBe(1.0);
  });

  it('should have correct MetricsSnapshot interface', () => {
    const minimalSnapshot: MetricsSnapshot = {
      date: '2026-02-19',
      project: 'test',
      totalRuns: 10,
      totalCases: 100,
      passedCases: 90,
      failedCases: 10,
      avgSuccessRate: 0.9,
      avgLatencyMs: 150,
      avgTokensPerRun: 500,
      totalTokens: 5000,
    };

    expect(minimalSnapshot.avgSuccessRate).toBe(0.9);
    expect(minimalSnapshot.totalRuns).toBe(10);
  });

  it('should have valid CaseResultStatus types', () => {
    const statuses: CaseResultRecord['status'][] = ['passed', 'failed', 'error'];

    for (const expectedStatus of ['passed', 'failed', 'error'] as const) {
      expect(statuses).toContain(expectedStatus);
    }
  });
});
|