@artemiskit/core 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +116 -0
- package/dist/adapters/types.d.ts +8 -1
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +39 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/index.d.ts +5 -0
- package/dist/cost/index.d.ts.map +1 -0
- package/dist/cost/pricing.d.ts +67 -0
- package/dist/cost/pricing.d.ts.map +1 -0
- package/dist/evaluators/combined.d.ts +10 -0
- package/dist/evaluators/combined.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +4 -0
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/inline.d.ts +22 -0
- package/dist/evaluators/inline.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/evaluators/not-contains.d.ts +10 -0
- package/dist/evaluators/not-contains.d.ts.map +1 -0
- package/dist/evaluators/similarity.d.ts +16 -0
- package/dist/evaluators/similarity.d.ts.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13212 -12018
- package/dist/scenario/discovery.d.ts +72 -0
- package/dist/scenario/discovery.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +1 -0
- package/dist/scenario/index.d.ts.map +1 -1
- package/dist/scenario/schema.d.ts +1253 -9
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +62 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +8 -1
- package/src/artifacts/types.ts +39 -0
- package/src/cost/index.ts +14 -0
- package/src/cost/pricing.ts +450 -0
- package/src/evaluators/combined.test.ts +172 -0
- package/src/evaluators/combined.ts +95 -0
- package/src/evaluators/index.ts +12 -0
- package/src/evaluators/inline.test.ts +409 -0
- package/src/evaluators/inline.ts +393 -0
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/not-contains.test.ts +105 -0
- package/src/evaluators/not-contains.ts +45 -0
- package/src/evaluators/similarity.test.ts +333 -0
- package/src/evaluators/similarity.ts +258 -0
- package/src/index.ts +3 -0
- package/src/scenario/discovery.test.ts +153 -0
- package/src/scenario/discovery.ts +277 -0
- package/src/scenario/index.ts +1 -0
- package/src/scenario/schema.ts +47 -2
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +162 -2
- package/src/storage/types.ts +73 -0
|
@@ -162,4 +162,247 @@ describe('LocalStorageAdapter', () => {
|
|
|
162
162
|
const runs = await emptyStorage.list();
|
|
163
163
|
expect(runs).toEqual([]);
|
|
164
164
|
});
|
|
165
|
+
|
|
166
|
+
// ==================== Baseline Tests ====================
|
|
167
|
+
|
|
168
|
+
describe('Baseline Management', () => {
|
|
169
|
+
// Shared fixture for this suite: mockManifest re-labelled with its own run ID and scenario.
const baselineManifest: RunManifest = {
  ...mockManifest,
  run_id: 'baseline-test-run',
  config: { ...mockManifest.config, scenario: 'baseline-test-scenario' },
};
|
|
177
|
+
|
|
178
|
+
// Checks the metadata returned by setBaseline for a freshly saved run.
test('sets a baseline for a scenario', async () => {
  await storage.save(baselineManifest);

  const created = await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
  const { scenario, runId, metrics, createdAt } = created;

  expect(scenario).toBe('baseline-test-scenario');
  expect(runId).toBe('baseline-test-run');
  expect(metrics.successRate).toBe(0.8);
  expect(metrics.totalCases).toBe(10);
  expect(metrics.passedCases).toBe(8);
  expect(createdAt).toBeDefined();
});
|
|
190
|
+
|
|
191
|
+
// The optional third argument is echoed back as the baseline's tag.
test('sets a baseline with a tag', async () => {
  await storage.save(baselineManifest);

  const releaseTag = 'v1.0.0-release';
  const { tag } = await storage.setBaseline(
    'baseline-test-scenario',
    'baseline-test-run',
    releaseTag
  );

  expect(tag).toBe(releaseTag);
});
|
|
202
|
+
|
|
203
|
+
// A baseline that was set can be read back by scenario name.
test('gets a baseline for a scenario', async () => {
  const scenario = 'baseline-test-scenario';
  await storage.save(baselineManifest);
  await storage.setBaseline(scenario, 'baseline-test-run');

  const fetched = await storage.getBaseline(scenario);

  expect(fetched).not.toBeNull();
  expect(fetched?.runId).toBe('baseline-test-run');
  expect(fetched?.metrics.successRate).toBe(0.8);
});
|
|
213
|
+
|
|
214
|
+
// Looking up a scenario that never had a baseline yields null.
test('returns null for non-existent baseline', async () => {
  const missing = await storage.getBaseline('non-existent-scenario');

  expect(missing).toBeNull();
});
|
|
218
|
+
|
|
219
|
+
// A baseline can also be looked up through the run it points at.
test('gets a baseline by run ID', async () => {
  const runId = 'baseline-test-run';
  await storage.save(baselineManifest);
  await storage.setBaseline('baseline-test-scenario', runId, 'by-run-id-test');

  const found = await storage.getBaselineByRunId(runId);

  expect(found).not.toBeNull();
  expect(found?.scenario).toBe('baseline-test-scenario');
  expect(found?.runId).toBe(runId);
  expect(found?.tag).toBe('by-run-id-test');
});
|
|
230
|
+
|
|
231
|
+
// Looking up a run ID that no baseline references yields null.
test('returns null for non-existent baseline by run ID', async () => {
  const missing = await storage.getBaselineByRunId('non-existent-run-id');

  expect(missing).toBeNull();
});
|
|
235
|
+
|
|
236
|
+
// listBaselines returns every stored baseline, each pointing at the run it was set from.
test('lists all baselines', async () => {
  // NOTE: a second adapter on the same directory does NOT clear anything — it reads the
  // same baselines file as any other adapter rooted at TEST_DIR. Baselines written by
  // other tests may therefore still be present, which is why the count check is ">= 2".
  const freshStorage = new LocalStorageAdapter(TEST_DIR);

  const fixtures = [
    { scenario: 'scenario-1', runId: 'list-baseline-1' },
    { scenario: 'scenario-2', runId: 'list-baseline-2' },
  ];

  // Save both runs first, then promote each one to a baseline.
  for (const { scenario, runId } of fixtures) {
    await freshStorage.save({
      ...mockManifest,
      run_id: runId,
      config: { ...mockManifest.config, scenario },
    });
  }
  for (const { scenario, runId } of fixtures) {
    await freshStorage.setBaseline(scenario, runId);
  }

  const baselines = await freshStorage.listBaselines();

  expect(baselines.length).toBeGreaterThanOrEqual(2);
  // Check the scenario -> run mapping, not just that the scenario name appears.
  for (const { scenario, runId } of fixtures) {
    const listed = baselines.find((b) => b.scenario === scenario);
    expect(listed?.runId).toBe(runId);
  }
});
|
|
263
|
+
|
|
264
|
+
// Removing by scenario name reports success and makes the baseline unreadable.
test('removes a baseline', async () => {
  const scenario = 'baseline-test-scenario';
  await storage.save(baselineManifest);
  await storage.setBaseline(scenario, 'baseline-test-run');

  // Precondition: the baseline is stored.
  const stored = await storage.getBaseline(scenario);
  expect(stored).not.toBeNull();

  // Removal reports success...
  const wasRemoved = await storage.removeBaseline(scenario);
  expect(wasRemoved).toBe(true);

  // ...and the baseline can no longer be fetched.
  const remaining = await storage.getBaseline(scenario);
  expect(remaining).toBeNull();
});
|
|
280
|
+
|
|
281
|
+
// Removing a scenario that has no baseline reports false.
test('returns false when removing non-existent baseline', async () => {
  const wasRemoved = await storage.removeBaseline('non-existent-baseline');

  expect(wasRemoved).toBe(false);
});
|
|
285
|
+
|
|
286
|
+
// Removing by run ID reports success and makes the baseline unreadable by that run ID.
test('removes a baseline by run ID', async () => {
  const runId = 'remove-by-id-run';
  const scenario = 'remove-by-id-scenario';

  await storage.save({
    ...mockManifest,
    run_id: runId,
    config: { ...mockManifest.config, scenario },
  });
  await storage.setBaseline(scenario, runId);

  // Precondition: the baseline is reachable through its run ID.
  const stored = await storage.getBaselineByRunId(runId);
  expect(stored).not.toBeNull();

  // Removal reports success...
  const wasRemoved = await storage.removeBaselineByRunId(runId);
  expect(wasRemoved).toBe(true);

  // ...and the run ID no longer resolves to a baseline.
  const remaining = await storage.getBaselineByRunId(runId);
  expect(remaining).toBeNull();
});
|
|
307
|
+
|
|
308
|
+
// Removing by a run ID that no baseline references reports false.
test('returns false when removing non-existent baseline by run ID', async () => {
  const wasRemoved = await storage.removeBaselineByRunId('non-existent-run-id');

  expect(wasRemoved).toBe(false);
});
|
|
312
|
+
|
|
313
|
+
// Setting a baseline twice for one scenario replaces the earlier entry.
test('updates existing baseline when set again', async () => {
  const scenario = 'baseline-test-scenario';

  // First baseline, from the shared fixture run.
  await storage.save(baselineManifest);
  await storage.setBaseline(scenario, 'baseline-test-run', 'v1.0');

  // A second run of the same scenario with different metrics.
  const improvedRun = {
    ...baselineManifest,
    run_id: 'baseline-test-run-2',
    metrics: {
      ...baselineManifest.metrics,
      success_rate: 0.95,
      passed_cases: 9,
      failed_cases: 1,
    },
  };
  await storage.save(improvedRun);

  // Point the scenario's baseline at the second run.
  await storage.setBaseline(scenario, 'baseline-test-run-2', 'v2.0');

  const current = await storage.getBaseline(scenario);
  expect(current?.runId).toBe('baseline-test-run-2');
  expect(current?.tag).toBe('v2.0');
  expect(current?.metrics.successRate).toBe(0.95);
});
|
|
340
|
+
|
|
341
|
+
// A run whose success rate is above the baseline's is not flagged as a regression.
test('compares run to baseline and detects no regression', async () => {
  // Builds a run for 'compare-scenario' with the given ID and success rate.
  const makeRun = (run_id: string, success_rate: number) => ({
    ...mockManifest,
    run_id,
    config: { ...mockManifest.config, scenario: 'compare-scenario' },
    metrics: { ...mockManifest.metrics, success_rate },
  });

  // Baseline run.
  await storage.save(makeRun('compare-baseline-run', 0.8));
  await storage.setBaseline('compare-scenario', 'compare-baseline-run');

  // Current run with a higher success rate.
  await storage.save(makeRun('compare-current-run', 0.85));

  const outcome = await storage.compareToBaseline('compare-current-run', 0.05);

  expect(outcome).not.toBeNull();
  expect(outcome?.hasRegression).toBe(false);
  expect(outcome?.comparison.delta.successRate).toBeCloseTo(0.05, 2);
});
|
|
367
|
+
|
|
368
|
+
// A run whose success rate drops by more than the threshold is flagged as a regression.
test('compares run to baseline and detects regression', async () => {
  // Builds a run for 'regression-scenario' with the given ID and success rate.
  const makeRun = (run_id: string, success_rate: number) => ({
    ...mockManifest,
    run_id,
    config: { ...mockManifest.config, scenario: 'regression-scenario' },
    metrics: { ...mockManifest.metrics, success_rate },
  });

  // Baseline run.
  await storage.save(makeRun('regression-baseline-run', 0.9));
  await storage.setBaseline('regression-scenario', 'regression-baseline-run');

  // Current run with a lower success rate.
  await storage.save(makeRun('regression-current-run', 0.7));

  const outcome = await storage.compareToBaseline('regression-current-run', 0.05);

  expect(outcome).not.toBeNull();
  expect(outcome?.hasRegression).toBe(true);
  expect(outcome?.comparison.delta.successRate).toBeCloseTo(-0.2, 2);
});
|
|
394
|
+
|
|
395
|
+
// Comparing a run whose scenario has no baseline yields null.
test('returns null when no baseline exists for scenario', async () => {
  const runId = 'no-baseline-run';
  await storage.save({
    ...mockManifest,
    run_id: runId,
    config: { ...mockManifest.config, scenario: 'no-baseline-scenario' },
  });

  const outcome = await storage.compareToBaseline(runId);

  expect(outcome).toBeNull();
});
|
|
407
|
+
});
|
|
165
408
|
});
|
package/src/storage/local.ts
CHANGED
|
@@ -5,7 +5,13 @@
|
|
|
5
5
|
import { mkdir, readFile, readdir, unlink, writeFile } from 'node:fs/promises';
|
|
6
6
|
import { join, resolve } from 'node:path';
|
|
7
7
|
import type { AnyManifest, RedTeamManifest, RunManifest, StressManifest } from '../artifacts/types';
|
|
8
|
-
import type {
|
|
8
|
+
import type {
|
|
9
|
+
BaselineMetadata,
|
|
10
|
+
BaselineStorageAdapter,
|
|
11
|
+
ComparisonResult,
|
|
12
|
+
ListOptions,
|
|
13
|
+
RunListItem,
|
|
14
|
+
} from './types';
|
|
9
15
|
|
|
10
16
|
/**
|
|
11
17
|
* Get manifest type from a manifest object
|
|
@@ -39,11 +45,21 @@ function getScenario(manifest: AnyManifest): string {
|
|
|
39
45
|
return manifest.config.scenario;
|
|
40
46
|
}
|
|
41
47
|
|
|
42
|
-
|
|
48
|
+
/**
 * Shape of the JSON document stored in the baselines file.
 */
interface BaselinesFile {
  /** Format version string written alongside the entries. */
  version: string;
  /** Stored baseline entries, indexed by string key. */
  baselines: Record<string, BaselineMetadata>;
}
|
|
55
|
+
|
|
56
|
+
export class LocalStorageAdapter implements BaselineStorageAdapter {
|
|
43
57
|
private basePath: string;
|
|
58
|
+
private baselinesPath: string;
|
|
44
59
|
|
|
45
60
|
constructor(basePath = './artemis-runs') {
  // Resolve once so both stored paths are absolute.
  const root = resolve(basePath);
  this.basePath = root;
  // The baselines file lives under a `.artemis` sub-directory of the base path.
  this.baselinesPath = join(root, '.artemis', 'baselines.json');
}
|
|
48
64
|
|
|
49
65
|
async save(manifest: AnyManifest): Promise<string> {
|
|
@@ -191,4 +207,148 @@ export class LocalStorageAdapter implements StorageAdapter {
|
|
|
191
207
|
return [];
|
|
192
208
|
}
|
|
193
209
|
}
|
|
210
|
+
|
|
211
|
+
// ==================== Baseline Methods ====================
|
|
212
|
+
|
|
213
|
+
/**
 * Read and parse the baselines file.
 *
 * A missing file means "no baselines yet" and yields an empty structure.
 * Any other failure is rethrown: falling back to an empty structure for an
 * unreadable or malformed file would let the next save overwrite it and
 * silently discard every stored baseline.
 *
 * @returns The parsed file, or `{ version: '1.0', baselines: {} }` when the file does not exist.
 * @throws If the file exists but cannot be read, is not valid JSON, or lacks a `baselines` object.
 */
private async loadBaselinesFile(): Promise<BaselinesFile> {
  let content: string;
  try {
    content = await readFile(this.baselinesPath, 'utf-8');
  } catch (error: unknown) {
    const code =
      typeof error === 'object' && error !== null
        ? (error as { code?: unknown }).code
        : undefined;
    // Only "file not found" is an expected condition.
    if (code === 'ENOENT') {
      return { version: '1.0', baselines: {} };
    }
    throw error;
  }

  const parsed: unknown = JSON.parse(content);
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  if (!isPlainObject(parsed) || !isPlainObject(parsed.baselines)) {
    throw new Error(
      `Invalid baselines file at ${this.baselinesPath}: expected an object with a "baselines" object`
    );
  }

  // Shape checked above; individual entries are trusted as written by this adapter.
  return parsed as unknown as BaselinesFile;
}
|
|
224
|
+
|
|
225
|
+
/**
 * Write the given baselines structure to the baselines file as
 * 2-space-indented JSON, creating the `.artemis` directory first if needed.
 *
 * @param data - Full file contents to persist; replaces whatever was stored.
 */
private async saveBaselinesFile(data: BaselinesFile): Promise<void> {
  await mkdir(join(this.basePath, '.artemis'), { recursive: true });

  const serialized = JSON.stringify(data, null, 2);
  await writeFile(this.baselinesPath, serialized);
}
|
|
233
|
+
|
|
234
|
+
/**
 * Record a run as the baseline for a scenario, replacing any baseline already
 * stored under that scenario name.
 *
 * @param scenario - Scenario name to store the baseline under; when empty, the
 *   scenario from the run's own manifest is used instead.
 * @param runId - ID of the run whose metrics are captured.
 * @param tag - Optional label stored with the baseline.
 * @returns The baseline entry that was written.
 */
async setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata> {
  // The run is loaded first so its metrics can be snapshotted into the entry.
  const run = await this.loadRun(runId);
  const key = scenario || getScenario(run);
  const source = run.metrics;

  const entry: BaselineMetadata = {
    scenario: key,
    runId,
    createdAt: new Date().toISOString(),
    metrics: {
      successRate: source.success_rate,
      medianLatencyMs: source.median_latency_ms,
      totalTokens: source.total_tokens,
      passedCases: source.passed_cases,
      failedCases: source.failed_cases,
      totalCases: source.total_cases,
    },
    tag,
  };

  // Read-modify-write of the whole baselines file.
  const file = await this.loadBaselinesFile();
  file.baselines[key] = entry;
  await this.saveBaselinesFile(file);

  return entry;
}
|
|
264
|
+
|
|
265
|
+
/**
 * Look up the baseline stored for a scenario.
 *
 * @param scenario - Scenario name the baseline was stored under.
 * @returns The stored baseline, or `null` when none exists for that name.
 */
async getBaseline(scenario: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();

  // Own-property check: a plain index lookup would resolve names such as
  // "constructor" or "toString" to inherited Object.prototype members and
  // return them as if they were baselines.
  if (!Object.prototype.hasOwnProperty.call(baselines, scenario)) {
    return null;
  }
  return baselines[scenario] ?? null;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Find the first stored baseline that points at the given run.
 *
 * @param runId - Run ID to search for.
 * @returns The matching baseline, or `null` when no baseline references the run.
 */
async getBaselineByRunId(runId: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();

  for (const candidate of Object.values(baselines)) {
    if (candidate.runId === runId) {
      return candidate;
    }
  }
  return null;
}
|
|
281
|
+
|
|
282
|
+
/**
 * Return every stored baseline, ordered by `createdAt` from newest to oldest.
 */
async listBaselines(): Promise<BaselineMetadata[]> {
  const { baselines } = await this.loadBaselinesFile();
  const toMillis = (entry: BaselineMetadata): number => new Date(entry.createdAt).getTime();

  // Object.values produces a fresh array, so sorting it in place is safe.
  const entries = Object.values(baselines);
  entries.sort((left, right) => toMillis(right) - toMillis(left));
  return entries;
}
|
|
291
|
+
|
|
292
|
+
/**
 * Remove the baseline stored for a scenario.
 *
 * @param scenario - Scenario name the baseline was stored under.
 * @returns `true` if a baseline existed and was removed, `false` otherwise
 *   (in which case the baselines file is not rewritten).
 */
async removeBaseline(scenario: string): Promise<boolean> {
  const data = await this.loadBaselinesFile();

  // Own-property check: a truthiness test on the index lookup would report
  // success for inherited names such as "toString" although nothing is stored.
  if (!Object.prototype.hasOwnProperty.call(data.baselines, scenario)) {
    return false;
  }

  delete data.baselines[scenario];
  await this.saveBaselinesFile(data);
  return true;
}
|
|
304
|
+
|
|
305
|
+
/**
 * Remove every baseline that points at the given run.
 *
 * The same run can be stored as the baseline for more than one scenario, so
 * all matching entries are deleted; afterwards no stored baseline references
 * the run.
 *
 * @param runId - Run ID whose baseline entries should be removed.
 * @returns `true` if at least one baseline was removed, `false` otherwise
 *   (in which case the baselines file is not rewritten).
 */
async removeBaselineByRunId(runId: string): Promise<boolean> {
  const data = await this.loadBaselinesFile();

  const matchingKeys = Object.entries(data.baselines)
    .filter(([, baseline]) => baseline.runId === runId)
    .map(([key]) => key);

  if (matchingKeys.length === 0) {
    return false;
  }

  for (const key of matchingKeys) {
    delete data.baselines[key];
  }
  await this.saveBaselinesFile(data);
  return true;
}
|
|
318
|
+
|
|
319
|
+
/**
 * Compare a run with the baseline stored for that run's scenario.
 *
 * @param runId - ID of the run to compare.
 * @param regressionThreshold - Allowed drop in success rate; a regression is
 *   reported only when the success-rate delta is below the negated threshold.
 *   Defaults to 0.05.
 * @returns The baseline, the comparison, the regression flag and the threshold
 *   used, or `null` when the run's scenario has no baseline.
 */
async compareToBaseline(
  runId: string,
  regressionThreshold = 0.05
): Promise<{
  baseline: BaselineMetadata;
  comparison: ComparisonResult;
  hasRegression: boolean;
  regressionThreshold: number;
} | null> {
  // The scenario is taken from the run itself, not supplied by the caller.
  const run = await this.loadRun(runId);
  const baseline = await this.getBaseline(getScenario(run));
  if (!baseline) {
    return null;
  }

  // Baseline run first, current run second.
  const comparison = await this.compare(baseline.runId, runId);

  return {
    baseline,
    comparison,
    hasRegression: comparison.delta.successRate < -regressionThreshold,
    regressionThreshold,
  };
}
|
|
194
354
|
}
|
package/src/storage/types.ts
CHANGED
|
@@ -96,3 +96,76 @@ export interface StorageConfig {
|
|
|
96
96
|
bucket?: string;
|
|
97
97
|
basePath?: string;
|
|
98
98
|
}
|
|
99
|
+
|
|
100
|
+
/**
 * A stored baseline: which run is the reference point for a scenario, plus a
 * snapshot of that run's headline metrics for regression comparison.
 */
export interface BaselineMetadata {
  /** Name or identifier of the scenario. */
  scenario: string;
  /** ID of the run used as the baseline. */
  runId: string;
  /** ISO timestamp of the moment the baseline was set. */
  createdAt: string;
  /** Key metrics captured when the baseline was set. */
  metrics: {
    successRate: number;
    medianLatencyMs: number;
    totalTokens: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
  };
  /** Optional free-form description or tag. */
  tag?: string;
}
|
|
122
|
+
|
|
123
|
+
/**
 * A storage adapter that can additionally store and query baselines.
 */
export interface BaselineStorageAdapter extends StorageAdapter {
  /**
   * Set the baseline for a scenario.
   * @param scenario - Scenario name the baseline is stored under
   * @param runId - ID of the run to use as the baseline
   * @param tag - Optional description or tag
   */
  setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;

  /**
   * Get the baseline for a scenario name, or `null`.
   */
  getBaseline(scenario: string): Promise<BaselineMetadata | null>;

  /**
   * Get the baseline that references a run ID, or `null`.
   */
  getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;

  /**
   * List all stored baselines.
   */
  listBaselines(): Promise<BaselineMetadata[]>;

  /**
   * Remove the baseline for a scenario name.
   */
  removeBaseline(scenario: string): Promise<boolean>;

  /**
   * Remove the baseline that references a run ID.
   */
  removeBaselineByRunId(runId: string): Promise<boolean>;

  /**
   * Compare a run against its baseline, if one exists. Optional: an
   * implementation may omit this method, so callers must check for it.
   * @param runId - The run ID to compare
   * @param regressionThreshold - Threshold for regression detection (0-1), default 0.05
   */
  compareToBaseline?(
    runId: string,
    regressionThreshold?: number
  ): Promise<{
    baseline: BaselineMetadata;
    comparison: ComparisonResult;
    hasRegression: boolean;
    regressionThreshold: number;
  } | null>;
}
|