@artemiskit/core 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/dist/adapters/types.d.ts +5 -0
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/manifest.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +20 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/pricing.d.ts +2 -1
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/index.js +468 -205
- package/dist/scenario/schema.d.ts +8 -0
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +66 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +5 -0
- package/src/artifacts/manifest.ts +24 -2
- package/src/artifacts/types.ts +21 -0
- package/src/cost/pricing.ts +242 -65
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/similarity.test.ts +4 -3
- package/src/scenario/schema.ts +4 -0
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +186 -4
- package/src/storage/types.ts +77 -0
- package/dist/events/emitter.d.ts +0 -111
- package/dist/events/emitter.d.ts.map +0 -1
- package/dist/events/index.d.ts +0 -6
- package/dist/events/index.d.ts.map +0 -1
- package/dist/events/types.d.ts +0 -177
- package/dist/events/types.d.ts.map +0 -1
|
@@ -162,4 +162,247 @@ describe('LocalStorageAdapter', () => {
|
|
|
162
162
|
const runs = await emptyStorage.list();
|
|
163
163
|
expect(runs).toEqual([]);
|
|
164
164
|
});
|
|
165
|
+
|
|
166
|
+
// ==================== Baseline Tests ====================
|
|
167
|
+
|
|
168
|
+
describe('Baseline Management', () => {
|
|
169
|
+
const baselineManifest: RunManifest = {
|
|
170
|
+
...mockManifest,
|
|
171
|
+
run_id: 'baseline-test-run',
|
|
172
|
+
config: {
|
|
173
|
+
...mockManifest.config,
|
|
174
|
+
scenario: 'baseline-test-scenario',
|
|
175
|
+
},
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
test('sets a baseline for a scenario', async () => {
|
|
179
|
+
await storage.save(baselineManifest);
|
|
180
|
+
|
|
181
|
+
const baseline = await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
182
|
+
|
|
183
|
+
expect(baseline.scenario).toBe('baseline-test-scenario');
|
|
184
|
+
expect(baseline.runId).toBe('baseline-test-run');
|
|
185
|
+
expect(baseline.metrics.successRate).toBe(0.8);
|
|
186
|
+
expect(baseline.metrics.totalCases).toBe(10);
|
|
187
|
+
expect(baseline.metrics.passedCases).toBe(8);
|
|
188
|
+
expect(baseline.createdAt).toBeDefined();
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
test('sets a baseline with a tag', async () => {
|
|
192
|
+
await storage.save(baselineManifest);
|
|
193
|
+
|
|
194
|
+
const baseline = await storage.setBaseline(
|
|
195
|
+
'baseline-test-scenario',
|
|
196
|
+
'baseline-test-run',
|
|
197
|
+
'v1.0.0-release'
|
|
198
|
+
);
|
|
199
|
+
|
|
200
|
+
expect(baseline.tag).toBe('v1.0.0-release');
|
|
201
|
+
});
|
|
202
|
+
|
|
203
|
+
test('gets a baseline for a scenario', async () => {
|
|
204
|
+
await storage.save(baselineManifest);
|
|
205
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
206
|
+
|
|
207
|
+
const baseline = await storage.getBaseline('baseline-test-scenario');
|
|
208
|
+
|
|
209
|
+
expect(baseline).not.toBeNull();
|
|
210
|
+
expect(baseline?.runId).toBe('baseline-test-run');
|
|
211
|
+
expect(baseline?.metrics.successRate).toBe(0.8);
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
test('returns null for non-existent baseline', async () => {
|
|
215
|
+
const baseline = await storage.getBaseline('non-existent-scenario');
|
|
216
|
+
expect(baseline).toBeNull();
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
test('gets a baseline by run ID', async () => {
|
|
220
|
+
await storage.save(baselineManifest);
|
|
221
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'by-run-id-test');
|
|
222
|
+
|
|
223
|
+
const baseline = await storage.getBaselineByRunId('baseline-test-run');
|
|
224
|
+
|
|
225
|
+
expect(baseline).not.toBeNull();
|
|
226
|
+
expect(baseline?.scenario).toBe('baseline-test-scenario');
|
|
227
|
+
expect(baseline?.runId).toBe('baseline-test-run');
|
|
228
|
+
expect(baseline?.tag).toBe('by-run-id-test');
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
test('returns null for non-existent baseline by run ID', async () => {
|
|
232
|
+
const baseline = await storage.getBaselineByRunId('non-existent-run-id');
|
|
233
|
+
expect(baseline).toBeNull();
|
|
234
|
+
});
|
|
235
|
+
|
|
236
|
+
test('lists all baselines', async () => {
|
|
237
|
+
// Use a separate adapter instance on the same TEST_DIR. This does not clear baselines already persisted there, so the count asserted below is only a lower bound.
|
|
238
|
+
const freshStorage = new LocalStorageAdapter(TEST_DIR);
|
|
239
|
+
|
|
240
|
+
// Create multiple runs and baselines
|
|
241
|
+
const manifest1 = {
|
|
242
|
+
...mockManifest,
|
|
243
|
+
run_id: 'list-baseline-1',
|
|
244
|
+
config: { ...mockManifest.config, scenario: 'scenario-1' },
|
|
245
|
+
};
|
|
246
|
+
const manifest2 = {
|
|
247
|
+
...mockManifest,
|
|
248
|
+
run_id: 'list-baseline-2',
|
|
249
|
+
config: { ...mockManifest.config, scenario: 'scenario-2' },
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
await freshStorage.save(manifest1);
|
|
253
|
+
await freshStorage.save(manifest2);
|
|
254
|
+
await freshStorage.setBaseline('scenario-1', 'list-baseline-1');
|
|
255
|
+
await freshStorage.setBaseline('scenario-2', 'list-baseline-2');
|
|
256
|
+
|
|
257
|
+
const baselines = await freshStorage.listBaselines();
|
|
258
|
+
|
|
259
|
+
expect(baselines.length).toBeGreaterThanOrEqual(2);
|
|
260
|
+
expect(baselines.some((b) => b.scenario === 'scenario-1')).toBe(true);
|
|
261
|
+
expect(baselines.some((b) => b.scenario === 'scenario-2')).toBe(true);
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
test('removes a baseline', async () => {
|
|
265
|
+
await storage.save(baselineManifest);
|
|
266
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
267
|
+
|
|
268
|
+
// Verify it exists
|
|
269
|
+
const before = await storage.getBaseline('baseline-test-scenario');
|
|
270
|
+
expect(before).not.toBeNull();
|
|
271
|
+
|
|
272
|
+
// Remove
|
|
273
|
+
const removed = await storage.removeBaseline('baseline-test-scenario');
|
|
274
|
+
expect(removed).toBe(true);
|
|
275
|
+
|
|
276
|
+
// Verify it's gone
|
|
277
|
+
const after = await storage.getBaseline('baseline-test-scenario');
|
|
278
|
+
expect(after).toBeNull();
|
|
279
|
+
});
|
|
280
|
+
|
|
281
|
+
test('returns false when removing non-existent baseline', async () => {
|
|
282
|
+
const removed = await storage.removeBaseline('non-existent-baseline');
|
|
283
|
+
expect(removed).toBe(false);
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
test('removes a baseline by run ID', async () => {
|
|
287
|
+
const removeByIdManifest = {
|
|
288
|
+
...mockManifest,
|
|
289
|
+
run_id: 'remove-by-id-run',
|
|
290
|
+
config: { ...mockManifest.config, scenario: 'remove-by-id-scenario' },
|
|
291
|
+
};
|
|
292
|
+
await storage.save(removeByIdManifest);
|
|
293
|
+
await storage.setBaseline('remove-by-id-scenario', 'remove-by-id-run');
|
|
294
|
+
|
|
295
|
+
// Verify it exists
|
|
296
|
+
const before = await storage.getBaselineByRunId('remove-by-id-run');
|
|
297
|
+
expect(before).not.toBeNull();
|
|
298
|
+
|
|
299
|
+
// Remove by run ID
|
|
300
|
+
const removed = await storage.removeBaselineByRunId('remove-by-id-run');
|
|
301
|
+
expect(removed).toBe(true);
|
|
302
|
+
|
|
303
|
+
// Verify it's gone
|
|
304
|
+
const after = await storage.getBaselineByRunId('remove-by-id-run');
|
|
305
|
+
expect(after).toBeNull();
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
test('returns false when removing non-existent baseline by run ID', async () => {
|
|
309
|
+
const removed = await storage.removeBaselineByRunId('non-existent-run-id');
|
|
310
|
+
expect(removed).toBe(false);
|
|
311
|
+
});
|
|
312
|
+
|
|
313
|
+
test('updates existing baseline when set again', async () => {
|
|
314
|
+
await storage.save(baselineManifest);
|
|
315
|
+
|
|
316
|
+
// Set initial baseline
|
|
317
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'v1.0');
|
|
318
|
+
|
|
319
|
+
// Create a new run with different metrics
|
|
320
|
+
const newManifest = {
|
|
321
|
+
...baselineManifest,
|
|
322
|
+
run_id: 'baseline-test-run-2',
|
|
323
|
+
metrics: {
|
|
324
|
+
...baselineManifest.metrics,
|
|
325
|
+
success_rate: 0.95,
|
|
326
|
+
passed_cases: 9,
|
|
327
|
+
failed_cases: 1,
|
|
328
|
+
},
|
|
329
|
+
};
|
|
330
|
+
await storage.save(newManifest);
|
|
331
|
+
|
|
332
|
+
// Update baseline
|
|
333
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run-2', 'v2.0');
|
|
334
|
+
|
|
335
|
+
const baseline = await storage.getBaseline('baseline-test-scenario');
|
|
336
|
+
expect(baseline?.runId).toBe('baseline-test-run-2');
|
|
337
|
+
expect(baseline?.tag).toBe('v2.0');
|
|
338
|
+
expect(baseline?.metrics.successRate).toBe(0.95);
|
|
339
|
+
});
|
|
340
|
+
|
|
341
|
+
test('compares run to baseline and detects no regression', async () => {
|
|
342
|
+
// Create baseline run
|
|
343
|
+
const baselineRun = {
|
|
344
|
+
...mockManifest,
|
|
345
|
+
run_id: 'compare-baseline-run',
|
|
346
|
+
config: { ...mockManifest.config, scenario: 'compare-scenario' },
|
|
347
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.8 },
|
|
348
|
+
};
|
|
349
|
+
await storage.save(baselineRun);
|
|
350
|
+
await storage.setBaseline('compare-scenario', 'compare-baseline-run');
|
|
351
|
+
|
|
352
|
+
// Create current run with same or better success rate
|
|
353
|
+
const currentRun = {
|
|
354
|
+
...mockManifest,
|
|
355
|
+
run_id: 'compare-current-run',
|
|
356
|
+
config: { ...mockManifest.config, scenario: 'compare-scenario' },
|
|
357
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.85 },
|
|
358
|
+
};
|
|
359
|
+
await storage.save(currentRun);
|
|
360
|
+
|
|
361
|
+
const result = await storage.compareToBaseline('compare-current-run', 0.05);
|
|
362
|
+
|
|
363
|
+
expect(result).not.toBeNull();
|
|
364
|
+
expect(result?.hasRegression).toBe(false);
|
|
365
|
+
expect(result?.comparison.delta.successRate).toBeCloseTo(0.05, 2);
|
|
366
|
+
});
|
|
367
|
+
|
|
368
|
+
test('compares run to baseline and detects regression', async () => {
|
|
369
|
+
// Create baseline run
|
|
370
|
+
const baselineRun = {
|
|
371
|
+
...mockManifest,
|
|
372
|
+
run_id: 'regression-baseline-run',
|
|
373
|
+
config: { ...mockManifest.config, scenario: 'regression-scenario' },
|
|
374
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.9 },
|
|
375
|
+
};
|
|
376
|
+
await storage.save(baselineRun);
|
|
377
|
+
await storage.setBaseline('regression-scenario', 'regression-baseline-run');
|
|
378
|
+
|
|
379
|
+
// Create current run with worse success rate (regression)
|
|
380
|
+
const currentRun = {
|
|
381
|
+
...mockManifest,
|
|
382
|
+
run_id: 'regression-current-run',
|
|
383
|
+
config: { ...mockManifest.config, scenario: 'regression-scenario' },
|
|
384
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.7 },
|
|
385
|
+
};
|
|
386
|
+
await storage.save(currentRun);
|
|
387
|
+
|
|
388
|
+
const result = await storage.compareToBaseline('regression-current-run', 0.05);
|
|
389
|
+
|
|
390
|
+
expect(result).not.toBeNull();
|
|
391
|
+
expect(result?.hasRegression).toBe(true);
|
|
392
|
+
expect(result?.comparison.delta.successRate).toBeCloseTo(-0.2, 2);
|
|
393
|
+
});
|
|
394
|
+
|
|
395
|
+
test('returns null when no baseline exists for scenario', async () => {
|
|
396
|
+
const noBaselineRun = {
|
|
397
|
+
...mockManifest,
|
|
398
|
+
run_id: 'no-baseline-run',
|
|
399
|
+
config: { ...mockManifest.config, scenario: 'no-baseline-scenario' },
|
|
400
|
+
};
|
|
401
|
+
await storage.save(noBaselineRun);
|
|
402
|
+
|
|
403
|
+
const result = await storage.compareToBaseline('no-baseline-run');
|
|
404
|
+
|
|
405
|
+
expect(result).toBeNull();
|
|
406
|
+
});
|
|
407
|
+
});
|
|
165
408
|
});
|
package/src/storage/local.ts
CHANGED
|
@@ -5,7 +5,13 @@
|
|
|
5
5
|
import { mkdir, readFile, readdir, unlink, writeFile } from 'node:fs/promises';
|
|
6
6
|
import { join, resolve } from 'node:path';
|
|
7
7
|
import type { AnyManifest, RedTeamManifest, RunManifest, StressManifest } from '../artifacts/types';
|
|
8
|
-
import type {
|
|
8
|
+
import type {
|
|
9
|
+
BaselineMetadata,
|
|
10
|
+
BaselineStorageAdapter,
|
|
11
|
+
ComparisonResult,
|
|
12
|
+
ListOptions,
|
|
13
|
+
RunListItem,
|
|
14
|
+
} from './types';
|
|
9
15
|
|
|
10
16
|
/**
|
|
11
17
|
* Get manifest type from a manifest object
|
|
@@ -32,6 +38,21 @@ function getSuccessRate(manifest: AnyManifest): number {
|
|
|
32
38
|
return (manifest as RunManifest).metrics.success_rate;
|
|
33
39
|
}
|
|
34
40
|
|
|
41
|
+
/**
 * Read the estimated cost (USD) recorded on a manifest, if any.
 *
 * Stress manifests expose it as `metrics.cost.estimated_total_usd`, run
 * manifests as `metrics.cost.total_usd`. Any other manifest type yields
 * `undefined`, as does a stress/run manifest without a `cost` section.
 *
 * @param manifest - Manifest of any supported type.
 * @returns The recorded cost, or `undefined` when none is available.
 */
function getEstimatedCost(manifest: AnyManifest): number | undefined {
  switch (getManifestType(manifest)) {
    case 'stress': {
      const stress = manifest as StressManifest;
      return stress.metrics.cost?.estimated_total_usd;
    }
    case 'run': {
      const run = manifest as RunManifest;
      return run.metrics.cost?.total_usd;
    }
    default:
      // Redteam doesn't have cost tracking yet
      return undefined;
  }
}
|
|
55
|
+
|
|
35
56
|
/**
|
|
36
57
|
* Get scenario name from any manifest type
|
|
37
58
|
*/
|
|
@@ -39,11 +60,21 @@ function getScenario(manifest: AnyManifest): string {
|
|
|
39
60
|
return manifest.config.scenario;
|
|
40
61
|
}
|
|
41
62
|
|
|
42
|
-
|
|
63
|
+
/**
 * On-disk shape of the baselines index (`.artemis/baselines.json` under the
 * adapter's base path).
 */
interface BaselinesFile {
  /** Format version of the file; the adapter creates new files with '1.0'. */
  version: string;
  /** Baseline entries keyed by scenario name (one baseline per scenario). */
  baselines: Record<string, BaselineMetadata>;
}
|
|
70
|
+
|
|
71
|
+
export class LocalStorageAdapter implements BaselineStorageAdapter {
|
|
43
72
|
private basePath: string;
|
|
73
|
+
private baselinesPath: string;
|
|
44
74
|
|
|
45
75
|
/**
 * @param basePath - Directory that holds stored runs; resolved to an absolute
 *   path. The baselines index lives at `<basePath>/.artemis/baselines.json`.
 */
constructor(basePath = './artemis-runs') {
  const absoluteBase = resolve(basePath);
  this.basePath = absoluteBase;
  this.baselinesPath = join(absoluteBase, '.artemis', 'baselines.json');
}
|
|
48
79
|
|
|
49
80
|
async save(manifest: AnyManifest): Promise<string> {
|
|
@@ -122,13 +153,20 @@ export class LocalStorageAdapter implements StorageAdapter {
|
|
|
122
153
|
continue;
|
|
123
154
|
}
|
|
124
155
|
|
|
125
|
-
|
|
156
|
+
const item: RunListItem = {
|
|
126
157
|
runId: manifest.run_id,
|
|
127
158
|
scenario: getScenario(manifest),
|
|
128
159
|
successRate: getSuccessRate(manifest),
|
|
129
160
|
createdAt: manifest.start_time,
|
|
130
161
|
type: manifestType,
|
|
131
|
-
}
|
|
162
|
+
};
|
|
163
|
+
|
|
164
|
+
// Include cost if requested
|
|
165
|
+
if (options?.includeCost) {
|
|
166
|
+
item.estimatedCostUsd = getEstimatedCost(manifest);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
results.push(item);
|
|
132
170
|
} catch {}
|
|
133
171
|
}
|
|
134
172
|
}
|
|
@@ -191,4 +229,148 @@ export class LocalStorageAdapter implements StorageAdapter {
|
|
|
191
229
|
return [];
|
|
192
230
|
}
|
|
193
231
|
}
|
|
232
|
+
|
|
233
|
+
// ==================== Baseline Methods ====================
|
|
234
|
+
|
|
235
|
+
/**
 * Load the baselines index from `this.baselinesPath`.
 *
 * A missing file is the normal "no baselines yet" state and yields an empty
 * index. Every other failure (unreadable file, invalid JSON, unexpected
 * shape) is thrown instead of being replaced by an empty index: the mutating
 * baseline methods write back whatever this returns, so treating a damaged
 * file as empty would overwrite the existing baselines on the next save.
 *
 * @returns The parsed index, or `{ version: '1.0', baselines: {} }` when the
 *   file does not exist.
 * @throws When the file exists but cannot be read, is not valid JSON, or has
 *   no `baselines` object.
 */
private async loadBaselinesFile(): Promise<BaselinesFile> {
  let content: string;
  try {
    content = await readFile(this.baselinesPath, 'utf-8');
  } catch (error: unknown) {
    const code =
      typeof error === 'object' && error !== null ? (error as { code?: unknown }).code : undefined;
    // Only "file does not exist" means there are no baselines yet.
    if (code === 'ENOENT') {
      return { version: '1.0', baselines: {} };
    }
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid JSON in baselines file ${this.baselinesPath}: ${reason}`);
  }

  const baselines =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { baselines?: unknown }).baselines
      : undefined;
  if (typeof baselines !== 'object' || baselines === null || Array.isArray(baselines)) {
    throw new Error(`Baselines file ${this.baselinesPath} has no "baselines" object`);
  }

  return parsed as BaselinesFile;
}
|
|
246
|
+
|
|
247
|
+
/**
 * Persist the baselines index to `this.baselinesPath` as pretty-printed JSON
 * (2-space indent), creating the `.artemis` directory first if needed.
 *
 * @param data - Complete index to write; it replaces the file's content.
 */
private async saveBaselinesFile(data: BaselinesFile): Promise<void> {
  await mkdir(join(this.basePath, '.artemis'), { recursive: true });
  const serialized = JSON.stringify(data, null, 2);
  await writeFile(this.baselinesPath, serialized);
}
|
|
255
|
+
|
|
256
|
+
/**
 * Record a stored run as the baseline for a scenario.
 *
 * The run is loaded first so its metrics can be snapshotted into the
 * baseline entry. Setting a baseline for a scenario that already has one
 * replaces the previous entry.
 *
 * @param scenario - Scenario name to key the baseline by; when empty, the
 *   scenario recorded in the run's manifest is used instead.
 * @param runId - ID of the stored run to use as the baseline.
 * @param tag - Optional label stored with the baseline.
 * @returns The baseline entry that was written.
 */
async setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata> {
  // Loading the run up front gives us the metrics to snapshot.
  const manifest = await this.loadRun(runId);
  const { metrics } = manifest;
  const key = scenario || getScenario(manifest);

  const entry: BaselineMetadata = {
    scenario: key,
    runId,
    createdAt: new Date().toISOString(),
    metrics: {
      successRate: metrics.success_rate,
      medianLatencyMs: metrics.median_latency_ms,
      totalTokens: metrics.total_tokens,
      passedCases: metrics.passed_cases,
      failedCases: metrics.failed_cases,
      totalCases: metrics.total_cases,
    },
    tag,
  };

  // Read-modify-write of the index: insert or overwrite this scenario's entry.
  const index = await this.loadBaselinesFile();
  index.baselines[key] = entry;
  await this.saveBaselinesFile(index);

  return entry;
}
|
|
286
|
+
|
|
287
|
+
/**
 * Look up the baseline recorded for a scenario.
 *
 * Only entries actually stored in the index are considered. The own-property
 * check keeps scenario names that collide with inherited object members
 * (e.g. "constructor", "toString") from returning a non-baseline value.
 *
 * @param scenario - Scenario name the baseline was stored under.
 * @returns The baseline entry, or `null` when the scenario has none.
 */
async getBaseline(scenario: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();
  if (!Object.prototype.hasOwnProperty.call(baselines, scenario)) {
    return null;
  }
  return baselines[scenario] ?? null;
}
|
|
294
|
+
|
|
295
|
+
/**
 * Find the baseline that points at a given run.
 *
 * @param runId - Run ID to search for among the stored baselines.
 * @returns The first baseline whose `runId` matches, or `null` if none does.
 */
async getBaselineByRunId(runId: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();
  for (const candidate of Object.values(baselines)) {
    if (candidate.runId === runId) {
      return candidate;
    }
  }
  return null;
}
|
|
303
|
+
|
|
304
|
+
/**
 * Return every stored baseline, most recently created first.
 *
 * @returns Baseline entries ordered by `createdAt`, descending.
 */
async listBaselines(): Promise<BaselineMetadata[]> {
  const { baselines } = await this.loadBaselinesFile();
  const createdAtMs = (entry: BaselineMetadata): number => new Date(entry.createdAt).getTime();

  // Object.values yields a fresh array, so sorting it in place is safe.
  const entries = Object.values(baselines);
  entries.sort((a, b) => createdAtMs(b) - createdAtMs(a));
  return entries;
}
|
|
313
|
+
|
|
314
|
+
/**
 * Remove the baseline stored for a scenario.
 *
 * The own-property check ensures that names inherited from the object
 * prototype (e.g. "constructor") are reported as "not found" instead of
 * triggering a rewrite of the file and a misleading `true`.
 *
 * @param scenario - Scenario name whose baseline should be removed.
 * @returns `true` when an entry was removed, `false` when none existed.
 */
async removeBaseline(scenario: string): Promise<boolean> {
  const data = await this.loadBaselinesFile();
  if (!Object.prototype.hasOwnProperty.call(data.baselines, scenario)) {
    return false;
  }

  delete data.baselines[scenario];
  await this.saveBaselinesFile(data);
  return true;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Remove the baseline that points at a given run.
 *
 * Only the first matching entry is removed.
 *
 * @param runId - Run ID of the baseline to remove.
 * @returns `true` when an entry was removed, `false` when no baseline
 *   references the run.
 */
async removeBaselineByRunId(runId: string): Promise<boolean> {
  const index = await this.loadBaselinesFile();

  for (const [scenarioKey, stored] of Object.entries(index.baselines)) {
    if (stored.runId !== runId) {
      continue;
    }
    delete index.baselines[scenarioKey];
    await this.saveBaselinesFile(index);
    return true;
  }

  return false;
}
|
|
340
|
+
|
|
341
|
+
/**
 * Compare a stored run with the baseline of its scenario.
 *
 * The scenario is taken from the run's own manifest. A regression is
 * reported when the success-rate delta of the comparison is below
 * `-regressionThreshold`.
 *
 * @param runId - ID of the run to compare.
 * @param regressionThreshold - Allowed drop in success rate before the run
 *   counts as a regression; defaults to 0.05.
 * @returns The baseline, the comparison, the regression verdict and the
 *   threshold used — or `null` when the scenario has no baseline.
 */
async compareToBaseline(
  runId: string,
  regressionThreshold = 0.05
): Promise<{
  baseline: BaselineMetadata;
  comparison: ComparisonResult;
  hasRegression: boolean;
  regressionThreshold: number;
} | null> {
  // The run's manifest tells us which scenario's baseline applies.
  const scenario = getScenario(await this.loadRun(runId));

  const baseline = await this.getBaseline(scenario);
  if (baseline === null) {
    return null;
  }

  // Compare the baseline run (first argument) with the current run (second).
  const comparison = await this.compare(baseline.runId, runId);

  return {
    baseline,
    comparison,
    // A sufficiently negative success-rate delta counts as a regression.
    hasRegression: comparison.delta.successRate < -regressionThreshold,
    regressionThreshold,
  };
}
|
|
194
376
|
}
|
package/src/storage/types.ts
CHANGED
|
@@ -14,6 +14,8 @@ export interface RunListItem {
|
|
|
14
14
|
createdAt: string;
|
|
15
15
|
/** Type of manifest (run, redteam, stress) */
|
|
16
16
|
type?: 'run' | 'redteam' | 'stress';
|
|
17
|
+
/** Estimated cost in USD (optional, included when --show-cost is used) */
|
|
18
|
+
estimatedCostUsd?: number;
|
|
17
19
|
}
|
|
18
20
|
|
|
19
21
|
/**
|
|
@@ -39,6 +41,8 @@ export interface ListOptions {
|
|
|
39
41
|
offset?: number;
|
|
40
42
|
/** Filter by manifest type */
|
|
41
43
|
type?: 'run' | 'redteam' | 'stress';
|
|
44
|
+
/** Include cost information in results */
|
|
45
|
+
includeCost?: boolean;
|
|
42
46
|
}
|
|
43
47
|
|
|
44
48
|
/**
|
|
@@ -96,3 +100,76 @@ export interface StorageConfig {
|
|
|
96
100
|
bucket?: string;
|
|
97
101
|
basePath?: string;
|
|
98
102
|
}
|
|
103
|
+
|
|
104
|
+
/**
 * Snapshot of a run that serves as the reference point for regression
 * comparison of a scenario.
 */
export interface BaselineMetadata {
  /** Name (or identifier) of the scenario this baseline belongs to. */
  scenario: string;
  /** ID of the run that was promoted to baseline. */
  runId: string;
  /** ISO timestamp of the moment the baseline was set. */
  createdAt: string;
  /** Metrics of the baseline run, captured when the baseline was set. */
  metrics: {
    /** Success rate of the run as a fraction (e.g. 0.8 for 8 of 10 cases). */
    successRate: number;
    /** Median latency of the run, in milliseconds. */
    medianLatencyMs: number;
    /** Total token count of the run. */
    totalTokens: number;
    /** Number of cases that passed. */
    passedCases: number;
    /** Number of cases that failed. */
    failedCases: number;
    /** Total number of cases in the run. */
    totalCases: number;
  };
  /** Free-form label or description attached to the baseline, if any. */
  tag?: string;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Storage adapter that can additionally manage per-scenario baselines.
 */
export interface BaselineStorageAdapter extends StorageAdapter {
  /**
   * Mark a run as the baseline for a scenario.
   * @param scenario - Scenario name the baseline is stored under
   * @param runId - ID of the run to use as baseline
   * @param tag - Optional label stored with the baseline
   * @returns The stored baseline metadata
   */
  setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;

  /**
   * Look up a baseline by scenario name.
   * @returns The baseline, or `null` when the scenario has none
   */
  getBaseline(scenario: string): Promise<BaselineMetadata | null>;

  /**
   * Look up a baseline by the ID of the run it points at.
   * @returns The baseline, or `null` when no baseline references the run
   */
  getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;

  /**
   * Return all stored baselines.
   */
  listBaselines(): Promise<BaselineMetadata[]>;

  /**
   * Delete the baseline of a scenario.
   * @returns `true` when a baseline was removed, `false` otherwise
   */
  removeBaseline(scenario: string): Promise<boolean>;

  /**
   * Delete the baseline that points at a given run.
   * @returns `true` when a baseline was removed, `false` otherwise
   */
  removeBaselineByRunId(runId: string): Promise<boolean>;

  /**
   * Compare a run against the baseline of its scenario, if one exists.
   * Optional: an adapter may omit this method.
   * @param runId - The run ID to compare
   * @param regressionThreshold - Threshold for regression detection (0-1), default 0.05
   * @returns Baseline, comparison and regression verdict, or `null` when there is no baseline
   */
  compareToBaseline?(
    runId: string,
    regressionThreshold?: number
  ): Promise<{
    baseline: BaselineMetadata;
    comparison: ComparisonResult;
    hasRegression: boolean;
    regressionThreshold: number;
  } | null>;
}
|
package/dist/events/emitter.d.ts
DELETED
|
@@ -1,111 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Event emitter for observability
|
|
3
|
-
*
|
|
4
|
-
* A lightweight typed event emitter that allows the observability package
|
|
5
|
-
* to subscribe to events from the core runner.
|
|
6
|
-
*/
|
|
7
|
-
import type { ArtemisEvent, ArtemisEventMap, ArtemisEventType } from './types';
|
|
8
|
-
/**
|
|
9
|
-
* Event handler function type
|
|
10
|
-
*/
|
|
11
|
-
export type EventHandler<T extends ArtemisEventType> = (event: ArtemisEventMap[T]) => void;
|
|
12
|
-
/**
|
|
13
|
-
* Async event handler function type
|
|
14
|
-
*/
|
|
15
|
-
export type AsyncEventHandler<T extends ArtemisEventType> = (event: ArtemisEventMap[T]) => void | Promise<void>;
|
|
16
|
-
/**
|
|
17
|
-
* Subscriber interface for observability integrations
|
|
18
|
-
*/
|
|
19
|
-
export interface EventSubscriber {
|
|
20
|
-
/** Unique identifier for this subscriber */
|
|
21
|
-
id: string;
|
|
22
|
-
/** Handle an event */
|
|
23
|
-
handleEvent(event: ArtemisEvent): void | Promise<void>;
|
|
24
|
-
/** Optional: called when subscriber is removed */
|
|
25
|
-
dispose?(): void | Promise<void>;
|
|
26
|
-
}
|
|
27
|
-
/**
|
|
28
|
-
* Global event emitter instance
|
|
29
|
-
*
|
|
30
|
-
* This singleton allows the observability package to subscribe to events
|
|
31
|
-
* without tight coupling to the core runner.
|
|
32
|
-
*/
|
|
33
|
-
declare class ArtemisEventEmitter {
|
|
34
|
-
private subscribers;
|
|
35
|
-
private handlers;
|
|
36
|
-
private enabled;
|
|
37
|
-
/**
|
|
38
|
-
* Enable or disable event emission
|
|
39
|
-
* When disabled, no events are emitted (zero overhead)
|
|
40
|
-
*/
|
|
41
|
-
setEnabled(enabled: boolean): void;
|
|
42
|
-
/**
|
|
43
|
-
* Check if any subscribers are registered
|
|
44
|
-
*/
|
|
45
|
-
hasSubscribers(): boolean;
|
|
46
|
-
/**
|
|
47
|
-
* Register a subscriber that receives all events
|
|
48
|
-
*/
|
|
49
|
-
subscribe(subscriber: EventSubscriber): () => void;
|
|
50
|
-
/**
|
|
51
|
-
* Remove a subscriber by ID
|
|
52
|
-
*/
|
|
53
|
-
unsubscribe(subscriberId: string): Promise<void>;
|
|
54
|
-
/**
|
|
55
|
-
* Register a handler for a specific event type
|
|
56
|
-
*/
|
|
57
|
-
on<T extends ArtemisEventType>(eventType: T, handler: AsyncEventHandler<T>): () => void;
|
|
58
|
-
/**
|
|
59
|
-
* Remove a handler for a specific event type
|
|
60
|
-
*/
|
|
61
|
-
off<T extends ArtemisEventType>(eventType: T, handler: AsyncEventHandler<T>): void;
|
|
62
|
-
/**
|
|
63
|
-
* Emit an event to all subscribers and type-specific handlers
|
|
64
|
-
*
|
|
65
|
-
* Events are emitted asynchronously but errors are caught and logged
|
|
66
|
-
* to prevent observability from breaking the test runner.
|
|
67
|
-
*/
|
|
68
|
-
emit<T extends ArtemisEventType>(event: ArtemisEventMap[T]): void;
|
|
69
|
-
/**
|
|
70
|
-
* Emit an event and wait for all handlers to complete
|
|
71
|
-
* Useful for testing or when you need to ensure handlers have run
|
|
72
|
-
*/
|
|
73
|
-
emitAsync<T extends ArtemisEventType>(event: ArtemisEventMap[T]): Promise<void>;
|
|
74
|
-
/**
|
|
75
|
-
* Remove all subscribers and handlers
|
|
76
|
-
*/
|
|
77
|
-
clear(): Promise<void>;
|
|
78
|
-
/**
|
|
79
|
-
* Get the number of registered subscribers
|
|
80
|
-
*/
|
|
81
|
-
get subscriberCount(): number;
|
|
82
|
-
/**
|
|
83
|
-
* Get the number of registered handlers (across all event types)
|
|
84
|
-
*/
|
|
85
|
-
get handlerCount(): number;
|
|
86
|
-
}
|
|
87
|
-
/**
|
|
88
|
-
* Global event emitter singleton
|
|
89
|
-
*
|
|
90
|
-
* Usage in core:
|
|
91
|
-
* ```typescript
|
|
92
|
-
* import { events } from '@artemiskit/core';
|
|
93
|
-
* events.emit({ type: 'test:start', ... });
|
|
94
|
-
* ```
|
|
95
|
-
*
|
|
96
|
-
* Usage in observability:
|
|
97
|
-
* ```typescript
|
|
98
|
-
* import { events } from '@artemiskit/core';
|
|
99
|
-
* events.subscribe({
|
|
100
|
-
* id: 'metrics',
|
|
101
|
-
* handleEvent: (event) => { ... }
|
|
102
|
-
* });
|
|
103
|
-
* ```
|
|
104
|
-
*/
|
|
105
|
-
export declare const events: ArtemisEventEmitter;
|
|
106
|
-
/**
|
|
107
|
-
* Helper to create a timestamp for events
|
|
108
|
-
*/
|
|
109
|
-
export declare function createTimestamp(): number;
|
|
110
|
-
export {};
|
|
111
|
-
//# sourceMappingURL=emitter.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"emitter.d.ts","sourceRoot":"","sources":["../../src/events/emitter.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAE/E;;GAEG;AACH,MAAM,MAAM,YAAY,CAAC,CAAC,SAAS,gBAAgB,IAAI,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC;AAE3F;;GAEG;AACH,MAAM,MAAM,iBAAiB,CAAC,CAAC,SAAS,gBAAgB,IAAI,CAC1D,KAAK,EAAE,eAAe,CAAC,CAAC,CAAC,KACtB,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;AAE1B;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,4CAA4C;IAC5C,EAAE,EAAE,MAAM,CAAC;IACX,sBAAsB;IACtB,WAAW,CAAC,KAAK,EAAE,YAAY,GAAG,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACvD,kDAAkD;IAClD,OAAO,CAAC,IAAI,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAClC;AAED;;;;;GAKG;AACH,cAAM,mBAAmB;IACvB,OAAO,CAAC,WAAW,CAA2C;IAC9D,OAAO,CAAC,QAAQ,CAAiE;IACjF,OAAO,CAAC,OAAO,CAAiB;IAEhC;;;OAGG;IACH,UAAU,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI;IAIlC;;OAEG;IACH,cAAc,IAAI,OAAO;IAIzB;;OAEG;IACH,SAAS,CAAC,UAAU,EAAE,eAAe,GAAG,MAAM,IAAI;IASlD;;OAEG;IACG,WAAW,CAAC,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAQtD;;OAEG;IACH,EAAE,CAAC,CAAC,SAAS,gBAAgB,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,iBAAiB,CAAC,CAAC,CAAC,GAAG,MAAM,IAAI;IAYvF;;OAEG;IACH,GAAG,CAAC,CAAC,SAAS,gBAAgB,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,iBAAiB,CAAC,CAAC,CAAC,GAAG,IAAI;IAUlF;;;;;OAKG;IACH,IAAI,CAAC,CAAC,SAAS,gBAAgB,EAAE,KAAK,EAAE,eAAe,CAAC,CAAC,CAAC,GAAG,IAAI;IAWjE;;;OAGG;IACG,SAAS,CAAC,CAAC,SAAS,gBAAgB,EAAE,KAAK,EAAE,eAAe,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAoCrF;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAS5B;;OAEG;IACH,IAAI,eAAe,IAAI,MAAM,CAE5B;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,MAAM,CAMzB;CACF;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,eAAO,MAAM,MAAM,qBAA4B,CAAC;AAEhD;;GAEG;AACH,wBAAgB,eAAe,IAAI,MAAM,CAExC"}
|
package/dist/events/index.d.ts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/events/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,SAAS,CAAC;AACxB,OAAO,EAAE,MAAM,EAAE,eAAe,EAAE,KAAK,eAAe,EAAE,KAAK,YAAY,EAAE,KAAK,iBAAiB,EAAE,MAAM,WAAW,CAAC"}
|